Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: Harshitha Sarangu Suresh <harshitha@multicorewareinc.com>
To: "ffmpeg-devel@ffmpeg.org" <ffmpeg-devel@ffmpeg.org>
Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com>
Subject: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2planeX_10_c_template()
Date: Thu, 22 May 2025 13:57:31 +0000
Message-ID: <MA0P287MB1158903867B74EADCF10A9C2D699A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM> (raw)

[-- Attachment #1: Type: text/plain, Size: 4446 bytes --]

This optimization provides a 5x speedup for the module. The performance gain was measured by adding timers inside both the original C function and the optimized NEON intrinsics function.


From 904144c2db9e5e72d56360c4c2eb38d426852901 Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha@multicorewareinc.com>
Date: Thu, 22 May 2025 10:23:55 +0530
Subject: [PATCH] swscale/output: Implement neon intrinsics for
 yuv2planeX_10_c_template()

---
 libswscale/output.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/libswscale/output.c b/libswscale/output.c
index c37649e7ce..345df5ce59 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -22,7 +22,9 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
-
+#if defined (__aarch64__)
+#include <arm_neon.h>
+#endif
 #include "libavutil/attributes.h"
 #include "libavutil/avutil.h"
 #include "libavutil/avassert.h"
@@ -337,6 +339,77 @@ yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
     }
 }

+
+#if defined (__aarch64__) && !defined(__APPLE__)
+/**
+ * NEON variant of the vertical filter: for each of the dstW output samples,
+ * accumulate src[j][i] * filter[j] over the filterSize lines on top of a
+ * rounding bias, shift right by (27 - output_bits) and clip to
+ * [0, 2^output_bits - 1]. The vector path byte-swaps when big_endian is set.
+ */
+static av_always_inline void
+yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
+                         const int16_t **src, uint16_t *dest, int dstW,
+                         int big_endian, int output_bits)
+{
+    const int shift = 11 + 16 - output_bits;
+    const int bias = 1 << (shift - 1);
+    const int clip_max = (1 << output_bits) - 1;
+    const int32x4_t shift_vec = vdupq_n_s32(-shift);
+    const int32x4_t max_vec = vdupq_n_s32(clip_max);
+    const int32x4_t zero_vec = vdupq_n_s32(0);
+    int i;
+
+    // Vectorize only complete groups of 16 samples so that no load or store
+    // runs past dstW; the scalar loop below handles the remaining samples.
+    for (i = 0; i <= dstW - 16; i += 16) {
+        int32x4_t sum0_lo = vdupq_n_s32(bias);
+        int32x4_t sum0_hi = vdupq_n_s32(bias);
+        int32x4_t sum1_lo = vdupq_n_s32(bias);
+        int32x4_t sum1_hi = vdupq_n_s32(bias);
+
+        for (int j = 0; j < filterSize; j++) {
+            int16x8_t src_vec0 = vld1q_s16(&src[j][i]);
+            int16x8_t src_vec1 = vld1q_s16(&src[j][i + 8]);
+            int16x8_t filter_val = vdupq_n_s16(filter[j]);
+            sum0_lo = vmlal_s16(sum0_lo, vget_low_s16(src_vec0), vget_low_s16(filter_val));
+            sum0_hi = vmlal_s16(sum0_hi, vget_high_s16(src_vec0), vget_high_s16(filter_val));
+            sum1_lo = vmlal_s16(sum1_lo, vget_low_s16(src_vec1), vget_low_s16(filter_val));
+            sum1_hi = vmlal_s16(sum1_hi, vget_high_s16(src_vec1), vget_high_s16(filter_val));
+        }
+        // Negative count = right shift; the bias added above makes it round
+        sum0_lo = vshlq_s32(sum0_lo, shift_vec);
+        sum0_hi = vshlq_s32(sum0_hi, shift_vec);
+        sum1_lo = vshlq_s32(sum1_lo, shift_vec);
+        sum1_hi = vshlq_s32(sum1_hi, shift_vec);
+        // Clip to output_bits range
+        sum0_lo = vmaxq_s32(vminq_s32(sum0_lo, max_vec), zero_vec);
+        sum0_hi = vmaxq_s32(vminq_s32(sum0_hi, max_vec), zero_vec);
+        sum1_lo = vmaxq_s32(vminq_s32(sum1_lo, max_vec), zero_vec);
+        sum1_hi = vmaxq_s32(vminq_s32(sum1_hi, max_vec), zero_vec);
+        // Narrow to 16 bits (values were clipped to [0, clip_max] above)
+        uint16x8_t result0 = vcombine_u16(vreinterpret_u16_s16(vmovn_s32(sum0_lo)),
+                                          vreinterpret_u16_s16(vmovn_s32(sum0_hi)));
+        uint16x8_t result1 = vcombine_u16(vreinterpret_u16_s16(vmovn_s32(sum1_lo)),
+                                          vreinterpret_u16_s16(vmovn_s32(sum1_hi)));
+        // Swap the two bytes of every sample when big-endian output is requested
+        if (big_endian) {
+            result0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result0)));
+            result1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result1)));
+        }
+        vst1q_u16(&dest[i], result0);
+        vst1q_u16(&dest[i + 8], result1);
+    }
+
+    // Handle remaining pixels
+    for (; i < dstW; i++) {
+        int val = bias;
+        for (int j = 0; j < filterSize; j++)
+            val += src[j][i] * filter[j];
+        output_pixel(&dest[i], val);
+    }
+}
+#else
 static av_always_inline void
 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
                          const int16_t **src, uint16_t *dest, int dstW,
@@ -355,6 +428,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
         output_pixel(&dest[i], val);
     }
 }
+#endif

 #undef output_pixel

--
2.36.0.windows.1




[-- Attachment #2: 0001-swscale-output-Implement-neon-intrinsics-for-yuv2pla.patch --]
[-- Type: application/octet-stream, Size: 4145 bytes --]

[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

             reply	other threads:[~2025-05-22 13:57 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-22 13:57 Harshitha Sarangu Suresh [this message]
2025-05-26  8:40 ` Harshitha Sarangu Suresh
2025-05-26  9:05 ` Martin Storsjö

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=MA0P287MB1158903867B74EADCF10A9C2D699A@MA0P287MB1158.INDP287.PROD.OUTLOOK.COM \
    --to=harshitha@multicorewareinc.com \
    --cc=dash.sathyanarayanan@multicorewareinc.com \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git