* [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement yuv2nv12cx neon assembly
@ 2025-07-04 9:31 Harshitha Sarangu Suresh
0 siblings, 0 replies; only message in thread
From: Harshitha Sarangu Suresh @ 2025-07-04 9:31 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Cc: Dash Santosh Sathyanarayanan, Logaprakash Ramajayam
[-- Attachment #1: Type: text/plain, Size: 13480 bytes --]
Addressed all review comments and removed the code duplication between the swapped and non-swapped cases.
Checkasm Benchmark Results
yuv2nv12cX_2_512_accurate_c: 3496.2 ( 1.00x)
yuv2nv12cX_2_512_accurate_neon: 409.5 ( 8.54x)
yuv2nv12cX_2_512_approximate_c: 3495.1 ( 1.00x)
yuv2nv12cX_2_512_approximate_neon: 409.4 ( 8.54x)
yuv2nv12cX_4_512_accurate_c: 4676.5 ( 1.00x)
yuv2nv12cX_4_512_accurate_neon: 613.1 ( 7.63x)
yuv2nv12cX_4_512_approximate_c: 4677.8 ( 1.00x)
yuv2nv12cX_4_512_approximate_neon: 607.8 ( 7.70x)
yuv2nv12cX_8_512_accurate_c: 7221.6 ( 1.00x)
yuv2nv12cX_8_512_accurate_neon: 1003.8 ( 7.19x)
yuv2nv12cX_8_512_approximate_c: 7221.2 ( 1.00x)
yuv2nv12cX_8_512_approximate_neon: 1016.4 ( 7.11x)
yuv2nv12cX_16_512_accurate_c: 13731.1 ( 1.00x)
yuv2nv12cX_16_512_accurate_neon: 1757.2 ( 7.81x)
yuv2nv12cX_16_512_approximate_c: 13740.7 ( 1.00x)
yuv2nv12cX_16_512_approximate_neon: 1757.3 ( 7.82x)
From 4ca92570924ba42e20041feaae6d7488c02c1e6a Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha@multicorewareinc.com>
Date: Fri, 4 Jul 2025 14:29:11 +0530
Subject: [PATCH] swscale/aarch64/output: Implement yuv2nv12cX NEON assembly
---
libswscale/aarch64/output.S | 233 +++++++++++++++++++++++++++++++++++
libswscale/aarch64/swscale.c | 24 ++++
2 files changed, 257 insertions(+)
diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..92dec3f0ed 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -226,3 +226,236 @@ function ff_yuv2plane1_8_neon, export=1
b.gt 2b // loop until width consumed
ret
endfunc
+
+function ff_yuv2nv12cX_neon_asm, export=1
+// Vertically filter two chroma planes and store the result as interleaved
+// 8-bit pairs:
+//     u = (chrDither[i & 7]       << 12) + sum_j chrUSrc[j][i] * chrFilter[j]
+//     v = (chrDither[(i + 3) & 7] << 12) + sum_j chrVSrc[j][i] * chrFilter[j]
+//     each output byte = clip_uint8(x >> 19)
+// All sums are kept in 32 bits, in the vector paths and in the scalar tail.
+//
+// w0 - pair order: non-zero stores U then V, zero stores V then U
+//      (the C prototype calls this isSwapped, but its caller passes 1 for
+//      the non-swapped layout)
+// x1 - uint8_t *chrDither (entries 0..7 are read)
+// x2 - int16_t *chrFilter
+// w3 - int chrFilterSize (at least one tap is always processed)
+// x4 - int16_t **chrUSrc
+// x5 - int16_t **chrVSrc
+// x6 - uint8_t *dest
+// w7 - int chrDstW, number of U/V pairs to write
+//
+// Uses only x8-x17, v0-v6 and v16-v31 as scratch; the stack is not touched.
+
+        cmp             w7, #0
+        b.le            17f                             // nothing to write
+
+        ld1             {v0.8b}, [x1]                   // chrDither[0..7]
+        ext             v1.8b, v0.8b, v0.8b, #3         // rotate for V: (i + 3) & 7
+
+        uxtl            v0.8h, v0.8b
+        uxtl            v1.8h, v1.8b
+
+        ushll           v2.4s, v0.4h, #12               // U dither low
+        ushll2          v3.4s, v0.8h, #12               // U dither high
+        ushll           v4.4s, v1.4h, #12               // V dither low
+        ushll2          v5.4s, v1.8h, #12               // V dither high
+
+        mov             x8, #0                          // i = 0
+
+// 16 pairs per iteration. x8 stays a multiple of 8 in both vector paths, so
+// the dither vectors v2-v5 apply unchanged to every group of 8 pixels.
+1:
+        cmp             w7, #16
+        b.lt            7f
+
+        mov             v16.16b, v2.16b                 // U acc, pixels 0-3
+        mov             v17.16b, v3.16b                 // U acc, pixels 4-7
+        mov             v18.16b, v4.16b                 // V acc, pixels 0-3
+        mov             v19.16b, v5.16b                 // V acc, pixels 4-7
+
+        mov             v20.16b, v2.16b                 // U acc, pixels 8-11
+        mov             v21.16b, v3.16b                 // U acc, pixels 12-15
+        mov             v22.16b, v4.16b                 // V acc, pixels 8-11
+        mov             v23.16b, v5.16b                 // V acc, pixels 12-15
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+2:
+        ldr             h6, [x10], #2                   // chrFilter[j]
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+        add             x15, x13, #16                   // &chrUSrc[j][i + 8]
+        add             x16, x14, #16                   // &chrVSrc[j][i + 8]
+
+        ld1             {v24.8h}, [x13]                 // U samples 0-7
+        ld1             {v25.8h}, [x14]                 // V samples 0-7
+
+        ld1             {v26.8h}, [x15]                 // U samples 8-15
+        ld1             {v27.8h}, [x16]                 // V samples 8-15
+
+        smlal           v16.4s, v24.4h, v6.h[0]
+        smlal2          v17.4s, v24.8h, v6.h[0]
+        smlal           v18.4s, v25.4h, v6.h[0]
+        smlal2          v19.4s, v25.8h, v6.h[0]
+
+        smlal           v20.4s, v26.4h, v6.h[0]
+        smlal2          v21.4s, v26.8h, v6.h[0]
+        smlal           v22.4s, v27.4h, v6.h[0]
+        smlal2          v23.4s, v27.8h, v6.h[0]
+
+        subs            w9, w9, #1
+        b.gt            2b
+
+        // >> 16 with signed-to-unsigned saturation, then >> 3 saturating to
+        // 8 bits: together this is clip_uint8(acc >> 19).
+        sqshrun         v28.4h, v16.4s, #16             // first 8 pixels
+        sqshrun2        v28.8h, v17.4s, #16
+        sqshrun         v29.4h, v18.4s, #16
+        sqshrun2        v29.8h, v19.4s, #16
+
+        cbz             w0, 3f
+        uqshrn          v24.8b, v28.8h, #3              // U first
+        uqshrn          v25.8b, v29.8h, #3              // V second
+        st2             {v24.8b, v25.8b}, [x6], #16
+        b               4f
+3:
+        uqshrn          v24.8b, v29.8h, #3              // V first
+        uqshrn          v25.8b, v28.8h, #3              // U second
+        st2             {v24.8b, v25.8b}, [x6], #16
+
+4:
+        sqshrun         v28.4h, v20.4s, #16             // next 8 pixels
+        sqshrun2        v28.8h, v21.4s, #16
+        sqshrun         v29.4h, v22.4s, #16
+        sqshrun2        v29.8h, v23.4s, #16
+
+        cbz             w0, 5f
+        uqshrn          v30.8b, v28.8h, #3              // U first
+        uqshrn          v31.8b, v29.8h, #3              // V second
+        st2             {v30.8b, v31.8b}, [x6], #16
+        b               6f
+5:
+        uqshrn          v30.8b, v29.8h, #3              // V first
+        uqshrn          v31.8b, v28.8h, #3              // U second
+        st2             {v30.8b, v31.8b}, [x6], #16
+
+6:
+        subs            w7, w7, #16
+        add             x8, x8, #16                     // add leaves the flags from subs intact
+        b.gt            1b
+
+// At most one block of 8 pairs.
+7:
+        cmp             w7, #8
+        b.lt            12f
+
+        mov             v16.16b, v2.16b                 // U acc low
+        mov             v17.16b, v3.16b                 // U acc high
+        mov             v18.16b, v4.16b                 // V acc low
+        mov             v19.16b, v5.16b                 // V acc high
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+9:
+        ldr             h6, [x10], #2                   // chrFilter[j]
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]                 // U samples
+        ld1             {v21.8h}, [x14]                 // V samples
+
+        smlal           v16.4s, v20.4h, v6.h[0]
+        smlal2          v17.4s, v20.8h, v6.h[0]
+        smlal           v18.4s, v21.4h, v6.h[0]
+        smlal2          v19.4s, v21.8h, v6.h[0]
+
+        subs            w9, w9, #1
+        b.gt            9b
+
+        sqshrun         v26.4h, v16.4s, #16
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16
+        sqshrun2        v27.8h, v19.4s, #16
+
+        cbz             w0, 10f
+        uqshrn          v28.8b, v26.8h, #3              // U first
+        uqshrn          v29.8b, v27.8h, #3              // V second
+        st2             {v28.8b, v29.8b}, [x6], #16
+        b               11f
+10:
+        uqshrn          v28.8b, v27.8h, #3              // V first
+        uqshrn          v29.8b, v26.8h, #3              // U second
+        st2             {v28.8b, v29.8b}, [x6], #16
+11:
+        sub             w7, w7, #8
+        add             x8, x8, #8
+
+// Scalar tail for the remaining 0-7 pairs. The arithmetic is done in 32-bit
+// registers so that it wraps exactly like the 32-bit vector lanes above.
+12:
+        cbz             w7, 17f
+
+13:
+        and             x15, x8, #7
+        ldrb            w9, [x1, x15]
+        lsl             w9, w9, #12                     // u = chrDither[i & 7] << 12
+
+        add             x15, x8, #3
+        and             x15, x15, #7
+        ldrb            w10, [x1, x15]
+        lsl             w10, w10, #12                   // v = chrDither[(i + 3) & 7] << 12
+
+        mov             w11, w3                         // chrFilterSize counter
+        mov             x12, x2                         // chrFilter pointer
+        mov             x13, x4                         // chrUSrc base
+        mov             x14, x5                         // chrVSrc base
+
+14:
+        ldrsh           w16, [x12], #2                  // chrFilter[j]
+
+        ldr             x15, [x13], #8                  // chrUSrc[j]
+        ldr             x17, [x14], #8                  // chrVSrc[j]
+        ldrsh           w15, [x15, x8, lsl #1]          // chrUSrc[j][i]
+        ldrsh           w17, [x17, x8, lsl #1]          // chrVSrc[j][i]
+
+        madd            w9, w16, w15, w9                // u += chrUSrc[j][i] * chrFilter[j]
+        madd            w10, w16, w17, w10              // v += chrVSrc[j][i] * chrFilter[j]
+
+        subs            w11, w11, #1
+        b.gt            14b
+
+        asr             w9, w9, #19
+        asr             w10, w10, #19
+
+        mov             w15, #255                       // clip both values to [0, 255]
+        cmp             w9, #0
+        csel            w9, w9, wzr, ge
+        cmp             w10, #0
+        csel            w10, w10, wzr, ge
+        cmp             w9, w15
+        csel            w9, w15, w9, gt
+        cmp             w10, w15
+        csel            w10, w15, w10, gt
+
+        cbz             w0, 15f
+        strb            w9, [x6], #1                    // U first
+        strb            w10, [x6], #1                   // V second
+        b               16f
+15:
+        strb            w10, [x6], #1                   // V first
+        strb            w9, [x6], #1                    // U second
+
+16:
+        subs            w7, w7, #1
+        add             x8, x8, #1                      // add leaves the flags from subs intact
+        b.gt            13b
+17:
+        ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..a7dcc451dc 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -168,6 +168,28 @@ void ff_yuv2plane1_8_neon(
const uint8_t *dither,
int offset);
+/*
+ * NEON kernel that filters the chroma planes and writes interleaved 8-bit
+ * pairs to dest. Despite its name, isSwapped must be non-zero for the
+ * U-then-V byte order and zero for the V-then-U order.
+ */
+void ff_yuv2nv12cX_neon_asm(int isSwapped, const uint8_t *chrDither,
+                            const int16_t *chrFilter, int chrFilterSize,
+                            const int16_t **chrUSrc, const int16_t **chrVSrc,
+                            uint8_t *dest, int chrDstW);
+
+/*
+ * Adapter that turns the destination pixel format into the order flag
+ * expected by the NEON kernel and forwards every other argument untouched.
+ */
+static void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+                               const int16_t *chrFilter, int chrFilterSize,
+                               const int16_t **chrUSrc, const int16_t **chrVSrc,
+                               uint8_t *dest, int chrDstW)
+{
+    /* 1 for formats without swapped chroma, 0 for swapped-chroma formats. */
+    const int order_flag = isSwappedChroma(dstFormat) ? 0 : 1;
+
+    ff_yuv2nv12cX_neon_asm(order_flag, chrDither, chrFilter, chrFilterSize,
+                           chrUSrc, chrVSrc, dest, chrDstW);
+}
+
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do { \
if (c->srcBpc == 8) { \
if(c->dstBpc <= 14) { \
@@ -275,6 +297,8 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
if (c->dstBpc == 8) {
c->yuv2planeX = ff_yuv2planeX_8_neon;
+ if (isSemiPlanarYUV(c->opts.dst_format))
+ c->yuv2nv12cX = ff_yuv2nv12cX_neon;
}
switch (c->opts.src_format) {
case AV_PIX_FMT_ABGR:
--
2.36.0.windows.1
[-- Attachment #2: 0001-swscale-output-Implement-yuv2nv12cx-neon-assembly.patch --]
[-- Type: application/octet-stream, Size: 11807 bytes --]
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-07-04 9:31 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-07-04 9:31 [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement yuv2nv12cx neon assembly Harshitha Sarangu Suresh
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git