* [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template()
@ 2025-06-06 8:44 Logaprakash Ramajayam
2025-06-12 5:25 ` Logaprakash Ramajayam
2025-06-18 21:15 ` Martin Storsjö
0 siblings, 2 replies; 3+ messages in thread
From: Logaprakash Ramajayam @ 2025-06-06 8:44 UTC (permalink / raw)
To: Kieran Kunhya via ffmpeg-devel
Cc: Dash Santosh Sathyanarayanan, Harshitha Sarangu Suresh
[-- Attachment #1: Type: text/plain, Size: 10121 bytes --]
Checked FATE tests and gha-aarch64 git workflow.
From 34cdef26eaebcf98916e9881b3a04f4f698f09c6 Mon Sep 17 00:00:00 2001
From: Logaprakash Ramajayam <logaprakash.ramajayam@multicorewareinc.com>
Date: Thu, 5 Jun 2025 01:33:39 -0700
Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for
yuv2planeX_10_c_template()
---
libswscale/aarch64/output.S | 167 +++++++++++++++++++++++++++++++++++
libswscale/aarch64/swscale.c | 38 ++++++++
2 files changed, 205 insertions(+)
diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..e039e820ae 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -20,6 +20,173 @@
#include "libavutil/aarch64/asm.S"
+function ff_yuv2planeX_10_neon, export=1
+// x0 = filter (int16_t*)
+// w1 = filterSize
+// x2 = src (int16_t**)
+// x3 = dest (uint16_t*)
+// w4 = dstW
+// w5 = big_endian
+// w6 = output_bits
+
+ mov w8, #27
+ sub w8, w8, w6 // shift = 11 + 16 - output_bits
+
+ sub w9, w8, #1
+ mov w10, #1
+ lsl w9, w10, w9 // val = 1 << (shift - 1)
+
+ dup v1.4s, w9
+ dup v2.4s, w9 // Create vectors with val
+
+ mov w17, #0
+ sub w16, w17, w8
+ dup v8.4s, w16 // Create (-shift) vector for right shift
+
+ movi v11.4s, #0
+
+ mov w10, #1
+ lsl w10, w10, w6
+ sub w10, w10, #1 // (1U << output_bits) - 1
+ dup v12.4s, w10 // Create Clip vector for uppr bound
+
+ tst w4, #15 // if dstW divisible by 16, process 16 elements
+ b.ne 4f // else process 8 elements
+
+ mov x7, #0 // i = 0
+1: // Loop
+
+ mov v3.16b, v1.16b
+ mov v4.16b, v2.16b
+ mov v5.16b, v1.16b
+ mov v6.16b, v2.16b
+
+ mov w11, w1 // tmpfilterSize = filterSize
+ mov x12, x2 // srcp = src
+ mov x13, x0 // filterp = filter
+
+2: // Filter loop
+
+ ldp x14, x15, [x12], #16 // get 2 pointers: src[j] and src[j+1]
+ ldr s7, [x13], #4 // load filter coefficients
+ add x14, x14, x7, lsl #1
+ add x15, x15, x7, lsl #1
+ ld1 {v16.8h, v17.8h}, [x14]
+ ld1 {v18.8h, v19.8h}, [x15]
+
+ // Multiply-accumulate
+ smlal v3.4s, v16.4h, v7.h[0]
+ smlal2 v4.4s, v16.8h, v7.h[0]
+ smlal v5.4s, v17.4h, v7.h[0]
+ smlal2 v6.4s, v17.8h, v7.h[0]
+
+ smlal v3.4s, v18.4h, v7.h[1]
+ smlal2 v4.4s, v18.8h, v7.h[1]
+ smlal v5.4s, v19.4h, v7.h[1]
+ smlal2 v6.4s, v19.8h, v7.h[1]
+
+ subs w11, w11, #2 // tmpfilterSize -= 2
+ b.gt 2b // continue filter loop
+
+ // Shift results
+ sshl v3.4s, v3.4s, v8.4s
+ sshl v4.4s, v4.4s, v8.4s
+ sshl v5.4s, v5.4s, v8.4s
+ sshl v6.4s, v6.4s, v8.4s
+
+ // Clamp to 0
+ smax v3.4s, v3.4s, v11.4s
+ smax v4.4s, v4.4s, v11.4s
+ smax v5.4s, v5.4s, v11.4s
+ smax v6.4s, v6.4s, v11.4s
+
+ // Clip upper bound
+ smin v3.4s, v3.4s, v12.4s
+ smin v4.4s, v4.4s, v12.4s
+ smin v5.4s, v5.4s, v12.4s
+ smin v6.4s, v6.4s, v12.4s
+
+ // Narrow to 16-bit
+ xtn v13.4h, v3.4s
+ xtn2 v13.8h, v4.4s
+ xtn v14.4h, v5.4s
+ xtn2 v14.8h, v6.4s
+
+ cbz w5, 3f // Check if big endian
+ rev16 v13.16b, v13.16b
+ rev16 v14.16b, v14.16b // Swap bits for big endian
+3:
+ // Store 16 pixels
+ st1 {v13.8h}, [x3], #16
+ st1 {v14.8h}, [x3], #16
+
+ add x7, x7, #16 // i = i + 16
+ subs w4, w4, #16 // dstW = dstW - 16
+ b.gt 1b // Continue loop
+ b 8f // end
+
+4: // Process 8 elements
+ mov x7, #0
+5: // Loop
+
+ mov v3.16b, v1.16b
+ mov v4.16b, v2.16b
+
+ mov w11, w1
+ mov x12, x2
+ mov x13, x0
+
+6: // Filter loop
+
+ ldp x14, x15, [x12], #16
+ ldr s7, [x13], #4
+ add x14, x14, x7, lsl #1
+ add x15, x15, x7, lsl #1
+ ld1 {v5.8h}, [x14]
+ ld1 {v6.8h}, [x15]
+
+ // Multiply-accumulate
+ smlal v3.4s, v5.4h, v7.h[0]
+ smlal2 v4.4s, v5.8h, v7.h[0]
+ smlal v3.4s, v6.4h, v7.h[1]
+ smlal2 v4.4s, v6.8h, v7.h[1]
+
+ subs w11, w11, #2 // tmpfilterSize -= 2
+ b.gt 6b // loop until filterSize consumed
+
+ // Shift results
+ sshl v3.4s, v3.4s, v8.4s
+ sshl v4.4s, v4.4s, v8.4s
+
+ // Clamp to 0
+ smax v3.4s, v3.4s, v11.4s
+ smax v4.4s, v4.4s, v11.4s
+
+ // Clip upper bound
+ smin v3.4s, v3.4s, v12.4s
+ smin v4.4s, v4.4s, v12.4s
+
+ // Narrow to 16-bit
+ xtn v9.4h, v3.4s
+ xtn v10.4h, v4.4s
+
+ cbz w5, 7f // Check if big endian
+ rev16 v9.8b, v9.8b
+ rev16 v10.8b, v10.8b // Swap bits for big endian
+
+7:
+ // Store 8 pixels
+ st1 {v9.4h}, [x3], #8
+ st1 {v10.4h}, [x3], #8
+
+ add x7, x7, #8 // i = i + 8
+ subs w4, w4, #8 // dstW = dstW - 8
+ b.gt 5b // Continue Loop
+
+8:
+ ret
+endfunc
+
function ff_yuv2planeX_8_neon, export=1
// x0 - const int16_t *filter,
// x1 - int filterSize,
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..23cdb7d26e 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -158,6 +158,29 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
ALL_SCALE_FUNCS(neon);
+void ff_yuv2planeX_10_neon(const int16_t *filter, int filterSize,
+ const int16_t **src, uint16_t *dest, int dstW,
+ int big_endian, int output_bits);
+
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
+static void yuv2planeX_ ## bits ## BE_LE ## _neon(const int16_t *filter, int filterSize, \
+ const int16_t **src, uint8_t *dest, int dstW, \
+ const uint8_t *dither, int offset)\
+{ \
+ ff_yuv2planeX_## template_size ## _neon(filter, \
+ filterSize, (const typeX_t **) src, \
+ (uint16_t *) dest, dstW, is_be, bits); \
+}
+
+yuv2NBPS( 9, BE, 1, 10, int16_t)
+yuv2NBPS( 9, LE, 0, 10, int16_t)
+yuv2NBPS(10, BE, 1, 10, int16_t)
+yuv2NBPS(10, LE, 0, 10, int16_t)
+yuv2NBPS(12, BE, 1, 10, int16_t)
+yuv2NBPS(12, LE, 0, 10, int16_t)
+yuv2NBPS(14, BE, 1, 10, int16_t)
+yuv2NBPS(14, LE, 0, 10, int16_t)
+
void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
@@ -268,6 +291,8 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
{
int cpu_flags = av_get_cpu_flags();
+ enum AVPixelFormat dstFormat = c->opts.dst_format;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
if (have_neon(cpu_flags)) {
ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
@@ -276,6 +301,19 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
if (c->dstBpc == 8) {
c->yuv2planeX = ff_yuv2planeX_8_neon;
}
+
+ if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)) {
+ if (desc->comp[0].depth == 9) {
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_neon : yuv2planeX_9LE_neon;
+ } else if (desc->comp[0].depth == 10) {
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_neon : yuv2planeX_10LE_neon;
+ } else if (desc->comp[0].depth == 12) {
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_neon : yuv2planeX_12LE_neon;
+ } else if (desc->comp[0].depth == 14) {
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_neon : yuv2planeX_14LE_neon;
+ } else
+ av_assert0(0);
+ }
switch (c->opts.src_format) {
case AV_PIX_FMT_ABGR:
c->lumToYV12 = ff_abgr32ToY_neon;
--
2.36.0.windows.1
[-- Attachment #2: Aarch64-Implement-neon-assembly-yuv2planeX_10_c_template.patch --]
[-- Type: application/octet-stream, Size: 9819 bytes --]
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template()
2025-06-06 8:44 [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template() Logaprakash Ramajayam
@ 2025-06-12 5:25 ` Logaprakash Ramajayam
2025-06-18 21:15 ` Martin Storsjö
1 sibling, 0 replies; 3+ messages in thread
From: Logaprakash Ramajayam @ 2025-06-12 5:25 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Hi,
Could you please check and review this patch?
________________________________
From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> on behalf of Logaprakash Ramajayam <logaprakash.ramajayam@multicorewareinc.com>
Sent: Friday, June 6, 2025 2:14 PM
To: Kieran Kunhya via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan@multicorewareinc.com>; Harshitha Sarangu Suresh <harshitha@multicorewareinc.com>
Subject: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template()
Checked FATE tests and gha-aarch64 git workflow.
From 34cdef26eaebcf98916e9881b3a04f4f698f09c6 Mon Sep 17 00:00:00 2001
From: Logaprakash Ramajayam <logaprakash.ramajayam@multicorewareinc.com>
Date: Thu, 5 Jun 2025 01:33:39 -0700
Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for
yuv2planeX_10_c_template()
---
libswscale/aarch64/output.S | 167 +++++++++++++++++++++++++++++++++++
libswscale/aarch64/swscale.c | 38 ++++++++
2 files changed, 205 insertions(+)
diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..e039e820ae 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -20,6 +20,173 @@
#include "libavutil/aarch64/asm.S"
+function ff_yuv2planeX_10_neon, export=1
+// x0 = filter (int16_t*)
+// w1 = filterSize
+// x2 = src (int16_t**)
+// x3 = dest (uint16_t*)
+// w4 = dstW
+// w5 = big_endian
+// w6 = output_bits
+
+ mov w8, #27
+ sub w8, w8, w6 // shift = 11 + 16 - output_bits
+
+ sub w9, w8, #1
+ mov w10, #1
+ lsl w9, w10, w9 // val = 1 << (shift - 1)
+
+ dup v1.4s, w9
+ dup v2.4s, w9 // Create vectors with val
+
+ mov w17, #0
+ sub w16, w17, w8
+ dup v8.4s, w16 // Create (-shift) vector for right shift
+
+ movi v11.4s, #0
+
+ mov w10, #1
+ lsl w10, w10, w6
+ sub w10, w10, #1 // (1U << output_bits) - 1
+ dup v12.4s, w10 // Create Clip vector for uppr bound
+
+ tst w4, #15 // if dstW divisible by 16, process 16 elements
+ b.ne 4f // else process 8 elements
+
+ mov x7, #0 // i = 0
+1: // Loop
+
+ mov v3.16b, v1.16b
+ mov v4.16b, v2.16b
+ mov v5.16b, v1.16b
+ mov v6.16b, v2.16b
+
+ mov w11, w1 // tmpfilterSize = filterSize
+ mov x12, x2 // srcp = src
+ mov x13, x0 // filterp = filter
+
+2: // Filter loop
+
+ ldp x14, x15, [x12], #16 // get 2 pointers: src[j] and src[j+1]
+ ldr s7, [x13], #4 // load filter coefficients
+ add x14, x14, x7, lsl #1
+ add x15, x15, x7, lsl #1
+ ld1 {v16.8h, v17.8h}, [x14]
+ ld1 {v18.8h, v19.8h}, [x15]
+
+ // Multiply-accumulate
+ smlal v3.4s, v16.4h, v7.h[0]
+ smlal2 v4.4s, v16.8h, v7.h[0]
+ smlal v5.4s, v17.4h, v7.h[0]
+ smlal2 v6.4s, v17.8h, v7.h[0]
+
+ smlal v3.4s, v18.4h, v7.h[1]
+ smlal2 v4.4s, v18.8h, v7.h[1]
+ smlal v5.4s, v19.4h, v7.h[1]
+ smlal2 v6.4s, v19.8h, v7.h[1]
+
+ subs w11, w11, #2 // tmpfilterSize -= 2
+ b.gt 2b // continue filter loop
+
+ // Shift results
+ sshl v3.4s, v3.4s, v8.4s
+ sshl v4.4s, v4.4s, v8.4s
+ sshl v5.4s, v5.4s, v8.4s
+ sshl v6.4s, v6.4s, v8.4s
+
+ // Clamp to 0
+ smax v3.4s, v3.4s, v11.4s
+ smax v4.4s, v4.4s, v11.4s
+ smax v5.4s, v5.4s, v11.4s
+ smax v6.4s, v6.4s, v11.4s
+
+ // Clip upper bound
+ smin v3.4s, v3.4s, v12.4s
+ smin v4.4s, v4.4s, v12.4s
+ smin v5.4s, v5.4s, v12.4s
+ smin v6.4s, v6.4s, v12.4s
+
+ // Narrow to 16-bit
+ xtn v13.4h, v3.4s
+ xtn2 v13.8h, v4.4s
+ xtn v14.4h, v5.4s
+ xtn2 v14.8h, v6.4s
+
+ cbz w5, 3f // Check if big endian
+ rev16 v13.16b, v13.16b
+ rev16 v14.16b, v14.16b // Swap bits for big endian
+3:
+ // Store 16 pixels
+ st1 {v13.8h}, [x3], #16
+ st1 {v14.8h}, [x3], #16
+
+ add x7, x7, #16 // i = i + 16
+ subs w4, w4, #16 // dstW = dstW - 16
+ b.gt 1b // Continue loop
+ b 8f // end
+
+4: // Process 8 elements
+ mov x7, #0
+5: // Loop
+
+ mov v3.16b, v1.16b
+ mov v4.16b, v2.16b
+
+ mov w11, w1
+ mov x12, x2
+ mov x13, x0
+
+6: // Filter loop
+
+ ldp x14, x15, [x12], #16
+ ldr s7, [x13], #4
+ add x14, x14, x7, lsl #1
+ add x15, x15, x7, lsl #1
+ ld1 {v5.8h}, [x14]
+ ld1 {v6.8h}, [x15]
+
+ // Multiply-accumulate
+ smlal v3.4s, v5.4h, v7.h[0]
+ smlal2 v4.4s, v5.8h, v7.h[0]
+ smlal v3.4s, v6.4h, v7.h[1]
+ smlal2 v4.4s, v6.8h, v7.h[1]
+
+ subs w11, w11, #2 // tmpfilterSize -= 2
+ b.gt 6b // loop until filterSize consumed
+
+ // Shift results
+ sshl v3.4s, v3.4s, v8.4s
+ sshl v4.4s, v4.4s, v8.4s
+
+ // Clamp to 0
+ smax v3.4s, v3.4s, v11.4s
+ smax v4.4s, v4.4s, v11.4s
+
+ // Clip upper bound
+ smin v3.4s, v3.4s, v12.4s
+ smin v4.4s, v4.4s, v12.4s
+
+ // Narrow to 16-bit
+ xtn v9.4h, v3.4s
+ xtn v10.4h, v4.4s
+
+ cbz w5, 7f // Check if big endian
+ rev16 v9.8b, v9.8b
+ rev16 v10.8b, v10.8b // Swap bits for big endian
+
+7:
+ // Store 8 pixels
+ st1 {v9.4h}, [x3], #8
+ st1 {v10.4h}, [x3], #8
+
+ add x7, x7, #8 // i = i + 8
+ subs w4, w4, #8 // dstW = dstW - 8
+ b.gt 5b // Continue Loop
+
+8:
+ ret
+endfunc
+
function ff_yuv2planeX_8_neon, export=1
// x0 - const int16_t *filter,
// x1 - int filterSize,
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..23cdb7d26e 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -158,6 +158,29 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
ALL_SCALE_FUNCS(neon);
+void ff_yuv2planeX_10_neon(const int16_t *filter, int filterSize,
+ const int16_t **src, uint16_t *dest, int dstW,
+ int big_endian, int output_bits);
+
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
+static void yuv2planeX_ ## bits ## BE_LE ## _neon(const int16_t *filter, int filterSize, \
+ const int16_t **src, uint8_t *dest, int dstW, \
+ const uint8_t *dither, int offset)\
+{ \
+ ff_yuv2planeX_## template_size ## _neon(filter, \
+ filterSize, (const typeX_t **) src, \
+ (uint16_t *) dest, dstW, is_be, bits); \
+}
+
+yuv2NBPS( 9, BE, 1, 10, int16_t)
+yuv2NBPS( 9, LE, 0, 10, int16_t)
+yuv2NBPS(10, BE, 1, 10, int16_t)
+yuv2NBPS(10, LE, 0, 10, int16_t)
+yuv2NBPS(12, BE, 1, 10, int16_t)
+yuv2NBPS(12, LE, 0, 10, int16_t)
+yuv2NBPS(14, BE, 1, 10, int16_t)
+yuv2NBPS(14, LE, 0, 10, int16_t)
+
void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
@@ -268,6 +291,8 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
{
int cpu_flags = av_get_cpu_flags();
+ enum AVPixelFormat dstFormat = c->opts.dst_format;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
if (have_neon(cpu_flags)) {
ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
@@ -276,6 +301,19 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
if (c->dstBpc == 8) {
c->yuv2planeX = ff_yuv2planeX_8_neon;
}
+
+ if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)) {
+ if (desc->comp[0].depth == 9) {
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_neon : yuv2planeX_9LE_neon;
+ } else if (desc->comp[0].depth == 10) {
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_neon : yuv2planeX_10LE_neon;
+ } else if (desc->comp[0].depth == 12) {
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_neon : yuv2planeX_12LE_neon;
+ } else if (desc->comp[0].depth == 14) {
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_neon : yuv2planeX_14LE_neon;
+ } else
+ av_assert0(0);
+ }
switch (c->opts.src_format) {
case AV_PIX_FMT_ABGR:
c->lumToYV12 = ff_abgr32ToY_neon;
--
2.36.0.windows.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template()
2025-06-06 8:44 [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template() Logaprakash Ramajayam
2025-06-12 5:25 ` Logaprakash Ramajayam
@ 2025-06-18 21:15 ` Martin Storsjö
1 sibling, 0 replies; 3+ messages in thread
From: Martin Storsjö @ 2025-06-18 21:15 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Cc: Dash Santosh Sathyanarayanan, Harshitha Sarangu Suresh
On Fri, 6 Jun 2025, Logaprakash Ramajayam wrote:
> Checked FATE tests and gha-aarch64 git workflow.
>
> From 34cdef26eaebcf98916e9881b3a04f4f698f09c6 Mon Sep 17 00:00:00 2001
> From: Logaprakash Ramajayam <logaprakash.ramajayam@multicorewareinc.com>
> Date: Thu, 5 Jun 2025 01:33:39 -0700
> Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for
> yuv2planeX_10_c_template()
> ---
> libswscale/aarch64/output.S | 167 +++++++++++++++++++++++++++++++++++
> libswscale/aarch64/swscale.c | 38 ++++++++
> 2 files changed, 205 insertions(+)
This is missing checkasm benchmarks for the function. That is presuming
that there is a checkasm test for it. If not, such a test needs to be
written.
> diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
> index 190c438870..e039e820ae 100644
> --- a/libswscale/aarch64/output.S
> +++ b/libswscale/aarch64/output.S
> @@ -20,6 +20,173 @@
>
> #include "libavutil/aarch64/asm.S"
>
> +function ff_yuv2planeX_10_neon, export=1
> +// x0 = filter (int16_t*)
> +// w1 = filterSize
> +// x2 = src (int16_t**)
> +// x3 = dest (uint16_t*)
> +// w4 = dstW
> +// w5 = big_endian
> +// w6 = output_bits
> +
> + mov w8, #27
> + sub w8, w8, w6 // shift = 11 + 16 - output_bits
> +
> + sub w9, w8, #1
> + mov w10, #1
> + lsl w9, w10, w9 // val = 1 << (shift - 1)
> +
> + dup v1.4s, w9
> + dup v2.4s, w9 // Create vectors with val
> +
> + mov w17, #0
> + sub w16, w17, w8
You don't need to assign zero to a register and do subtraction in order to
negate a value, you can also just do "neg w16, w8".
> + dup v8.4s, w16 // Create (-shift) vector for right shift
> +
> + movi v11.4s, #0
> +
> + mov w10, #1
> + lsl w10, w10, w6
> + sub w10, w10, #1 // (1U << output_bits) - 1
> + dup v12.4s, w10 // Create Clip vector for uppr bound
> +
> + tst w4, #15 // if dstW divisible by 16, process 16 elements
> + b.ne 4f // else process 8 elements
Same question as for the other patch; can we assume that it is ok to
always write in increments of 8? If not, we'd need a scalar loop to handle
the tail. And in any case, it's more efficient to use the most unrolled
version of the function for the majority of a line, instead of running the
whole line with a less efficient loop just because the tail doesn't line
up entirely.
> +
> + mov x7, #0 // i = 0
> +1: // Loop
> +
> + mov v3.16b, v1.16b
> + mov v4.16b, v2.16b
> + mov v5.16b, v1.16b
> + mov v6.16b, v2.16b
> +
> + mov w11, w1 // tmpfilterSize = filterSize
> + mov x12, x2 // srcp = src
> + mov x13, x0 // filterp = filter
> +
> +2: // Filter loop
> +
> + ldp x14, x15, [x12], #16 // get 2 pointers: src[j] and src[j+1]
> + ldr s7, [x13], #4 // load filter coefficients
> + add x14, x14, x7, lsl #1
> + add x15, x15, x7, lsl #1
> + ld1 {v16.8h, v17.8h}, [x14]
> + ld1 {v18.8h, v19.8h}, [x15]
> +
> + // Multiply-accumulate
> + smlal v3.4s, v16.4h, v7.h[0]
> + smlal2 v4.4s, v16.8h, v7.h[0]
> + smlal v5.4s, v17.4h, v7.h[0]
> + smlal2 v6.4s, v17.8h, v7.h[0]
> +
> + smlal v3.4s, v18.4h, v7.h[1]
> + smlal2 v4.4s, v18.8h, v7.h[1]
> + smlal v5.4s, v19.4h, v7.h[1]
> + smlal2 v6.4s, v19.8h, v7.h[1]
> +
> + subs w11, w11, #2 // tmpfilterSize -= 2
> + b.gt 2b // continue filter loop
> +
> + // Shift results
> + sshl v3.4s, v3.4s, v8.4s
> + sshl v4.4s, v4.4s, v8.4s
> + sshl v5.4s, v5.4s, v8.4s
> + sshl v6.4s, v6.4s, v8.4s
> +
> + // Clamp to 0
> + smax v3.4s, v3.4s, v11.4s
> + smax v4.4s, v4.4s, v11.4s
> + smax v5.4s, v5.4s, v11.4s
> + smax v6.4s, v6.4s, v11.4s
> +
> + // Clip upper bound
> + smin v3.4s, v3.4s, v12.4s
> + smin v4.4s, v4.4s, v12.4s
> + smin v5.4s, v5.4s, v12.4s
> + smin v6.4s, v6.4s, v12.4s
> +
> + // Narrow to 16-bit
> + xtn v13.4h, v3.4s
> + xtn2 v13.8h, v4.4s
> + xtn v14.4h, v5.4s
> + xtn2 v14.8h, v6.4s
If we are going to narrow things to 16 bit here, I think it would be more
efficient to first narrow to 16 bit. You can do that with sqxtun, then you
also get the clamp to 0 part for free, so you only need to clamp the upper
bound, with half the number of instructions/registers.
> +
> + cbz w5, 3f // Check if big endian
> + rev16 v13.16b, v13.16b
> + rev16 v14.16b, v14.16b // Swap bits for big endian
> +3:
> + // Store 16 pixels
> + st1 {v13.8h}, [x3], #16
> + st1 {v14.8h}, [x3], #16
Write both registers with one store - st1 {v13.8h, v14.8h}, [x3], #32.
> +
> + add x7, x7, #16 // i = i + 16
> + subs w4, w4, #16 // dstW = dstW - 16
> + b.gt 1b // Continue loop
If possible, don't do the calculation that sets the condition codes
directly before the branch; that forces the branch to wait for the
previous instruction to finish. Instead you can move the "subs"
instruction a bit earlier; at least above the "add" above, but it could
also go e.g. after the 3: label.
> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> index 6e5a721c1f..23cdb7d26e 100644
> --- a/libswscale/aarch64/swscale.c
> +++ b/libswscale/aarch64/swscale.c
> @@ -158,6 +158,29 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
>
> ALL_SCALE_FUNCS(neon);
>
> +void ff_yuv2planeX_10_neon(const int16_t *filter, int filterSize,
> + const int16_t **src, uint16_t *dest, int dstW,
> + int big_endian, int output_bits);
Align the later lines with parameters with the parameters on the first
line. See ff_yuv2planeX_8_neon right below.
> +
> +#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
> +static void yuv2planeX_ ## bits ## BE_LE ## _neon(const int16_t *filter, int filterSize, \
> + const int16_t **src, uint8_t *dest, int dstW, \
> + const uint8_t *dither, int offset)\
Same thing here
> +{ \
> + ff_yuv2planeX_## template_size ## _neon(filter, \
> + filterSize, (const typeX_t **) src, \
> + (uint16_t *) dest, dstW, is_be, bits); \
Same thing here
> +}
> +
> +yuv2NBPS( 9, BE, 1, 10, int16_t)
> +yuv2NBPS( 9, LE, 0, 10, int16_t)
> +yuv2NBPS(10, BE, 1, 10, int16_t)
> +yuv2NBPS(10, LE, 0, 10, int16_t)
> +yuv2NBPS(12, BE, 1, 10, int16_t)
> +yuv2NBPS(12, LE, 0, 10, int16_t)
> +yuv2NBPS(14, BE, 1, 10, int16_t)
> +yuv2NBPS(14, LE, 0, 10, int16_t)
FWIW, I appreciate the effort to save code size here by not templating 8
different copies of the same functions, but making it use one single
implementation for all the variants.
> +
> void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
> const int16_t **src, uint8_t *dest, int dstW,
> const uint8_t *dither, int offset);
> @@ -268,6 +291,8 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
> av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
> {
> int cpu_flags = av_get_cpu_flags();
> + enum AVPixelFormat dstFormat = c->opts.dst_format;
> + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
>
> if (have_neon(cpu_flags)) {
> ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
> @@ -276,6 +301,19 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
> if (c->dstBpc == 8) {
> c->yuv2planeX = ff_yuv2planeX_8_neon;
> }
> +
> + if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)) {
> + if (desc->comp[0].depth == 9) {
> + c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_neon : yuv2planeX_9LE_neon;
> + } else if (desc->comp[0].depth == 10) {
> + c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_neon : yuv2planeX_10LE_neon;
> + } else if (desc->comp[0].depth == 12) {
> + c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_neon : yuv2planeX_12LE_neon;
> + } else if (desc->comp[0].depth == 14) {
> + c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_neon : yuv2planeX_14LE_neon;
> + } else
> + av_assert0(0);
The av_assert is misindented.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2025-06-18 21:16 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-06-06 8:44 [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template() Logaprakash Ramajayam
2025-06-12 5:25 ` Logaprakash Ramajayam
2025-06-18 21:15 ` Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git