* [FFmpeg-devel] [PATCH] avcodec/aarch64: add NEON optimization for draw_edges
@ 2026-01-22 8:50 chenmingbiao via ffmpeg-devel
2026-01-22 9:11 ` [FFmpeg-devel] " Martin Storsjö via ffmpeg-devel
0 siblings, 1 reply; 2+ messages in thread
From: chenmingbiao via ffmpeg-devel @ 2026-01-22 8:50 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: chenmingbiao
From: mbillchan <mbillchan@gmail.com>
Performance improvement for draw_edges function using NEON instructions.
Optimized for edge widths 4, 8, and 16.
Benchmark results (AArch64):
draw_edges_8_1724_4_c: 851.4
draw_edges_8_1724_4_neon: 824.7 (1.03x)
draw_edges_8_1724_16_c: 1620.9
draw_edges_8_1724_16_neon: 1049.9 (1.54x)
draw_edges_1920_4_16_c: 849.0
draw_edges_1920_4_16_neon: 571.1 (1.49x)
---
libavcodec/aarch64/mpegvideoencdsp_init.c | 3 +
libavcodec/aarch64/mpegvideoencdsp_neon.S | 157 ++++++++++++++++++++++
tests/checkasm/mpegvideoencdsp.c | 5 +
3 files changed, 165 insertions(+)
diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
index 0402ac9e82..8363b18f48 100644
--- a/libavcodec/aarch64/mpegvideoencdsp_init.c
+++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
@@ -26,6 +26,8 @@
int ff_pix_sum16_neon(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_norm1_neon(const uint8_t *pix, ptrdiff_t line_size);
+void ff_draw_edges_neon(uint8_t *buf, ptrdiff_t wrap, int width, int height,
+ int w, int h, int sides);
#if HAVE_DOTPROD
int ff_pix_norm1_neon_dotprod(const uint8_t *pix, ptrdiff_t line_size);
@@ -39,6 +41,7 @@ av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
if (have_neon(cpu_flags)) {
c->pix_sum = ff_pix_sum16_neon;
c->pix_norm1 = ff_pix_norm1_neon;
+ c->draw_edges = ff_draw_edges_neon;
}
#if HAVE_DOTPROD
diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
index 8175d8975e..6b9cfc62ea 100644
--- a/libavcodec/aarch64/mpegvideoencdsp_neon.S
+++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
@@ -91,3 +91,160 @@ endfunc
DISABLE_DOTPROD
#endif
+
+function ff_draw_edges_neon, export=1
+ mov x9, x0
+ mov x10, x1
+ uxtw x11, w2
+ uxtw x12, w3
+ uxtw x13, w4
+ uxtw x14, w5
+ uxtw x15, w6
+ mov x0, x9
+ mov x1, x12
+
+ cmp w13, #16
+ b.eq 1f
+ cmp w13, #8
+ b.eq 2f
+ b 3f
+
+1:
+ ldrb w2, [x0]
+ dup v0.16b, w2
+ sub x3, x0, #16
+ st1 {v0.16b}, [x3]
+
+ add x3, x0, x11
+ sub x4, x3, #1
+ ldrb w2, [x4]
+ dup v1.16b, w2
+ st1 {v1.16b}, [x3]
+
+ add x0, x0, x10
+ subs x1, x1, #1
+ b.ne 1b
+ b 4f
+
+2:
+ ldrb w2, [x0]
+ dup v0.8b, w2
+ sub x3, x0, #8
+ st1 {v0.8b}, [x3]
+
+ add x3, x0, x11
+ sub x4, x3, #1
+ ldrb w2, [x4]
+ dup v1.8b, w2
+ st1 {v1.8b}, [x3]
+
+ add x0, x0, x10
+ subs x1, x1, #1
+ b.ne 2b
+ b 4f
+
+3:
+ ldrb w2, [x0]
+ dup v0.8b, w2
+ sub x3, x0, #4
+ st1 {v0.s}[0], [x3]
+
+ add x3, x0, x11
+ sub x4, x3, #1
+ ldrb w2, [x4]
+ dup v1.8b, w2
+ st1 {v1.s}[0], [x3]
+
+ add x0, x0, x10
+ subs x1, x1, #1
+ b.ne 3b
+
+4:
+ sub x9, x9, x13
+ tbnz w15, #0, 5f
+ b 6f
+
+5:
+ add x2, x11, x13, lsl #1
+ mov x3, x14
+ mov x4, x9
+ sub x5, x9, x10
+
+7:
+ mov x6, x5
+ mov x7, x4
+ mov x8, x2
+
+8:
+ cmp x8, #64
+ b.lt 17f
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x7], #64
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], #64
+ sub x8, x8, #64
+ b 8b
+
+17:
+ cmp x8, #16
+ b.lt 9f
+ ld1 {v0.16b}, [x7], #16
+ st1 {v0.16b}, [x6], #16
+ sub x8, x8, #16
+ b 17b
+9:
+ cbz x8, 10f
+ ldrb w16, [x7], #1
+ strb w16, [x6], #1
+ sub x8, x8, #1
+ b 9b
+10:
+ sub x5, x5, x10
+ subs x3, x3, #1
+ b.ne 7b
+
+6:
+ tbnz w15, #1, 11f
+ b 12f
+
+11:
+ sub x3, x12, #1
+ mul x3, x3, x10
+ add x4, x9, x3
+ add x5, x4, x10
+
+ add x2, x11, x13, lsl #1
+ mov x3, x14
+
+13:
+ mov x6, x5
+ mov x7, x4
+ mov x8, x2
+
+14:
+ cmp x8, #64
+ b.lt 18f
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x7], #64
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], #64
+ sub x8, x8, #64
+ b 14b
+
+18:
+ cmp x8, #16
+ b.lt 15f
+ ld1 {v0.16b}, [x7], #16
+ st1 {v0.16b}, [x6], #16
+ sub x8, x8, #16
+ b 18b
+15:
+ cbz x8, 16f
+ ldrb w16, [x7], #1
+ strb w16, [x6], #1
+ sub x8, x8, #1
+ b 15b
+16:
+ add x5, x5, x10
+ subs x3, x3, #1
+ b.ne 13b
+
+12:
+ ret
+endfunc
diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index 955cd9f5b7..fdaed08e86 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -147,8 +147,13 @@ static void check_draw_edges(MpegvideoEncDSPContext *c)
LOCAL_ALIGNED_16(uint8_t, buf0, [BUFSIZE]);
LOCAL_ALIGNED_16(uint8_t, buf1, [BUFSIZE]);
+#if ARCH_X86
declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *buf, ptrdiff_t wrap, int width, int height,
int w, int h, int sides);
+#else
+ declare_func(void, uint8_t *buf, ptrdiff_t wrap, int width, int height,
+ int w, int h, int sides);
+#endif
for (int isi = 0; isi < FF_ARRAY_ELEMS(input_sizes); isi++) {
int input_size = input_sizes[isi];
--
2.48.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] 2+ messages in thread* [FFmpeg-devel] Re: [PATCH] avcodec/aarch64: add NEON optimization for draw_edges
2026-01-22 8:50 [FFmpeg-devel] [PATCH] avcodec/aarch64: add NEON optimization for draw_edges chenmingbiao via ffmpeg-devel
@ 2026-01-22 9:11 ` Martin Storsjö via ffmpeg-devel
0 siblings, 0 replies; 2+ messages in thread
From: Martin Storsjö via ffmpeg-devel @ 2026-01-22 9:11 UTC (permalink / raw)
To: chenmingbiao via ffmpeg-devel; +Cc: Martin Storsjö
On Thu, 22 Jan 2026, chenmingbiao via ffmpeg-devel wrote:
> From: mbillchan <mbillchan@gmail.com>
>
> Performance improvement for draw_edges function using NEON instructions.
> Optimized for edge widths 4, 8, and 16.
>
> Benchmark results (AArch64):
> draw_edges_8_1724_4_c: 851.4
> draw_edges_8_1724_4_neon: 824.7 (1.03x)
> draw_edges_8_1724_16_c: 1620.9
> draw_edges_8_1724_16_neon: 1049.9 (1.54x)
> draw_edges_1920_4_16_c: 849.0
> draw_edges_1920_4_16_neon: 571.1 (1.49x)
> ---
Please submit this patch at https://code.ffmpeg.org/FFmpeg/FFmpeg. This
includes CI checking of the code, and will indicate to you that your
assembly is inconsistently indented compared with all the existing
assembly. (You can also check this locally by running
./tools/check_arm_indent.sh.)
// Martin
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-01-22 9:12 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-22 8:50 [FFmpeg-devel] [PATCH] avcodec/aarch64: add NEON optimization for draw_edges chenmingbiao via ffmpeg-devel
2026-01-22 9:11 ` [FFmpeg-devel] " Martin Storsjö via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git