* [FFmpeg-devel] [PR] lavfi/nlmeans: add aarch64 neon for compute_weights_line (PR #21393)
@ 2026-01-06 13:23 Jun Zhao via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: Jun Zhao via ffmpeg-devel @ 2026-01-06 13:23 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Jun Zhao
PR #21393 opened by Jun Zhao (mypopydev)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21393
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21393.patch
Implement NEON optimization for compute_weights_line.
Also change the three stack-passed arguments (max_meaningful_diff, startx,
endx) from int to ptrdiff_t. The assembly reads each of them from its own
8-byte stack slot, but some AArch64 ABIs (for example Apple's arm64 calling
convention) pack 32-bit stack arguments at 4-byte alignment, so int
arguments would not be located where the assembly expects them. ptrdiff_t
arguments are 8 bytes wide and 8-byte aligned on every AArch64 ABI, which
matches the layout the assembly assumes.
The x86 AVX2 prototype is updated to match the new signature.
Performance benchmark (AArch64):
./tests/checkasm/checkasm --test=vf_nlmeans --bench
compute_weights_line_c: 579.2 ( 1.00x)
compute_weights_line_neon: 110.4 ( 5.25x)
Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
From 46b2331ef11e14dd06d5849a16a63546522ba5f0 Mon Sep 17 00:00:00 2001
From: Jun Zhao <barryjzhao@tencent.com>
Date: Tue, 6 Jan 2026 17:44:01 +0800
Subject: [PATCH] lavfi/nlmeans: add aarch64 neon for compute_weights_line
Implement NEON optimization for compute_weights_line.
Also change the three stack-passed arguments (max_meaningful_diff, startx,
endx) from int to ptrdiff_t. The assembly reads each of them from its own
8-byte stack slot, but some AArch64 ABIs (for example Apple's arm64 calling
convention) pack 32-bit stack arguments at 4-byte alignment, so int
arguments would not be located where the assembly expects them. ptrdiff_t
arguments are 8 bytes wide and 8-byte aligned on every AArch64 ABI, which
matches the layout the assembly assumes.
The x86 AVX2 prototype is updated to match the new signature.
Performance benchmark (AArch64):
./tests/checkasm/checkasm --test=vf_nlmeans --bench
compute_weights_line_c: 579.2 ( 1.00x)
compute_weights_line_neon: 110.4 ( 5.25x)
Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
---
libavfilter/aarch64/vf_nlmeans_init.c | 15 ++-
libavfilter/aarch64/vf_nlmeans_neon.S | 128 ++++++++++++++++++++++++++
libavfilter/vf_nlmeans.h | 4 +-
libavfilter/vf_nlmeans_init.h | 4 +-
libavfilter/x86/vf_nlmeans_init.c | 4 +-
tests/checkasm/vf_nlmeans.c | 108 ++++++++++++++++++++++
6 files changed, 256 insertions(+), 7 deletions(-)
diff --git a/libavfilter/aarch64/vf_nlmeans_init.c b/libavfilter/aarch64/vf_nlmeans_init.c
index 6793370a4a..fbee336322 100644
--- a/libavfilter/aarch64/vf_nlmeans_init.c
+++ b/libavfilter/aarch64/vf_nlmeans_init.c
@@ -25,10 +25,23 @@ void ff_compute_safe_ssd_integral_image_neon(uint32_t *dst, ptrdiff_t dst_linesi
const uint8_t *s2, ptrdiff_t linesize2,
int w, int h);
+void ff_compute_weights_line_neon(const uint32_t *const iia,
+ const uint32_t *const iib,
+ const uint32_t *const iid,
+ const uint32_t *const iie,
+ const uint8_t *const src,
+ float *total_weight,
+ float *sum,
+ const float *const weight_lut,
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx);
+
av_cold void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
- if (have_neon(cpu_flags))
+ if (have_neon(cpu_flags)) {
dsp->compute_safe_ssd_integral_image = ff_compute_safe_ssd_integral_image_neon;
+ dsp->compute_weights_line = ff_compute_weights_line_neon;
+ }
}
diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S b/libavfilter/aarch64/vf_nlmeans_neon.S
index a788cffd85..c913ae4e21 100644
--- a/libavfilter/aarch64/vf_nlmeans_neon.S
+++ b/libavfilter/aarch64/vf_nlmeans_neon.S
@@ -78,3 +78,131 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
b.ne 1b
ret
endfunc
+
+function ff_compute_weights_line_neon, export=1
+        // Accumulate per-pixel weights for columns [startx, endx) of one row.
+        //
+        // Register arguments:
+        //   x0 = iia, x1 = iib, x2 = iid, x3 = iie   (const uint32_t *)
+        //   x4 = src                                 (const uint8_t *)
+        //   x5 = total_weight, x6 = sum              (float *, updated in place)
+        //   x7 = weight_lut                          (const float *)
+        // Stack arguments (ptrdiff_t, one 8-byte slot each):
+        //   [sp+0] = max_meaningful_diff, [sp+8] = startx, [sp+16] = endx
+        //
+        // The stack arguments are loaded at their full 64-bit width so that
+        // the loads agree with the ptrdiff_t prototype. Only the low 32 bits
+        // of max_meaningful_diff (w8) are used as the clamp value; w8 is kept
+        // live for the whole function so the tail loop does not reload it.
+
+        ldr             x8,  [sp]                       // max_meaningful_diff
+        ldp             x9,  x10, [sp, #8]              // startx, endx
+
+        cmp             x9,  x10
+        b.ge            9f                              // nothing to do if startx >= endx
+
+        // Advance every per-pixel pointer to column startx
+        lsl             x11, x9,  #2                    // startx * 4 (uint32_t / float)
+        add             x0,  x0,  x11                   // iia += startx
+        add             x1,  x1,  x11                   // iib += startx
+        add             x2,  x2,  x11                   // iid += startx
+        add             x3,  x3,  x11                   // iie += startx
+        add             x5,  x5,  x11                   // total_weight += startx
+        add             x6,  x6,  x11                   // sum += startx
+        add             x4,  x4,  x9                    // src += startx (1 byte per pixel)
+
+        dup             v30.4s, w8                      // clamp value in all four lanes
+        sub             x10, x10, x9                    // x10 = pixels left to process
+
+1:      // Main loop: four pixels per iteration
+        cmp             x10, #4
+        b.lt            2f                              // fewer than four left: scalar tail
+
+        // Load integral image values
+        ld1             {v0.4s}, [x0], #16              // a
+        ld1             {v1.4s}, [x1], #16              // b
+        ld1             {v2.4s}, [x2], #16              // d
+        ld1             {v3.4s}, [x3], #16              // e
+
+        // diff = e - d - b + a (32-bit wrapping arithmetic)
+        sub             v3.4s, v3.4s, v2.4s             // e - d
+        sub             v3.4s, v3.4s, v1.4s             // e - d - b
+        add             v3.4s, v3.4s, v0.4s             // e - d - b + a
+
+        // idx = min(diff, max_meaningful_diff), unsigned
+        umin            v3.4s, v3.4s, v30.4s
+
+        // There is no gather load: move the four indices to general-purpose
+        // registers (leaving w8 untouched) and fetch each weight separately.
+        mov             w11, v3.s[0]
+        mov             w12, v3.s[1]
+        mov             w13, v3.s[2]
+        mov             w14, v3.s[3]
+
+        // weight_lut[idx]; a scalar ldr clears the rest of the vector register
+        ldr             s0,  [x7, w11, uxtw #2]
+        ldr             s1,  [x7, w12, uxtw #2]
+        ldr             s2,  [x7, w13, uxtw #2]
+        ldr             s3,  [x7, w14, uxtw #2]
+
+        // Gather the four weights into v0 (lane 0 is already in place)
+        mov             v0.s[1], v1.s[0]
+        mov             v0.s[2], v2.s[0]
+        mov             v0.s[3], v3.s[0]
+
+        // Load four source pixels and widen u8 -> u16 -> u32 -> float
+        ld1             {v1.s}[0], [x4], #4
+        uxtl            v1.8h, v1.8b
+        uxtl            v1.4s, v1.4h
+        ucvtf           v1.4s, v1.4s
+
+        // Read-modify-write the accumulators
+        ld1             {v2.4s}, [x5]                   // total_weight
+        ld1             {v3.4s}, [x6]                   // sum
+        fadd            v2.4s, v2.4s, v0.4s             // total_weight += weight
+        fmla            v3.4s, v1.4s, v0.4s             // sum += src * weight (fused)
+        st1             {v2.4s}, [x5], #16
+        st1             {v3.4s}, [x6], #16
+
+        sub             x10, x10, #4
+        b               1b
+
+2:      // Scalar tail: one pixel per iteration
+        cmp             x10, #0
+        b.le            9f
+
+        ldr             w13, [x0], #4                   // a
+        ldr             w14, [x1], #4                   // b
+        ldr             w15, [x2], #4                   // d
+        ldr             w16, [x3], #4                   // e
+
+        sub             w16, w16, w15                   // e - d
+        sub             w16, w16, w14                   // e - d - b
+        add             w16, w16, w13                   // e - d - b + a
+
+        // idx = min(diff, max_meaningful_diff), unsigned
+        cmp             w16, w8
+        csel            w16, w16, w8, ls                // ls = unsigned lower or same
+
+        ldr             s0,  [x7, w16, uxtw #2]         // weight = weight_lut[idx]
+
+        ldrb            w13, [x4], #1                   // src pixel
+        ucvtf           s1,  w13
+
+        ldr             s2,  [x5]                       // total_weight
+        ldr             s3,  [x6]                       // sum
+
+        fadd            s2,  s2,  s0                    // total_weight += weight
+        fmadd           s3,  s1,  s0,  s3               // sum += src * weight (fused)
+
+        str             s2,  [x5], #4
+        str             s3,  [x6], #4
+
+        sub             x10, x10, #1
+        b               2b
+
+9:      ret
+endfunc
diff --git a/libavfilter/vf_nlmeans.h b/libavfilter/vf_nlmeans.h
index 61377f8c69..4d6ab47f54 100644
--- a/libavfilter/vf_nlmeans.h
+++ b/libavfilter/vf_nlmeans.h
@@ -35,8 +35,8 @@ typedef struct NLMeansDSPContext {
float *total_weight,
float *sum,
const float *const weight_lut,
- int max_meaningful_diff,
- int startx, int endx);
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx);
} NLMeansDSPContext;
void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp);
diff --git a/libavfilter/vf_nlmeans_init.h b/libavfilter/vf_nlmeans_init.h
index cf31e74bd7..58ba0fdd83 100644
--- a/libavfilter/vf_nlmeans_init.h
+++ b/libavfilter/vf_nlmeans_init.h
@@ -79,8 +79,8 @@ static void compute_weights_line_c(const uint32_t *const iia,
float *total_weight,
float *sum,
const float *const weight_lut,
- int max_meaningful_diff,
- int startx, int endx)
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx)
{
for (int x = startx; x < endx; x++) {
/*
diff --git a/libavfilter/x86/vf_nlmeans_init.c b/libavfilter/x86/vf_nlmeans_init.c
index 5d67090a98..0adb2c7e8a 100644
--- a/libavfilter/x86/vf_nlmeans_init.c
+++ b/libavfilter/x86/vf_nlmeans_init.c
@@ -28,8 +28,8 @@ void ff_compute_weights_line_avx2(const uint32_t *const iia,
float *total_weight,
float *sum,
const float *const weight_lut,
- int max_meaningful_diff,
- int startx, int endx);
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx);
av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp)
{
diff --git a/tests/checkasm/vf_nlmeans.c b/tests/checkasm/vf_nlmeans.c
index e61a2efae6..23b82cbd54 100644
--- a/tests/checkasm/vf_nlmeans.c
+++ b/tests/checkasm/vf_nlmeans.c
@@ -18,6 +18,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <math.h>
#include "checkasm.h"
#include "libavfilter/vf_nlmeans_init.h"
#include "libavutil/avassert.h"
@@ -29,6 +30,11 @@
((uint32_t *)buf)[i] = rnd(); \
} while (0)
+/* Return nonzero when a and b differ by strictly less than eps. */
+static int float_almost_equal(float a, float b, float eps)
+{
+    const float delta = fabsf(a - b);
+
+    return delta < eps;
+}
+
void checkasm_check_nlmeans(void)
{
NLMeansDSPContext dsp = {0};
@@ -110,5 +116,107 @@ void checkasm_check_nlmeans(void)
av_freep(&src);
}
+    if (check_func(dsp.compute_weights_line, "compute_weights_line")) {
+        /*
+         * Run the reference and the candidate implementation over columns
+         * [startx, endx) of one row and require matching accumulators over
+         * the whole buffers, padding included, so that a write outside the
+         * requested range is reported as a mismatch as well.
+         */
+        const int test_w = 256;
+        const int buf_len = test_w + 16; // row width plus padding
+        const int max_meaningful_diff = 255;
+        const int startx = 10;
+        const int endx = 200; // 190 columns: not a multiple of 4
+
+        uint32_t *iia = av_malloc_array(buf_len, sizeof(uint32_t));
+        uint32_t *iib = av_malloc_array(buf_len, sizeof(uint32_t));
+        uint32_t *iid = av_malloc_array(buf_len, sizeof(uint32_t));
+        uint32_t *iie = av_malloc_array(buf_len, sizeof(uint32_t));
+        uint8_t *src = av_malloc(buf_len);
+        // av_calloc() returns zeroed memory, so every accumulator starts at 0
+        float *tw_ref = av_calloc(buf_len, sizeof(float));
+        float *tw_new = av_calloc(buf_len, sizeof(float));
+        float *sum_ref = av_calloc(buf_len, sizeof(float));
+        float *sum_new = av_calloc(buf_len, sizeof(float));
+        float *lut = av_malloc_array(max_meaningful_diff + 1, sizeof(float));
+
+        declare_func(void, const uint32_t *const iia,
+                     const uint32_t *const iib,
+                     const uint32_t *const iid,
+                     const uint32_t *const iie,
+                     const uint8_t *const src,
+                     float *total_weight,
+                     float *sum,
+                     const float *const weight_lut,
+                     ptrdiff_t max_meaningful_diff,
+                     ptrdiff_t startx, ptrdiff_t endx);
+
+        if (!iia || !iib || !iid || !iie || !src || !tw_ref || !tw_new ||
+            !sum_ref || !sum_new || !lut)
+            goto cleanup_weights;
+
+        // LUT: lut[i] = exp(-i * 0.01), one entry per possible clamped diff
+        for (int i = 0; i <= max_meaningful_diff; i++)
+            lut[i] = expf(-i * 0.01f);
+
+        // Random source pixels
+        for (int i = 0; i < test_w; i++)
+            src[i] = rnd() & 0xff;
+
+        // Build the four integral-image rows so that e - d - b + a equals a
+        // random value in [0, max_meaningful_diff] for every column.
+        for (int i = 0; i < test_w; i++) {
+            uint32_t base = rnd() % 1000;
+            iia[i] = base;
+            iib[i] = base + (rnd() % 100);
+            iid[i] = base + (rnd() % 100);
+            // e = a + (b - a) + (d - a) + diff, hence e - d - b + a == diff
+            uint32_t diff = rnd() % (max_meaningful_diff + 1);
+            iie[i] = iia[i] + (iib[i] - iia[i]) + (iid[i] - iia[i]) + diff;
+        }
+
+        call_ref(iia, iib, iid, iie, src, tw_ref, sum_ref, lut,
+                 max_meaningful_diff, startx, endx);
+        call_new(iia, iib, iid, iie, src, tw_new, sum_new, lut,
+                 max_meaningful_diff, startx, endx);
+
+        // Compare the complete buffers with a small floating-point tolerance;
+        // columns outside [startx, endx) must still hold their initial zero.
+        for (int i = 0; i < buf_len; i++) {
+            if (!float_almost_equal(tw_ref[i], tw_new[i], 1e-5f)) {
+                fprintf(stderr, "total_weight mismatch at %d: ref=%f new=%f\n",
+                        i, tw_ref[i], tw_new[i]);
+                fail();
+                break;
+            }
+            if (!float_almost_equal(sum_ref[i], sum_new[i], 1e-4f)) {
+                fprintf(stderr, "sum mismatch at %d: ref=%f new=%f\n",
+                        i, sum_ref[i], sum_new[i]);
+                fail();
+                break;
+            }
+        }
+
+        // Benchmark, starting again from cleared accumulators
+        memset(tw_new, 0, buf_len * sizeof(float));
+        memset(sum_new, 0, buf_len * sizeof(float));
+        bench_new(iia, iib, iid, iie, src, tw_new, sum_new, lut,
+                  max_meaningful_diff, startx, endx);
+
+cleanup_weights:
+        av_freep(&iia);
+        av_freep(&iib);
+        av_freep(&iid);
+        av_freep(&iie);
+        av_freep(&src);
+        av_freep(&tw_ref);
+        av_freep(&tw_new);
+        av_freep(&sum_ref);
+        av_freep(&sum_new);
+        av_freep(&lut);
+    }
+
report("dsp");
}
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2026-01-06 13:24 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-06 13:23 [FFmpeg-devel] [PR] lavfi/nlmeans: add aarch64 neon for compute_weights_line (PR #21393) Jun Zhao via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git