* [FFmpeg-devel] [PATCH v2] avfilter/x86/vf_nlmeans: add AVX2 safe ssd integral image
@ 2026-02-02 17:35 Andy Wu via ffmpeg-devel
2026-02-03 1:39 ` [FFmpeg-devel] " mypopy--- via ffmpeg-devel
0 siblings, 1 reply; 2+ messages in thread
From: Andy Wu via ffmpeg-devel @ 2026-02-02 17:35 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: lpageo
Add an AVX2 implementation of compute_safe_ssd_integral_image used by vf_nlmeans.
checkasm: vf_nlmeans (x86_64, Windows/MSVC)
checkasm: vf_nlmeans (x86_64, Linux/WSL)
bench: (x86_64, Windows/MSVC) ssd_integral_image 1.94x
bench: (x86_64, Linux/WSL) ssd_integral_image 1.60x
Signed-off-by: Andy Wu <lpageo@163.com>
---
v2: wrap duplicated 8-pixel block in a macro
v2: update bench numbers (Linux/WSL and Windows/MSVC)
libavfilter/x86/vf_nlmeans.asm | 81 +++++++++++++++++++++++++++++++
libavfilter/x86/vf_nlmeans_init.c | 9 +++-
2 files changed, 89 insertions(+), 1 deletion(-)
diff --git a/libavfilter/x86/vf_nlmeans.asm b/libavfilter/x86/vf_nlmeans.asm
index 8f57801035..90cbdabe86 100644
--- a/libavfilter/x86/vf_nlmeans.asm
+++ b/libavfilter/x86/vf_nlmeans.asm
@@ -37,6 +37,87 @@ ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\
SECTION .text
+%macro PROCESS_8_SSD_INTEGRAL 0
+ pmovzxbd m0, [s1q + xq]
+ pmovzxbd m1, [s2q + xq]
+ psubd m0, m1
+ pmulld m0, m0
+
+ movu m1, [dst_topq + xq*4]
+ movu m2, [dst_topq + xq*4 - 4]
+ psubd m1, m2
+ paddd m0, m1
+
+ mova m5, m0
+ pslldq m5, 4
+ paddd m0, m5
+ mova m5, m0
+ pslldq m5, 8
+ paddd m0, m5
+ mova m5, m0
+ pslldq m5, 16
+ paddd m0, m5
+
+ vextracti128 xm5, m0, 0
+ pshufd xm5, xm5, 0xff
+ pxor m4, m4
+ vinserti128 m4, m4, xm5, 1
+ paddd m0, m4
+
+ movd xm5, carryd
+ vpbroadcastd m4, xm5
+ paddd m0, m4
+
+ movu [dstq + xq*4], m0
+
+ vextracti128 xm5, m0, 1
+ pshufd xm5, xm5, 0xff
+ movd carryd, xm5
+
+ add xq, 8
+%endmacro
+
+; void ff_compute_safe_ssd_integral_image(uint32_t *dst, ptrdiff_t dst_linesize_32,
+; const uint8_t *s1, ptrdiff_t linesize1,
+; const uint8_t *s2, ptrdiff_t linesize2,
+; int w, int h);
+;
+; Assumptions (see C version):
+; - w is multiple of 16 and w >= 16
+; - h >= 1
+; - dst[-1] and dst_top[-1] are readable
+
+INIT_YMM avx2
+cglobal compute_safe_ssd_integral_image, 8, 14, 6, 0, dst, dst_lz, s1, ls1, s2, ls2, w, h, dst_top, dst_stride, x, carry, tmp
+ mov wd, dword wm
+ mov hd, dword hm
+ movsxd wq, wd
+
+ mov dst_strideq, dst_lzq
+ shl dst_strideq, 2
+ mov dst_topq, dstq
+ sub dst_topq, dst_strideq
+
+.yloop:
+ xor xq, xq
+ mov carryd, [dstq - 4]
+
+.xloop:
+ ; ---- process 8 pixels ----
+ PROCESS_8_SSD_INTEGRAL
+ ; ---- process 8 pixels ----
+ PROCESS_8_SSD_INTEGRAL
+ cmp xq, wq
+ jl .xloop
+
+ add s1q, ls1q
+ add s2q, ls2q
+ add dstq, dst_strideq
+ add dst_topq, dst_strideq
+ dec hd
+ jg .yloop
+ RET
+
; void ff_compute_weights_line(const uint32_t *const iia,
; const uint32_t *const iib,
; const uint32_t *const iid,
diff --git a/libavfilter/x86/vf_nlmeans_init.c b/libavfilter/x86/vf_nlmeans_init.c
index 0adb2c7e8a..5bfdc7e028 100644
--- a/libavfilter/x86/vf_nlmeans_init.c
+++ b/libavfilter/x86/vf_nlmeans_init.c
@@ -20,6 +20,11 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_nlmeans.h"
+void ff_compute_safe_ssd_integral_image_avx2(uint32_t *dst, ptrdiff_t dst_linesize_32,
+ const uint8_t *s1, ptrdiff_t linesize1,
+ const uint8_t *s2, ptrdiff_t linesize2,
+ int w, int h);
+
void ff_compute_weights_line_avx2(const uint32_t *const iia,
const uint32_t *const iib,
const uint32_t *const iid,
@@ -36,7 +41,9 @@ av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp)
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_AVX2_FAST(cpu_flags))
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ dsp->compute_safe_ssd_integral_image = ff_compute_safe_ssd_integral_image_avx2;
dsp->compute_weights_line = ff_compute_weights_line_avx2;
+ }
#endif
}
--
2.45.1.windows.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] 2+ messages in thread
* [FFmpeg-devel] Re: [PATCH v2] avfilter/x86/vf_nlmeans: add AVX2 safe ssd integral image
2026-02-02 17:35 [FFmpeg-devel] [PATCH v2] avfilter/x86/vf_nlmeans: add AVX2 safe ssd integral image Andy Wu via ffmpeg-devel
@ 2026-02-03 1:39 ` mypopy--- via ffmpeg-devel
0 siblings, 0 replies; 2+ messages in thread
From: mypopy--- via ffmpeg-devel @ 2026-02-03 1:39 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: lpageo, mypopy
On Tue, Feb 3, 2026 at 1:36 AM Andy Wu via ffmpeg-devel
<ffmpeg-devel@ffmpeg.org> wrote:
>
> Add an AVX2 implementation of compute_safe_ssd_integral_image used by vf_nlmeans.
>
> checkasm: vf_nlmeans (x86_64, Windows/MSVC)
> checkasm: vf_nlmeans (x86_64, Linux/WSL)
>
> bench: (x86_64, Windows/MSVC) ssd_integral_image 1.94x
> bench: (x86_64, Linux/WSL) ssd_integral_image 1.60x
>
> Signed-off-by: Andy Wu <lpageo@163.com>
> ---
>
> v2: wrap duplicated 8-pixel block in a macro
> v2: update bench numbers (Linux/WSL and Windows/MSVC)
>
> libavfilter/x86/vf_nlmeans.asm | 81 +++++++++++++++++++++++++++++++
> libavfilter/x86/vf_nlmeans_init.c | 9 +++-
> 2 files changed, 89 insertions(+), 1 deletion(-)
>
> diff --git a/libavfilter/x86/vf_nlmeans.asm b/libavfilter/x86/vf_nlmeans.asm
> index 8f57801035..90cbdabe86 100644
> --- a/libavfilter/x86/vf_nlmeans.asm
> +++ b/libavfilter/x86/vf_nlmeans.asm
> @@ -37,6 +37,87 @@ ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\
>
> SECTION .text
>
> +%macro PROCESS_8_SSD_INTEGRAL 0
> + pmovzxbd m0, [s1q + xq]
> + pmovzxbd m1, [s2q + xq]
> + psubd m0, m1
> + pmulld m0, m0
> +
> + movu m1, [dst_topq + xq*4]
> + movu m2, [dst_topq + xq*4 - 4]
> + psubd m1, m2
> + paddd m0, m1
> +
> + mova m5, m0
> + pslldq m5, 4
> + paddd m0, m5
> + mova m5, m0
> + pslldq m5, 8
> + paddd m0, m5
> + mova m5, m0
> + pslldq m5, 16
> + paddd m0, m5
> +
> + vextracti128 xm5, m0, 0
> + pshufd xm5, xm5, 0xff
> + pxor m4, m4
> + vinserti128 m4, m4, xm5, 1
> + paddd m0, m4
> +
> + movd xm5, carryd
> + vpbroadcastd m4, xm5
> + paddd m0, m4
> +
> + movu [dstq + xq*4], m0
> +
> + vextracti128 xm5, m0, 1
> + pshufd xm5, xm5, 0xff
> + movd carryd, xm5
> +
> + add xq, 8
> +%endmacro
> +
> +; void ff_compute_safe_ssd_integral_image(uint32_t *dst, ptrdiff_t dst_linesize_32,
> +; const uint8_t *s1, ptrdiff_t linesize1,
> +; const uint8_t *s2, ptrdiff_t linesize2,
> +; int w, int h);
> +;
> +; Assumptions (see C version):
> +; - w is multiple of 16 and w >= 16
> +; - h >= 1
> +; - dst[-1] and dst_top[-1] are readable
> +
> +INIT_YMM avx2
> +cglobal compute_safe_ssd_integral_image, 8, 14, 6, 0, dst, dst_lz, s1, ls1, s2, ls2, w, h, dst_top, dst_stride, x, carry, tmp
Why are there 13 parameters? In fact, the variable `tmp` does not
appear to be used within the function.
> + mov wd, dword wm
> + mov hd, dword hm
> + movsxd wq, wd
> +
> + mov dst_strideq, dst_lzq
> + shl dst_strideq, 2
> + mov dst_topq, dstq
> + sub dst_topq, dst_strideq
> +
> +.yloop:
> + xor xq, xq
> + mov carryd, [dstq - 4]
> +
> +.xloop:
> + ; ---- process 8 pixels ----
> + PROCESS_8_SSD_INTEGRAL
> + ; ---- process 8 pixels ----
> + PROCESS_8_SSD_INTEGRAL
> + cmp xq, wq
> + jl .xloop
> +
> + add s1q, ls1q
> + add s2q, ls2q
> + add dstq, dst_strideq
> + add dst_topq, dst_strideq
> + dec hd
> + jg .yloop
> + RET
> +
> ; void ff_compute_weights_line(const uint32_t *const iia,
> ; const uint32_t *const iib,
> ; const uint32_t *const iid,
> diff --git a/libavfilter/x86/vf_nlmeans_init.c b/libavfilter/x86/vf_nlmeans_init.c
> index 0adb2c7e8a..5bfdc7e028 100644
> --- a/libavfilter/x86/vf_nlmeans_init.c
> +++ b/libavfilter/x86/vf_nlmeans_init.c
> @@ -20,6 +20,11 @@
> #include "libavutil/x86/cpu.h"
> #include "libavfilter/vf_nlmeans.h"
>
> +void ff_compute_safe_ssd_integral_image_avx2(uint32_t *dst, ptrdiff_t dst_linesize_32,
> + const uint8_t *s1, ptrdiff_t linesize1,
> + const uint8_t *s2, ptrdiff_t linesize2,
> + int w, int h);
> +
> void ff_compute_weights_line_avx2(const uint32_t *const iia,
> const uint32_t *const iib,
> const uint32_t *const iid,
> @@ -36,7 +41,9 @@ av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp)
> #if ARCH_X86_64
> int cpu_flags = av_get_cpu_flags();
>
> - if (EXTERNAL_AVX2_FAST(cpu_flags))
> + if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> + dsp->compute_safe_ssd_integral_image = ff_compute_safe_ssd_integral_image_avx2;
> dsp->compute_weights_line = ff_compute_weights_line_avx2;
> + }
> #endif
> }
> --
> 2.45.1.windows.1
>
--
=======================================
Jun zhao/赵军
+++++++++++++++++++++++++++++++++++++++
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-02-03 1:40 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-02 17:35 [FFmpeg-devel] [PATCH v2] avfilter/x86/vf_nlmeans: add AVX2 safe ssd integral image Andy Wu via ffmpeg-devel
2026-02-03 1:39 ` [FFmpeg-devel] " mypopy--- via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git