[FFmpeg-devel] Re: [PATCH] avfilter/x86/vf_nlmeans: add AVX2 safe ssd integral image

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

From: mypopy--- via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Andy Wu <lpageo@163.com>, "mypopy@gmail.com" <mypopy@gmail.com>
Subject: [FFmpeg-devel] Re: [PATCH] avfilter/x86/vf_nlmeans: add AVX2 safe ssd integral image
Date: Mon, 2 Feb 2026 15:18:35 +0800
Message-ID: <CACYjbn0m6rrQuKYX5Gud-RLgWqkGzzjumzAWu=p_zv6yaC_ekQ@mail.gmail.com> (raw)
In-Reply-To: <20260112055109.282-1-lpageo@163.com>

On Mon, Jan 12, 2026 at 1:52 PM Andy Wu via ffmpeg-devel
<ffmpeg-devel@ffmpeg.org> wrote:
>
> Add an AVX2 implementation of compute_safe_ssd_integral_image used by vf_nlmeans.
>
> checkasm: vf_nlmeans
>
> bench: (x86_64, Linux) ssd_integral_image 1.93x
>
> bench: (x86_64, Windows/MSVC) ssd_integral_image 1.71x
>
> Signed-off-by: Andy Wu <lpageo@163.com>
> ---
>  libavfilter/x86/vf_nlmeans.asm    | 114 ++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_nlmeans_init.c |   9 ++-
>  2 files changed, 122 insertions(+), 1 deletion(-)
>
> diff --git a/libavfilter/x86/vf_nlmeans.asm b/libavfilter/x86/vf_nlmeans.asm
> index 8f57801035..c61593b916 100644
> --- a/libavfilter/x86/vf_nlmeans.asm
> +++ b/libavfilter/x86/vf_nlmeans.asm
> @@ -37,6 +37,120 @@ ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\
>
>  SECTION .text
>
> +; void ff_compute_safe_ssd_integral_image(uint32_t *dst, ptrdiff_t dst_linesize_32,
> +;                                         const uint8_t *s1, ptrdiff_t linesize1,
> +;                                         const uint8_t *s2, ptrdiff_t linesize2,
> +;                                         int w, int h);
> +;
> +; Assumptions (see C version):
> +; - w is multiple of 16 and w >= 16
> +; - h >= 1
> +; - dst[-1] and dst_top[-1] are readable
> +
> +INIT_YMM avx2 ; function below fills dst one row per .yloop pass, 16 pixels (two 8-pixel steps) per .xloop pass
> +cglobal compute_safe_ssd_integral_image, 8, 14, 6, 0, dst, dst_lz, s1, ls1, s2, ls2, w, h, dst_top, dst_stride, x, carry, tmp ; NOTE(review): tmp is named but never referenced below
> +    mov            wd, dword wm ; wd = w (NOTE(review): wm/hm are presumably x86inc's argument-slot forms -- confirm)
> +    mov            hd, dword hm ; hd = h
> +    movsxd         wq, wd ; sign-extend w to 64 bits; it is compared against xq at the end of .xloop
> +
> +    mov   dst_strideq, dst_lzq ; dst_stride = dst_lz ...
> +    shl   dst_strideq, 2 ; ... * 4: row pitch in bytes for 32-bit dst elements
> +    mov      dst_topq, dstq
> +    sub      dst_topq, dst_strideq ; dst_top = dst - dst_stride, i.e. the row above dst
> +
> +.yloop:
> +    xor           xq, xq ; x = 0 (pixel index within the row)
> +    mov       carryd, [dstq - 4] ; carry = dst[-1], the 32-bit value just left of the row start
> +
> +.xloop:
> +    ; ---- process 8 pixels ---- dst[x+i] = carry + sum_{j<=i} ((s1-s2)^2 + dst_top[x+j] - dst_top[x+j-1])
> +    pmovzxbd      m0, [s1q + xq] ; m0 = 8 bytes of s1 zero-extended to 8 dwords
> +    pmovzxbd      m1, [s2q + xq] ; m1 = 8 bytes of s2 zero-extended to 8 dwords
> +    psubd         m0, m1 ; m0 = s1 - s2
> +    pmulld        m0, m0 ; m0 = (s1 - s2)^2
> +
> +    movu          m1, [dst_topq + xq*4] ; m1 = dst_top[x .. x+7]
> +    movu          m2, [dst_topq + xq*4 - 4] ; m2 = dst_top[x-1 .. x+6]
> +    psubd         m1, m2 ; m1 = dst_top[i] - dst_top[i-1] per element
> +    paddd         m0, m1 ; m0 = per-pixel increment (squared diff + top-row delta)
> +
> +    mova          m5, m0
> +    pslldq        m5, 4 ; shift left by one dword (byte shifts act within each 128-bit lane)
> +    paddd         m0, m5 ; each element += its left neighbour
> +    mova          m5, m0
> +    pslldq        m5, 8 ; shift left by two dwords
> +    paddd         m0, m5 ; m0 = inclusive prefix sum inside each 128-bit lane
> +    mova          m5, m0
> +    pslldq        m5, 16 ; NOTE(review): a 16-byte per-lane shift should leave m5 all zero -- confirm
> +    paddd         m0, m5 ; ... which would make this add a no-op
> +
> +    vextracti128 xm5, m0, 0 ; xm5 = low 128-bit lane of m0
> +    pshufd      xm5, xm5, 0xff ; replicate its last dword (the low-lane total) into all four dwords
> +    pxor          m4, m4 ; m4 = 0
> +    vinserti128   m4, m4, xm5, 1 ; m4 = low lane 0, high lane = low-lane total
> +    paddd         m0, m4 ; carry the low-lane total into the high lane: full 8-wide prefix sum
> +
> +    movd        xm5, carryd
> +    vpbroadcastd  m4, xm5 ; m4 = carry in all 8 dwords
> +    paddd         m0, m4 ; add the running sum coming from the left
> +
> +    movu [dstq + xq*4], m0 ; dst[x .. x+7] = m0
> +
> +    vextracti128 xm5, m0, 1 ; xm5 = high 128-bit lane of the result
> +    pshufd      xm5, xm5, 0xff ; select its last dword, i.e. the value just stored at dst[x+7]
> +    movd      carryd, xm5 ; carry = dst[x+7] for the next 8 pixels
> +
> +    add           xq, 8 ; x += 8
> +
> +    ; ---- process 8 pixels ---- (instruction-for-instruction repeat of the step above, at the advanced x)
> +    pmovzxbd      m0, [s1q + xq] ; m0 = next 8 bytes of s1 as dwords
> +    pmovzxbd      m1, [s2q + xq] ; m1 = next 8 bytes of s2 as dwords
> +    psubd         m0, m1
> +    pmulld        m0, m0 ; m0 = (s1 - s2)^2
> +
> +    movu          m1, [dst_topq + xq*4]
> +    movu          m2, [dst_topq + xq*4 - 4]
> +    psubd         m1, m2 ; m1 = top-row delta
> +    paddd         m0, m1 ; m0 = per-pixel increment
> +
> +    mova          m5, m0
> +    pslldq        m5, 4
> +    paddd         m0, m5
> +    mova          m5, m0
> +    pslldq        m5, 8
> +    paddd         m0, m5 ; m0 = inclusive prefix sum inside each 128-bit lane
> +    mova          m5, m0
> +    pslldq        m5, 16 ; NOTE(review): same apparent no-op as in the first step -- confirm
> +    paddd         m0, m5
> +
> +    vextracti128 xm5, m0, 0
> +    pshufd      xm5, xm5, 0xff
> +    pxor          m4, m4
> +    vinserti128   m4, m4, xm5, 1
> +    paddd         m0, m4 ; carry the low-lane total into the high lane
> +
> +    movd        xm5, carryd
> +    vpbroadcastd  m4, xm5
> +    paddd         m0, m4 ; add the running sum coming from the left
> +
> +    movu [dstq + xq*4], m0 ; dst[x .. x+7] = m0
> +
> +    vextracti128 xm5, m0, 1
> +    pshufd      xm5, xm5, 0xff
> +    movd      carryd, xm5 ; carry = last dword just stored
> +
> +    add           xq, 8 ; x += 8
the duplicated block of code for processing 8 pixels could be wrapped
in a macro，it will make the code more maintainable and shorter.
> +    cmp           xq, wq
> +    jl .xloop ; keep going while x < w (signed compare)
> +
> +    add          s1q, ls1q ; advance s1 by linesize1 bytes
> +    add          s2q, ls2q ; advance s2 by linesize2 bytes
> +    add         dstq, dst_strideq ; advance dst to the next row
> +    add     dst_topq, dst_strideq ; dst_top now points at the row just written
> +    dec           hd
> +    jg .yloop ; repeat while rows remain (h still > 0 after the decrement)
> +    RET
> +
>  ; void ff_compute_weights_line(const uint32_t *const iia,
>  ;                              const uint32_t *const iib,
>  ;                              const uint32_t *const iid,
> diff --git a/libavfilter/x86/vf_nlmeans_init.c b/libavfilter/x86/vf_nlmeans_init.c
> index 0adb2c7e8a..5bfdc7e028 100644
> --- a/libavfilter/x86/vf_nlmeans_init.c
> +++ b/libavfilter/x86/vf_nlmeans_init.c
> @@ -20,6 +20,11 @@
>  #include "libavutil/x86/cpu.h"
>  #include "libavfilter/vf_nlmeans.h"
>
> +void ff_compute_safe_ssd_integral_image_avx2(uint32_t *dst, ptrdiff_t dst_linesize_32,
> +                                             const uint8_t *s1, ptrdiff_t linesize1,
> +                                             const uint8_t *s2, ptrdiff_t linesize2,
> +                                             int w, int h);
> +
>  void ff_compute_weights_line_avx2(const uint32_t *const iia,
>                                    const uint32_t *const iib,
>                                    const uint32_t *const iid,
> @@ -36,7 +41,9 @@ av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp)
>  #if ARCH_X86_64
>      int cpu_flags = av_get_cpu_flags();
>
> -    if (EXTERNAL_AVX2_FAST(cpu_flags))
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> +        dsp->compute_safe_ssd_integral_image = ff_compute_safe_ssd_integral_image_avx2;
>          dsp->compute_weights_line = ff_compute_weights_line_avx2;
> +    }
>  #endif
>  }
> --
> 2.43.0
>
> _______________________________________________
> ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
> To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org



-- 
=======================================
Jun zhao/赵军
+++++++++++++++++++++++++++++++++++++++
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

     prev parent reply	other threads:[~2026-02-02  7:19 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-12  5:51 [FFmpeg-devel] " Andy Wu via ffmpeg-devel
2026-02-02  7:18 ` mypopy--- via ffmpeg-devel [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CACYjbn0m6rrQuKYX5Gud-RLgWqkGzzjumzAWu=p_zv6yaC_ekQ@mail.gmail.com' \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=lpageo@163.com \
    --cc=mypopy@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git