Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v2 1/7] x86/vvcdec: misc, reordered functions in dsp_init for improved readability
@ 2025-05-03  9:13 Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 2/7] x86/hevcdec: sao, refact out h26x macros Nuo Mi
                   ` (5 more replies)
  0 siblings, 6 replies; 10+ messages in thread
From: Nuo Mi @ 2025-05-03  9:13 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi

---
 libavcodec/x86/vvc/dsp_init.c | 48 +++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index dc833bb0f1..bb68ba0b1e 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -215,6 +215,18 @@ ALF_FUNCS(16, 12, avx2)
 
 #endif
 
+#define AVG_INIT(bd, opt) do {                                       \
+    c->inter.avg    = bf(vvc_avg, bd, opt);                          \
+    c->inter.w_avg  = bf(vvc_w_avg, bd, opt);                        \
+} while (0)
+
+#define DMVR_INIT(bd) do {                                           \
+    c->inter.dmvr[0][0]   = ff_vvc_dmvr_##bd##_avx2;                 \
+    c->inter.dmvr[0][1]   = ff_vvc_dmvr_h_##bd##_avx2;               \
+    c->inter.dmvr[1][0]   = ff_vvc_dmvr_v_##bd##_avx2;               \
+    c->inter.dmvr[1][1]   = ff_vvc_dmvr_hv_##bd##_avx2;              \
+} while (0)
+
 #define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt)                              \
     dst[C][W][idx1][idx2] = vvc_put_## name ## _ ## D ## _##opt;                   \
     dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _##opt; \
@@ -280,17 +292,8 @@ ALF_FUNCS(16, 12, avx2)
     MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bd);                          \
     MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
 
-#define AVG_INIT(bd, opt) do {                                       \
-    c->inter.avg    = bf(vvc_avg, bd, opt);                          \
-    c->inter.w_avg  = bf(vvc_w_avg, bd, opt);                        \
-} while (0)
-
-#define DMVR_INIT(bd) do {                                           \
-    c->inter.dmvr[0][0]   = ff_vvc_dmvr_##bd##_avx2;                 \
-    c->inter.dmvr[0][1]   = ff_vvc_dmvr_h_##bd##_avx2;               \
-    c->inter.dmvr[1][0]   = ff_vvc_dmvr_v_##bd##_avx2;               \
-    c->inter.dmvr[1][1]   = ff_vvc_dmvr_hv_##bd##_avx2;              \
-} while (0)
+int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 
 #define ALF_INIT(bd) do {                                            \
     c->alf.filter[LUMA]   = vvc_alf_filter_luma_##bd##_avx2;         \
@@ -298,8 +301,6 @@ ALF_FUNCS(16, 12, avx2)
     c->alf.classify       = vvc_alf_classify_##bd##_avx2;            \
 } while (0)
 
-int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
-#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 #endif
 
 
@@ -319,12 +320,15 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
 #endif
 #if HAVE_AVX2_EXTERNAL
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-            ALF_INIT(8);
+            // inter
             AVG_INIT(8, avx2);
+            DMVR_INIT(8);
             MC_LINKS_AVX2(8);
             OF_INIT(8);
-            DMVR_INIT(8);
             SAD_INIT();
+
+            // filter
+            ALF_INIT(8);
         }
 #endif
         break;
@@ -336,13 +340,16 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
 #endif
 #if HAVE_AVX2_EXTERNAL
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-            ALF_INIT(10);
+            // inter
             AVG_INIT(10, avx2);
+            DMVR_INIT(10);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
             OF_INIT(10);
-            DMVR_INIT(10);
             SAD_INIT();
+
+            // filter
+            ALF_INIT(10);
         }
 #endif
         break;
@@ -354,13 +361,16 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
 #endif
 #if HAVE_AVX2_EXTERNAL
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-            ALF_INIT(12);
+            // inter
             AVG_INIT(12, avx2);
+            DMVR_INIT(12);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
             OF_INIT(12);
-            DMVR_INIT(12);
             SAD_INIT();
+
+            // filter
+            ALF_INIT(12);
         }
 #endif
         break;
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 2/7] x86/hevcdec: sao, refact out h26x macros
  2025-05-03  9:13 [FFmpeg-devel] [PATCH v2 1/7] x86/vvcdec: misc, reordered functions in dsp_init for improved readability Nuo Mi
@ 2025-05-03  9:13 ` Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 3/7] x86/hevcdec: refact, remove duplicate code in HEVC_SAO_{BAND, EDGE}_FILTER Nuo Mi
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Nuo Mi @ 2025-05-03  9:13 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi, Shaun Loo

From: Shaun Loo <shaunloo10@gmail.com>

This is a part of Google Summer of Code 2023

Co-authored-by: Nuo Mi <nuomi2021@gmail.com>
---
 libavcodec/x86/h26x/h2656_sao.asm       | 301 ++++++++++++++++++++++++
 libavcodec/x86/h26x/h2656_sao_10bit.asm | 301 ++++++++++++++++++++++++
 libavcodec/x86/hevc/sao.asm             | 278 +---------------------
 libavcodec/x86/hevc/sao_10bit.asm       | 277 +---------------------
 4 files changed, 610 insertions(+), 547 deletions(-)
 create mode 100644 libavcodec/x86/h26x/h2656_sao.asm
 create mode 100644 libavcodec/x86/h26x/h2656_sao_10bit.asm

diff --git a/libavcodec/x86/h26x/h2656_sao.asm b/libavcodec/x86/h26x/h2656_sao.asm
new file mode 100644
index 0000000000..504fcb388b
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656_sao.asm
@@ -0,0 +1,301 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC/VVC 8bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+pb_eo:                   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pb_1
+cextern pb_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro H2656_SAO_BAND_FILTER_INIT 0
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    pxor              m0, m0
+    %assign MMSIZE mmsize
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+%macro H2656_SAO_BAND_FILTER_COMPUTE 2
+    psraw             %1, %2, 3
+%if ARCH_X86_64
+    pcmpeqw          m10, %1, m0
+    pcmpeqw          m11, %1, m1
+    pcmpeqw          m12, %1, m2
+    pcmpeqw           %1, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand              %1, m7
+    por              m10, m11
+    por              m12, %1
+    por              m10, m12
+    paddw             %2, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, %1, [rsp+MMSIZE*0]
+    pcmpeqw           m5, %1, [rsp+MMSIZE*1]
+    pcmpeqw           m6, %1, [rsp+MMSIZE*2]
+    pcmpeqw           %1, [rsp+MMSIZE*3]
+    pand              m4, [rsp+MMSIZE*4]
+    pand              m5, [rsp+MMSIZE*5]
+    pand              m6, [rsp+MMSIZE*6]
+    pand              %1, m7
+    por               m4, m5
+    por               m6, %1
+    por               m4, m6
+    paddw             %2, m4
+%endif ; ARCH
+%endmacro
+
+;void ff_{hevc, vvc}_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                                    int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro H2656_SAO_BAND_FILTER 3
+cglobal %1_sao_band_filter_%2_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    H2656_SAO_BAND_FILTER_INIT
+
+align 16
+.loop:
+%if %2 == 8
+    movq              m8, [srcq]
+    punpcklbw         m8, m14
+    H2656_SAO_BAND_FILTER_COMPUTE m9, m8
+    packuswb          m8, m14
+    movq          [dstq], m8
+%endif ; %2 == 8
+
+%assign i 0
+%rep %3
+    mova             m13, [srcq + i]
+    punpcklbw         m8, m13, m14
+    H2656_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    H2656_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%assign i i+mmsize
+%endrep
+
+%if %2 == 48
+INIT_XMM cpuname
+
+    mova             m13, [srcq + i]
+    punpcklbw         m8, m13, m14
+    H2656_SAO_BAND_FILTER_COMPUTE m9,  m8
+    punpckhbw        m13, m14
+    H2656_SAO_BAND_FILTER_COMPUTE m9, m13
+    packuswb          m8, m13
+    mova      [dstq + i], m8
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif ; %2 == 48
+
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; height-- (sets ZF for the jnz below)
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro H2656_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE
+    imul       b_strideq, EDGE_SRCSTRIDE
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+%macro H2656_SAO_EDGE_FILTER_COMPUTE 1
+    pminub            m4, m1, m2
+    pminub            m5, m1, m3
+    pcmpeqb           m2, m4
+    pcmpeqb           m3, m5
+    pcmpeqb           m4, m1
+    pcmpeqb           m5, m1
+    psubb             m4, m2
+    psubb             m5, m3
+    paddb             m4, m6
+    paddb             m4, m5
+
+    pshufb            m2, m0, m4
+%if %1 > 8
+    punpckhbw         m5, m7, m1
+    punpckhbw         m4, m2, m7
+    punpcklbw         m3, m7, m1
+    punpcklbw         m2, m7
+    pmaddubsw         m5, m4
+    pmaddubsw         m3, m2
+    packuswb          m3, m5
+%else
+    punpcklbw         m3, m7, m1
+    punpcklbw         m2, m7
+    pmaddubsw         m3, m2
+    packuswb          m3, m3
+%endif
+%endmacro
+
+;void ff_{hevc, vvc}_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                                    int eo, int width, int height);
+%macro H2656_SAO_EDGE_FILTER 3-4
+%if ARCH_X86_64
+cglobal %1_sao_edge_filter_%2_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    H2656_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+
+%else ; ARCH_X86_32
+cglobal %1_sao_edge_filter_%2_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+    H2656_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+%endif ; ARCH
+
+%if mmsize > 16
+    vbroadcasti128    m0, [offsetq]
+%else
+    movu              m0, [offsetq]
+%endif
+    mova              m1, [pb_edge_shuffle]
+    packsswb          m0, m0
+    mova              m7, [pb_1]
+    pshufb            m0, m1
+    mova              m6, [pb_2]
+%if ARCH_X86_32
+    mov          heightd, r6m
+%endif
+
+align 16
+.loop:
+
+%if %2 == 8
+    movq              m1, [srcq]
+    movq              m2, [srcq + a_strideq]
+    movq              m3, [srcq + b_strideq]
+    H2656_SAO_EDGE_FILTER_COMPUTE %2
+    movq          [dstq], m3
+%endif
+
+%assign i 0
+%rep %3
+    mova              m1, [srcq + i]
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    H2656_SAO_EDGE_FILTER_COMPUTE %2
+    mov%4     [dstq + i], m3
+%assign i i+mmsize
+%endrep
+
+%if %2 == 48
+INIT_XMM cpuname
+
+    mova              m1, [srcq + i]
+    movu              m2, [srcq + a_strideq + i]
+    movu              m3, [srcq + b_strideq + i]
+    H2656_SAO_EDGE_FILTER_COMPUTE %2
+    mova      [dstq + i], m3
+%if cpuflag(avx2)
+INIT_YMM cpuname
+%endif
+%endif
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
diff --git a/libavcodec/x86/h26x/h2656_sao_10bit.asm b/libavcodec/x86/h26x/h2656_sao_10bit.asm
new file mode 100644
index 0000000000..052f2b1d16
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656_sao_10bit.asm
@@ -0,0 +1,301 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for HEVC/VVC 10/12bit decoding
+;*
+;* Copyright (c) 2013 Pierre-Edouard LEPERE
+;* Copyright (c) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m2:     times 16 dw -2
+pw_mask10: times 16 dw 0x03FF
+pw_mask12: times 16 dw 0x0FFF
+pb_eo:              db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
+cextern pw_m1
+cextern pw_1
+cextern pw_2
+
+SECTION .text
+
+;******************************************************************************
+;SAO Band Filter
+;******************************************************************************
+
+%macro H2656_SAO_BAND_FILTER_INIT 1
+    and            leftq, 31
+    movd             xm0, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm1, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm2, leftd
+    add            leftq, 1
+    and            leftq, 31
+    movd             xm3, leftd
+
+    SPLATW            m0, xm0
+    SPLATW            m1, xm1
+    SPLATW            m2, xm2
+    SPLATW            m3, xm3
+%if mmsize > 16
+    SPLATW            m4, [offsetq + 2]
+    SPLATW            m5, [offsetq + 4]
+    SPLATW            m6, [offsetq + 6]
+    SPLATW            m7, [offsetq + 8]
+%else
+    movq              m7, [offsetq + 2]
+    SPLATW            m4, m7, 0
+    SPLATW            m5, m7, 1
+    SPLATW            m6, m7, 2
+    SPLATW            m7, m7, 3
+%endif
+
+%if ARCH_X86_64
+    mova             m13, [pw_mask %+ %1]
+    pxor             m14, m14
+
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    mova              m1, [pw_mask %+ %1]
+    pxor              m0, m0
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
+DEFINE_ARGS dst, src, dststride, srcstride, offset, height
+    mov          heightd, r7m
+%endmacro
+
+;void ff_{hevc, vvc}_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+;                                                          int16_t *sao_offset_val, int sao_left_class, int width, int height);
+%macro H2656_SAO_BAND_FILTER 4
+cglobal %1_sao_band_filter_%3_%2, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
+    H2656_SAO_BAND_FILTER_INIT %2
+
+align 16
+.loop:
+
+%assign i 0
+%assign j 0
+%rep %4
+%assign k 8+(j&1)
+%assign l 9-(j&1)
+    mova          m %+ k, [srcq + i]
+    psraw         m %+ l, m %+ k, %2-5
+%if ARCH_X86_64
+    pcmpeqw          m10, m %+ l, m0
+    pcmpeqw          m11, m %+ l, m1
+    pcmpeqw          m12, m %+ l, m2
+    pcmpeqw       m %+ l, m3
+    pand             m10, m4
+    pand             m11, m5
+    pand             m12, m6
+    pand          m %+ l, m7
+    por              m10, m11
+    por              m12, m %+ l
+    por              m10, m12
+    paddw         m %+ k, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, m %+ l, [rsp+mmsize*0]
+    pcmpeqw           m5, m %+ l, [rsp+mmsize*1]
+    pcmpeqw           m6, m %+ l, [rsp+mmsize*2]
+    pcmpeqw       m %+ l, [rsp+mmsize*3]
+    pand              m4, [rsp+mmsize*4]
+    pand              m5, [rsp+mmsize*5]
+    pand              m6, [rsp+mmsize*6]
+    pand          m %+ l, m7
+    por               m4, m5
+    por               m6, m %+ l
+    por               m4, m6
+    paddw         m %+ k, m4
+%endif ; ARCH
+    CLIPW             m %+ k, m14, m13
+    mova      [dstq + i], m %+ k
+%assign i i+mmsize
+%assign j j+1
+%endrep
+
+    add             dstq, dststrideq
+    add             srcq, srcstrideq
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
+
+;******************************************************************************
+;SAO Edge Filter
+;******************************************************************************
+
+%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
+%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+
+%macro PMINUW 4
+%if cpuflag(sse4)
+    pminuw            %1, %2, %3
+%else
+    psubusw           %4, %2, %3
+    psubw             %1, %2, %4
+%endif
+%endmacro
+
+%macro H2656_SAO_EDGE_FILTER_INIT 0
+%if WIN64
+    movsxd           eoq, dword eom
+%elif ARCH_X86_64
+    movsxd           eoq, eod
+%else
+    mov              eoq, r4m
+%endif
+    lea            tmp2q, [pb_eo]
+    movsx      a_strideq, byte [tmp2q+eoq*4+1]
+    movsx      b_strideq, byte [tmp2q+eoq*4+3]
+    imul       a_strideq, EDGE_SRCSTRIDE >> 1
+    imul       b_strideq, EDGE_SRCSTRIDE >> 1
+    movsx           tmpq, byte [tmp2q+eoq*4]
+    add        a_strideq, tmpq
+    movsx           tmpq, byte [tmp2q+eoq*4+2]
+    add        b_strideq, tmpq
+%endmacro
+
+;void ff_{hevc, vvc}_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
+;                                                          int eo, int width, int height);
+%macro H2656_SAO_EDGE_FILTER 4
+%if ARCH_X86_64
+cglobal %1_sao_edge_filter_%3_%2, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
+%define tmp2q heightq
+    H2656_SAO_EDGE_FILTER_INIT
+    mov          heightd, r6m
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%else ; ARCH_X86_32
+cglobal %1_sao_edge_filter_%3_%2, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
+%define eoq   srcq
+%define tmpq  heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+%define m8 m1
+%define m9 m2
+%define m10 m3
+%define m11 m4
+%define m12 m5
+    H2656_SAO_EDGE_FILTER_INIT
+    mov             srcq, srcm
+    mov          offsetq, r3m
+    mov       dststrideq, dststridem
+    add        a_strideq, a_strideq
+    add        b_strideq, b_strideq
+
+%endif ; ARCH
+
+%if mmsize > 16
+    SPLATW            m8, [offsetq+2]
+    SPLATW            m9, [offsetq+4]
+    SPLATW           m10, [offsetq+0]
+    SPLATW           m11, [offsetq+6]
+    SPLATW           m12, [offsetq+8]
+%else
+    movq             m10, [offsetq+0]
+    movd             m12, [offsetq+6]
+    SPLATW            m8, xm10, 1
+    SPLATW            m9, xm10, 2
+    SPLATW           m10, xm10, 0
+    SPLATW           m11, xm12, 0
+    SPLATW           m12, xm12, 1
+%endif
+    pxor              m0, m0
+%if ARCH_X86_64
+    mova             m13, [pw_m1]
+    mova             m14, [pw_1]
+    mova             m15, [pw_2]
+%else
+    mov          heightd, r6m
+    mova  [rsp+mmsize*0], m8
+    mova  [rsp+mmsize*1], m9
+    mova  [rsp+mmsize*2], m10
+    mova  [rsp+mmsize*3], m11
+    mova  [rsp+mmsize*4], m12
+%endif
+
+align 16
+.loop:
+
+%assign i 0
+%rep %4
+    mova              m1, [srcq + i]
+    movu              m2, [srcq+a_strideq + i]
+    movu              m3, [srcq+b_strideq + i]
+    PMINUW            m4, m1, m2, m6
+    PMINUW            m5, m1, m3, m7
+    pcmpeqw           m2, m4
+    pcmpeqw           m3, m5
+    pcmpeqw           m4, m1
+    pcmpeqw           m5, m1
+    psubw             m4, m2
+    psubw             m5, m3
+
+    paddw             m4, m5
+    pcmpeqw           m2, m4, [pw_m2]
+%if ARCH_X86_64
+    pcmpeqw           m3, m4, m13
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, m14
+    pcmpeqw           m7, m4, m15
+    pand              m2, m8
+    pand              m3, m9
+    pand              m5, m10
+    pand              m6, m11
+    pand              m7, m12
+%else
+    pcmpeqw           m3, m4, [pw_m1]
+    pcmpeqw           m5, m4, m0
+    pcmpeqw           m6, m4, [pw_1]
+    pcmpeqw           m7, m4, [pw_2]
+    pand              m2, [rsp+mmsize*0]
+    pand              m3, [rsp+mmsize*1]
+    pand              m5, [rsp+mmsize*2]
+    pand              m6, [rsp+mmsize*3]
+    pand              m7, [rsp+mmsize*4]
+%endif
+    paddw             m2, m3
+    paddw             m5, m6
+    paddw             m2, m7
+    paddw             m2, m1
+    paddw             m2, m5
+    CLIPW             m2, m0, [pw_mask %+ %2]
+    mova      [dstq + i], m2
+%assign i i+mmsize
+%endrep
+
+    add             dstq, dststrideq
+    add             srcq, EDGE_SRCSTRIDE
+    dec          heightd
+    jg .loop
+    RET
+%endmacro
diff --git a/libavcodec/x86/hevc/sao.asm b/libavcodec/x86/hevc/sao.asm
index 8abb16150d..c4f6db4cd5 100644
--- a/libavcodec/x86/hevc/sao.asm
+++ b/libavcodec/x86/hevc/sao.asm
@@ -21,155 +21,13 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA 32
-
-pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-pb_eo:                   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
-cextern pb_1
-cextern pb_2
-
-SECTION .text
-
-;******************************************************************************
-;SAO Band Filter
-;******************************************************************************
-
-%macro HEVC_SAO_BAND_FILTER_INIT 0
-    and            leftq, 31
-    movd             xm0, leftd
-    add            leftq, 1
-    and            leftq, 31
-    movd             xm1, leftd
-    add            leftq, 1
-    and            leftq, 31
-    movd             xm2, leftd
-    add            leftq, 1
-    and            leftq, 31
-    movd             xm3, leftd
-
-    SPLATW            m0, xm0
-    SPLATW            m1, xm1
-    SPLATW            m2, xm2
-    SPLATW            m3, xm3
-%if mmsize > 16
-    SPLATW            m4, [offsetq + 2]
-    SPLATW            m5, [offsetq + 4]
-    SPLATW            m6, [offsetq + 6]
-    SPLATW            m7, [offsetq + 8]
-%else
-    movq              m7, [offsetq + 2]
-    SPLATW            m4, m7, 0
-    SPLATW            m5, m7, 1
-    SPLATW            m6, m7, 2
-    SPLATW            m7, m7, 3
-%endif
-
-%if ARCH_X86_64
-    pxor             m14, m14
-
-%else ; ARCH_X86_32
-    mova  [rsp+mmsize*0], m0
-    mova  [rsp+mmsize*1], m1
-    mova  [rsp+mmsize*2], m2
-    mova  [rsp+mmsize*3], m3
-    mova  [rsp+mmsize*4], m4
-    mova  [rsp+mmsize*5], m5
-    mova  [rsp+mmsize*6], m6
-    pxor              m0, m0
-    %assign MMSIZE mmsize
-    %define m14 m0
-    %define m13 m1
-    %define  m9 m2
-    %define  m8 m3
-%endif ; ARCH
-DEFINE_ARGS dst, src, dststride, srcstride, offset, height
-    mov          heightd, r7m
-%endmacro
-
-%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
-    psraw             %1, %2, 3
-%if ARCH_X86_64
-    pcmpeqw          m10, %1, m0
-    pcmpeqw          m11, %1, m1
-    pcmpeqw          m12, %1, m2
-    pcmpeqw           %1, m3
-    pand             m10, m4
-    pand             m11, m5
-    pand             m12, m6
-    pand              %1, m7
-    por              m10, m11
-    por              m12, %1
-    por              m10, m12
-    paddw             %2, m10
-%else ; ARCH_X86_32
-    pcmpeqw           m4, %1, [rsp+MMSIZE*0]
-    pcmpeqw           m5, %1, [rsp+MMSIZE*1]
-    pcmpeqw           m6, %1, [rsp+MMSIZE*2]
-    pcmpeqw           %1, [rsp+MMSIZE*3]
-    pand              m4, [rsp+MMSIZE*4]
-    pand              m5, [rsp+MMSIZE*5]
-    pand              m6, [rsp+MMSIZE*6]
-    pand              %1, m7
-    por               m4, m5
-    por               m6, %1
-    por               m4, m6
-    paddw             %2, m4
-%endif ; ARCH
-%endmacro
+%define MAX_PB_SIZE  64
+%include "libavcodec/x86/h26x/h2656_sao.asm"
 
-;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
 %macro HEVC_SAO_BAND_FILTER 2
-cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
-    HEVC_SAO_BAND_FILTER_INIT
-
-align 16
-.loop:
-%if %1 == 8
-    movq              m8, [srcq]
-    punpcklbw         m8, m14
-    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
-    packuswb          m8, m14
-    movq          [dstq], m8
-%endif ; %1 == 8
-
-%assign i 0
-%rep %2
-    mova             m13, [srcq + i]
-    punpcklbw         m8, m13, m14
-    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
-    punpckhbw        m13, m14
-    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
-    packuswb          m8, m13
-    mova      [dstq + i], m8
-%assign i i+mmsize
-%endrep
-
-%if %1 == 48
-INIT_XMM cpuname
-
-    mova             m13, [srcq + i]
-    punpcklbw         m8, m13, m14
-    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
-    punpckhbw        m13, m14
-    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
-    packuswb          m8, m13
-    mova      [dstq + i], m8
-%if cpuflag(avx2)
-INIT_YMM cpuname
-%endif
-%endif ; %1 == 48
-
-    add             dstq, dststrideq             ; dst += dststride
-    add             srcq, srcstrideq             ; src += srcstride
-    dec          heightd                         ; cmp height
-    jnz               .loop                      ; height loop
-    RET
+    H2656_SAO_BAND_FILTER hevc, %1, %2
 %endmacro
 
-
 %macro HEVC_SAO_BAND_FILTER_FUNCS 0
 HEVC_SAO_BAND_FILTER  8, 0
 HEVC_SAO_BAND_FILTER 16, 1
@@ -193,136 +51,8 @@ HEVC_SAO_BAND_FILTER 48, 1
 HEVC_SAO_BAND_FILTER 64, 2
 %endif
 
-;******************************************************************************
-;SAO Edge Filter
-;******************************************************************************
-
-%define MAX_PB_SIZE  64
-%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
-%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
-
-%macro HEVC_SAO_EDGE_FILTER_INIT 0
-%if WIN64
-    movsxd           eoq, dword eom
-%elif ARCH_X86_64
-    movsxd           eoq, eod
-%else
-    mov              eoq, r4m
-%endif
-    lea            tmp2q, [pb_eo]
-    movsx      a_strideq, byte [tmp2q+eoq*4+1]
-    movsx      b_strideq, byte [tmp2q+eoq*4+3]
-    imul       a_strideq, EDGE_SRCSTRIDE
-    imul       b_strideq, EDGE_SRCSTRIDE
-    movsx           tmpq, byte [tmp2q+eoq*4]
-    add        a_strideq, tmpq
-    movsx           tmpq, byte [tmp2q+eoq*4+2]
-    add        b_strideq, tmpq
-%endmacro
-
-%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
-    pminub            m4, m1, m2
-    pminub            m5, m1, m3
-    pcmpeqb           m2, m4
-    pcmpeqb           m3, m5
-    pcmpeqb           m4, m1
-    pcmpeqb           m5, m1
-    psubb             m4, m2
-    psubb             m5, m3
-    paddb             m4, m6
-    paddb             m4, m5
-
-    pshufb            m2, m0, m4
-%if %1 > 8
-    punpckhbw         m5, m7, m1
-    punpckhbw         m4, m2, m7
-    punpcklbw         m3, m7, m1
-    punpcklbw         m2, m7
-    pmaddubsw         m5, m4
-    pmaddubsw         m3, m2
-    packuswb          m3, m5
-%else
-    punpcklbw         m3, m7, m1
-    punpcklbw         m2, m7
-    pmaddubsw         m3, m2
-    packuswb          m3, m3
-%endif
-%endmacro
-
-;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
-;                                             int eo, int width, int height);
 %macro HEVC_SAO_EDGE_FILTER 2-3
-%if ARCH_X86_64
-cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
-%define tmp2q heightq
-    HEVC_SAO_EDGE_FILTER_INIT
-    mov          heightd, r6m
-
-%else ; ARCH_X86_32
-cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
-%define eoq   srcq
-%define tmpq  heightq
-%define tmp2q dststrideq
-%define offsetq heightq
-    HEVC_SAO_EDGE_FILTER_INIT
-    mov             srcq, srcm
-    mov          offsetq, r3m
-    mov       dststrideq, dststridem
-%endif ; ARCH
-
-%if mmsize > 16
-    vbroadcasti128    m0, [offsetq]
-%else
-    movu              m0, [offsetq]
-%endif
-    mova              m1, [pb_edge_shuffle]
-    packsswb          m0, m0
-    mova              m7, [pb_1]
-    pshufb            m0, m1
-    mova              m6, [pb_2]
-%if ARCH_X86_32
-    mov          heightd, r6m
-%endif
-
-align 16
-.loop:
-
-%if %1 == 8
-    movq              m1, [srcq]
-    movq              m2, [srcq + a_strideq]
-    movq              m3, [srcq + b_strideq]
-    HEVC_SAO_EDGE_FILTER_COMPUTE %1
-    movq          [dstq], m3
-%endif
-
-%assign i 0
-%rep %2
-    mova              m1, [srcq + i]
-    movu              m2, [srcq + a_strideq + i]
-    movu              m3, [srcq + b_strideq + i]
-    HEVC_SAO_EDGE_FILTER_COMPUTE %1
-    mov%3     [dstq + i], m3
-%assign i i+mmsize
-%endrep
-
-%if %1 == 48
-INIT_XMM cpuname
-
-    mova              m1, [srcq + i]
-    movu              m2, [srcq + a_strideq + i]
-    movu              m3, [srcq + b_strideq + i]
-    HEVC_SAO_EDGE_FILTER_COMPUTE %1
-    mova      [dstq + i], m3
-%if cpuflag(avx2)
-INIT_YMM cpuname
-%endif
-%endif
-
-    add             dstq, dststrideq
-    add             srcq, EDGE_SRCSTRIDE
-    dec          heightd
-    jg .loop
-    RET
+    H2656_SAO_EDGE_FILTER hevc, %{1:-1}
 %endmacro
 
 INIT_XMM ssse3
diff --git a/libavcodec/x86/hevc/sao_10bit.asm b/libavcodec/x86/hevc/sao_10bit.asm
index 0daa9c645c..eeb2b2fe59 100644
--- a/libavcodec/x86/hevc/sao_10bit.asm
+++ b/libavcodec/x86/hevc/sao_10bit.asm
@@ -21,131 +21,11 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "libavutil/x86/x86util.asm"
-
-SECTION_RODATA 32
-
-pw_m2:     times 16 dw -2
-pw_mask10: times 16 dw 0x03FF
-pw_mask12: times 16 dw 0x0FFF
-pb_eo:              db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
-cextern pw_m1
-cextern pw_1
-cextern pw_2
-
-SECTION .text
-
-;******************************************************************************
-;SAO Band Filter
-;******************************************************************************
-
-%macro HEVC_SAO_BAND_FILTER_INIT 1
-    and            leftq, 31
-    movd             xm0, leftd
-    add            leftq, 1
-    and            leftq, 31
-    movd             xm1, leftd
-    add            leftq, 1
-    and            leftq, 31
-    movd             xm2, leftd
-    add            leftq, 1
-    and            leftq, 31
-    movd             xm3, leftd
-
-    SPLATW            m0, xm0
-    SPLATW            m1, xm1
-    SPLATW            m2, xm2
-    SPLATW            m3, xm3
-%if mmsize > 16
-    SPLATW            m4, [offsetq + 2]
-    SPLATW            m5, [offsetq + 4]
-    SPLATW            m6, [offsetq + 6]
-    SPLATW            m7, [offsetq + 8]
-%else
-    movq              m7, [offsetq + 2]
-    SPLATW            m4, m7, 0
-    SPLATW            m5, m7, 1
-    SPLATW            m6, m7, 2
-    SPLATW            m7, m7, 3
-%endif
-
-%if ARCH_X86_64
-    mova             m13, [pw_mask %+ %1]
-    pxor             m14, m14
-
-%else ; ARCH_X86_32
-    mova  [rsp+mmsize*0], m0
-    mova  [rsp+mmsize*1], m1
-    mova  [rsp+mmsize*2], m2
-    mova  [rsp+mmsize*3], m3
-    mova  [rsp+mmsize*4], m4
-    mova  [rsp+mmsize*5], m5
-    mova  [rsp+mmsize*6], m6
-    mova              m1, [pw_mask %+ %1]
-    pxor              m0, m0
-    %define m14 m0
-    %define m13 m1
-    %define  m9 m2
-    %define  m8 m3
-%endif ; ARCH
-DEFINE_ARGS dst, src, dststride, srcstride, offset, height
-    mov          heightd, r7m
-%endmacro
+%define MAX_PB_SIZE  64
+%include "libavcodec/x86/h26x/h2656_sao_10bit.asm"
 
-;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
 %macro HEVC_SAO_BAND_FILTER 3
-cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
-    HEVC_SAO_BAND_FILTER_INIT %1
-
-align 16
-.loop:
-
-%assign i 0
-%assign j 0
-%rep %3
-%assign k 8+(j&1)
-%assign l 9-(j&1)
-    mova          m %+ k, [srcq + i]
-    psraw         m %+ l, m %+ k, %1-5
-%if ARCH_X86_64
-    pcmpeqw          m10, m %+ l, m0
-    pcmpeqw          m11, m %+ l, m1
-    pcmpeqw          m12, m %+ l, m2
-    pcmpeqw       m %+ l, m3
-    pand             m10, m4
-    pand             m11, m5
-    pand             m12, m6
-    pand          m %+ l, m7
-    por              m10, m11
-    por              m12, m %+ l
-    por              m10, m12
-    paddw         m %+ k, m10
-%else ; ARCH_X86_32
-    pcmpeqw           m4, m %+ l, [rsp+mmsize*0]
-    pcmpeqw           m5, m %+ l, [rsp+mmsize*1]
-    pcmpeqw           m6, m %+ l, [rsp+mmsize*2]
-    pcmpeqw       m %+ l, [rsp+mmsize*3]
-    pand              m4, [rsp+mmsize*4]
-    pand              m5, [rsp+mmsize*5]
-    pand              m6, [rsp+mmsize*6]
-    pand          m %+ l, m7
-    por               m4, m5
-    por               m6, m %+ l
-    por               m4, m6
-    paddw         m %+ k, m4
-%endif ; ARCH
-    CLIPW             m %+ k, m14, m13
-    mova      [dstq + i], m %+ k
-%assign i i+mmsize
-%assign j j+1
-%endrep
-
-    add             dstq, dststrideq
-    add             srcq, srcstrideq
-    dec          heightd
-    jg .loop
-    RET
+    H2656_SAO_BAND_FILTER hevc, %1, %2, %3
 %endmacro
 
 %macro HEVC_SAO_BAND_FILTER_FUNCS 0
@@ -185,157 +65,8 @@ HEVC_SAO_BAND_FILTER 12, 48, 3
 HEVC_SAO_BAND_FILTER 12, 64, 4
 %endif
 
-;******************************************************************************
-;SAO Edge Filter
-;******************************************************************************
-
-%define MAX_PB_SIZE  64
-%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
-%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
-
-%macro PMINUW 4
-%if cpuflag(sse4)
-    pminuw            %1, %2, %3
-%else
-    psubusw           %4, %2, %3
-    psubw             %1, %2, %4
-%endif
-%endmacro
-
-%macro HEVC_SAO_EDGE_FILTER_INIT 0
-%if WIN64
-    movsxd           eoq, dword eom
-%elif ARCH_X86_64
-    movsxd           eoq, eod
-%else
-    mov              eoq, r4m
-%endif
-    lea            tmp2q, [pb_eo]
-    movsx      a_strideq, byte [tmp2q+eoq*4+1]
-    movsx      b_strideq, byte [tmp2q+eoq*4+3]
-    imul       a_strideq, EDGE_SRCSTRIDE >> 1
-    imul       b_strideq, EDGE_SRCSTRIDE >> 1
-    movsx           tmpq, byte [tmp2q+eoq*4]
-    add        a_strideq, tmpq
-    movsx           tmpq, byte [tmp2q+eoq*4+2]
-    add        b_strideq, tmpq
-%endmacro
-
-;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
-;                                                   int eo, int width, int height);
 %macro HEVC_SAO_EDGE_FILTER 3
-%if ARCH_X86_64
-cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
-%define tmp2q heightq
-    HEVC_SAO_EDGE_FILTER_INIT
-    mov          heightd, r6m
-    add        a_strideq, a_strideq
-    add        b_strideq, b_strideq
-
-%else ; ARCH_X86_32
-cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
-%define eoq   srcq
-%define tmpq  heightq
-%define tmp2q dststrideq
-%define offsetq heightq
-%define m8 m1
-%define m9 m2
-%define m10 m3
-%define m11 m4
-%define m12 m5
-    HEVC_SAO_EDGE_FILTER_INIT
-    mov             srcq, srcm
-    mov          offsetq, r3m
-    mov       dststrideq, dststridem
-    add        a_strideq, a_strideq
-    add        b_strideq, b_strideq
-
-%endif ; ARCH
-
-%if mmsize > 16
-    SPLATW            m8, [offsetq+2]
-    SPLATW            m9, [offsetq+4]
-    SPLATW           m10, [offsetq+0]
-    SPLATW           m11, [offsetq+6]
-    SPLATW           m12, [offsetq+8]
-%else
-    movq             m10, [offsetq+0]
-    movd             m12, [offsetq+6]
-    SPLATW            m8, xm10, 1
-    SPLATW            m9, xm10, 2
-    SPLATW           m10, xm10, 0
-    SPLATW           m11, xm12, 0
-    SPLATW           m12, xm12, 1
-%endif
-    pxor              m0, m0
-%if ARCH_X86_64
-    mova             m13, [pw_m1]
-    mova             m14, [pw_1]
-    mova             m15, [pw_2]
-%else
-    mov          heightd, r6m
-    mova  [rsp+mmsize*0], m8
-    mova  [rsp+mmsize*1], m9
-    mova  [rsp+mmsize*2], m10
-    mova  [rsp+mmsize*3], m11
-    mova  [rsp+mmsize*4], m12
-%endif
-
-align 16
-.loop:
-
-%assign i 0
-%rep %3
-    mova              m1, [srcq + i]
-    movu              m2, [srcq+a_strideq + i]
-    movu              m3, [srcq+b_strideq + i]
-    PMINUW            m4, m1, m2, m6
-    PMINUW            m5, m1, m3, m7
-    pcmpeqw           m2, m4
-    pcmpeqw           m3, m5
-    pcmpeqw           m4, m1
-    pcmpeqw           m5, m1
-    psubw             m4, m2
-    psubw             m5, m3
-
-    paddw             m4, m5
-    pcmpeqw           m2, m4, [pw_m2]
-%if ARCH_X86_64
-    pcmpeqw           m3, m4, m13
-    pcmpeqw           m5, m4, m0
-    pcmpeqw           m6, m4, m14
-    pcmpeqw           m7, m4, m15
-    pand              m2, m8
-    pand              m3, m9
-    pand              m5, m10
-    pand              m6, m11
-    pand              m7, m12
-%else
-    pcmpeqw           m3, m4, [pw_m1]
-    pcmpeqw           m5, m4, m0
-    pcmpeqw           m6, m4, [pw_1]
-    pcmpeqw           m7, m4, [pw_2]
-    pand              m2, [rsp+mmsize*0]
-    pand              m3, [rsp+mmsize*1]
-    pand              m5, [rsp+mmsize*2]
-    pand              m6, [rsp+mmsize*3]
-    pand              m7, [rsp+mmsize*4]
-%endif
-    paddw             m2, m3
-    paddw             m5, m6
-    paddw             m2, m7
-    paddw             m2, m1
-    paddw             m2, m5
-    CLIPW             m2, m0, [pw_mask %+ %1]
-    mova      [dstq + i], m2
-%assign i i+mmsize
-%endrep
-
-    add             dstq, dststrideq
-    add             srcq, EDGE_SRCSTRIDE
-    dec          heightd
-    jg .loop
-    RET
+    H2656_SAO_EDGE_FILTER hevc, %1, %2, %3
 %endmacro
 
 INIT_XMM sse2
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 3/7] x86/hevcdec: refact, remove duplicate code in HEVC_SAO_{BAND, EDGE}_FILTER
  2025-05-03  9:13 [FFmpeg-devel] [PATCH v2 1/7] x86/vvcdec: misc, reordered functions in dsp_init for improved readability Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 2/7] x86/hevcdec: sao, refact out h26x macros Nuo Mi
@ 2025-05-03  9:13 ` Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 4/7] x86/vvcdec: sao, add avx2 support Nuo Mi
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Nuo Mi @ 2025-05-03  9:13 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi, Shaun Loo

From: Shaun Loo <shaunloo10@gmail.com>

This is a part of Google Summer of Code 2023

Co-authored-by: Nuo Mi <nuomi2021@gmail.com>
---
 libavcodec/x86/hevc/sao_10bit.asm | 100 ++++++++++++++----------------
 1 file changed, 48 insertions(+), 52 deletions(-)

diff --git a/libavcodec/x86/hevc/sao_10bit.asm b/libavcodec/x86/hevc/sao_10bit.asm
index eeb2b2fe59..0320efd758 100644
--- a/libavcodec/x86/hevc/sao_10bit.asm
+++ b/libavcodec/x86/hevc/sao_10bit.asm
@@ -28,18 +28,17 @@
     H2656_SAO_BAND_FILTER hevc, %1, %2, %3
 %endmacro
 
+%macro HEVC_SAO_BAND_FILTER_FUNCS 1 ; %1 = bit depth (the zero-argument overload expands this with 10 and 12); emits widths 8..64
+    HEVC_SAO_BAND_FILTER %1,  8, 1
+    HEVC_SAO_BAND_FILTER %1, 16, 2
+    HEVC_SAO_BAND_FILTER %1, 32, 4
+    HEVC_SAO_BAND_FILTER %1, 48, 6
+    HEVC_SAO_BAND_FILTER %1, 64, 8
+%endmacro ; third argument is width/8 - presumably the per-row %rep count for 16-byte registers; confirm in h2656_sao_10bit.asm
+
 %macro HEVC_SAO_BAND_FILTER_FUNCS 0
-HEVC_SAO_BAND_FILTER 10,  8, 1
-HEVC_SAO_BAND_FILTER 10, 16, 2
-HEVC_SAO_BAND_FILTER 10, 32, 4
-HEVC_SAO_BAND_FILTER 10, 48, 6
-HEVC_SAO_BAND_FILTER 10, 64, 8
-
-HEVC_SAO_BAND_FILTER 12,  8, 1
-HEVC_SAO_BAND_FILTER 12, 16, 2
-HEVC_SAO_BAND_FILTER 12, 32, 4
-HEVC_SAO_BAND_FILTER 12, 48, 6
-HEVC_SAO_BAND_FILTER 12, 64, 8
+    HEVC_SAO_BAND_FILTER_FUNCS 10
+    HEVC_SAO_BAND_FILTER_FUNCS 12
 %endmacro
 
 INIT_XMM sse2
@@ -48,54 +47,51 @@ INIT_XMM avx
 HEVC_SAO_BAND_FILTER_FUNCS
 
 %if HAVE_AVX2_EXTERNAL
-INIT_XMM avx2
-HEVC_SAO_BAND_FILTER 10,  8, 1
-INIT_YMM avx2
-HEVC_SAO_BAND_FILTER 10, 16, 1
-HEVC_SAO_BAND_FILTER 10, 32, 2
-HEVC_SAO_BAND_FILTER 10, 48, 3
-HEVC_SAO_BAND_FILTER 10, 64, 4
-
-INIT_XMM avx2
-HEVC_SAO_BAND_FILTER 12,  8, 1
-INIT_YMM avx2
-HEVC_SAO_BAND_FILTER 12, 16, 1
-HEVC_SAO_BAND_FILTER 12, 32, 2
-HEVC_SAO_BAND_FILTER 12, 48, 3
-HEVC_SAO_BAND_FILTER 12, 64, 4
+
+%macro HEVC_SAO_BAND_FILTER_FUNCS_AVX2 1 ; %1 = bit depth; width 8 is emitted under INIT_XMM, widths 16..64 under INIT_YMM
+    INIT_XMM avx2
+    HEVC_SAO_BAND_FILTER %1,  8, 1
+    INIT_YMM avx2
+    HEVC_SAO_BAND_FILTER %1, 16, 1
+    HEVC_SAO_BAND_FILTER %1, 32, 2
+    HEVC_SAO_BAND_FILTER %1, 48, 3
+    HEVC_SAO_BAND_FILTER %1, 64, 4
+%endmacro ; for the INIT_YMM entries the third argument is width/16
+
+    HEVC_SAO_BAND_FILTER_FUNCS_AVX2 10
+    HEVC_SAO_BAND_FILTER_FUNCS_AVX2 12
+
 %endif
 
 %macro HEVC_SAO_EDGE_FILTER 3
     H2656_SAO_EDGE_FILTER hevc, %1, %2, %3
 %endmacro
 
+%macro HEVC_SAO_EDGE_FILTER_FUNCS 1 ; %1 = bit depth; emits edge filters for widths 8..64 with the instruction set selected by the caller
+    HEVC_SAO_EDGE_FILTER %1,  8, 1
+    HEVC_SAO_EDGE_FILTER %1, 16, 2
+    HEVC_SAO_EDGE_FILTER %1, 32, 4
+    HEVC_SAO_EDGE_FILTER %1, 48, 6
+    HEVC_SAO_EDGE_FILTER %1, 64, 8
+%endmacro ; third argument is width/8 - presumably the per-row %rep count for 16-byte registers; confirm in h2656_sao_10bit.asm
+
 INIT_XMM sse2
-HEVC_SAO_EDGE_FILTER 10,  8, 1
-HEVC_SAO_EDGE_FILTER 10, 16, 2
-HEVC_SAO_EDGE_FILTER 10, 32, 4
-HEVC_SAO_EDGE_FILTER 10, 48, 6
-HEVC_SAO_EDGE_FILTER 10, 64, 8
-
-HEVC_SAO_EDGE_FILTER 12,  8, 1
-HEVC_SAO_EDGE_FILTER 12, 16, 2
-HEVC_SAO_EDGE_FILTER 12, 32, 4
-HEVC_SAO_EDGE_FILTER 12, 48, 6
-HEVC_SAO_EDGE_FILTER 12, 64, 8
+HEVC_SAO_EDGE_FILTER_FUNCS 10
+HEVC_SAO_EDGE_FILTER_FUNCS 12
 
 %if HAVE_AVX2_EXTERNAL
-INIT_XMM avx2
-HEVC_SAO_EDGE_FILTER 10,  8, 1
-INIT_YMM avx2
-HEVC_SAO_EDGE_FILTER 10, 16, 1
-HEVC_SAO_EDGE_FILTER 10, 32, 2
-HEVC_SAO_EDGE_FILTER 10, 48, 3
-HEVC_SAO_EDGE_FILTER 10, 64, 4
-
-INIT_XMM avx2
-HEVC_SAO_EDGE_FILTER 12,  8, 1
-INIT_YMM avx2
-HEVC_SAO_EDGE_FILTER 12, 16, 1
-HEVC_SAO_EDGE_FILTER 12, 32, 2
-HEVC_SAO_EDGE_FILTER 12, 48, 3
-HEVC_SAO_EDGE_FILTER 12, 64, 4
+
+%macro HEVC_SAO_EDGE_FILTER_FUNCS_AVX2 1 ; %1 = bit depth; width 8 is emitted under INIT_XMM, widths 16..64 under INIT_YMM
+    INIT_XMM avx2
+    HEVC_SAO_EDGE_FILTER %1,  8, 1
+    INIT_YMM avx2
+    HEVC_SAO_EDGE_FILTER %1, 16, 1
+    HEVC_SAO_EDGE_FILTER %1, 32, 2
+    HEVC_SAO_EDGE_FILTER %1, 48, 3
+    HEVC_SAO_EDGE_FILTER %1, 64, 4
+%endmacro ; for the INIT_YMM entries the third argument is width/16
+
+HEVC_SAO_EDGE_FILTER_FUNCS_AVX2 10
+HEVC_SAO_EDGE_FILTER_FUNCS_AVX2 12
+
 %endif
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 4/7] x86/vvcdec: sao, add avx2 support
  2025-05-03  9:13 [FFmpeg-devel] [PATCH v2 1/7] x86/vvcdec: misc, reordered functions in dsp_init for improved readability Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 2/7] x86/hevcdec: sao, refact out h26x macros Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 3/7] x86/hevcdec: refact, remove duplicate code in HEVC_SAO_{BAND, EDGE}_FILTER Nuo Mi
@ 2025-05-03  9:13 ` Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 5/7] checkasm: add vvc_sao Nuo Mi
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Nuo Mi @ 2025-05-03  9:13 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi, Shaun Loo

From: Shaun Loo <shaunloo10@gmail.com>

This is a part of Google Summer of Code 2023

Co-authored-by: Nuo Mi <nuomi2021@gmail.com>
---
 libavcodec/x86/h26x/h2656_sao.asm |   8 +--
 libavcodec/x86/vvc/Makefile       |   2 +
 libavcodec/x86/vvc/dsp_init.c     |  41 +++++++++++
 libavcodec/x86/vvc/sao.asm        |  73 +++++++++++++++++++
 libavcodec/x86/vvc/sao_10bit.asm  | 113 ++++++++++++++++++++++++++++++
 5 files changed, 233 insertions(+), 4 deletions(-)
 create mode 100644 libavcodec/x86/vvc/sao.asm
 create mode 100644 libavcodec/x86/vvc/sao_10bit.asm

diff --git a/libavcodec/x86/h26x/h2656_sao.asm b/libavcodec/x86/h26x/h2656_sao.asm
index 504fcb388b..a80ee26178 100644
--- a/libavcodec/x86/h26x/h2656_sao.asm
+++ b/libavcodec/x86/h26x/h2656_sao.asm
@@ -147,7 +147,7 @@ align 16
 %assign i i+mmsize
 %endrep
 
-%if %2 == 48
+%if %2 == 48 || %2 == 80 || %2 == 112
 INIT_XMM cpuname
 
     mova             m13, [srcq + i]
@@ -160,7 +160,7 @@ INIT_XMM cpuname
 %if cpuflag(avx2)
 INIT_YMM cpuname
 %endif
-%endif ; %2 == 48
+%endif ; %2 == 48 || %2 == 80 || %2 == 112
 
     add             dstq, dststrideq             ; dst += dststride
     add             srcq, srcstrideq             ; src += srcstride
@@ -280,7 +280,7 @@ align 16
 %assign i i+mmsize
 %endrep
 
-%if %2 == 48
+%if %2 == 48 || %2 == 80 || %2 == 112
 INIT_XMM cpuname
 
     mova              m1, [srcq + i]
@@ -291,7 +291,7 @@ INIT_XMM cpuname
 %if cpuflag(avx2)
 INIT_YMM cpuname
 %endif
-%endif
+%endif ; %2 == 48 || %2 == 80 || %2 == 112
 
     add             dstq, dststrideq
     add             srcq, EDGE_SRCSTRIDE
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index 86a6c8ba7c..c426b156c1 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -8,4 +8,6 @@ X86ASM-OBJS-$(CONFIG_VVC_DECODER)      += x86/vvc/alf.o             \
                                           x86/vvc/mc.o              \
                                           x86/vvc/of.o              \
                                           x86/vvc/sad.o             \
+                                          x86/vvc/sao.o             \
+                                          x86/vvc/sao_10bit.o       \
                                           x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index bb68ba0b1e..cbcfa40a66 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -215,6 +215,44 @@ ALF_FUNCS(16, 12, avx2)
 
 #endif
 
+#define SAO_FILTER_FUNC(wd, bitd, opt) /* prototypes of the band and edge SAO functions for one width (wd), bit depth (bitd) and ISA suffix (opt) */ \
+void ff_vvc_sao_band_filter_##wd##_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,  \
+    const int16_t *sao_offset_val, int sao_left_class, int width, int height);                                                       \
+void ff_vvc_sao_edge_filter_##wd##_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,                          \
+        const int16_t *sao_offset_val, int eo, int width, int height);                                                               \
+
+#define SAO_FILTER_FUNCS(bitd, opt) /* declare all nine widths (8, then 16..128 in steps of 16) for one bit depth and ISA suffix */ \
+    SAO_FILTER_FUNC(8,   bitd, opt)     \
+    SAO_FILTER_FUNC(16,  bitd, opt)     \
+    SAO_FILTER_FUNC(32,  bitd, opt)     \
+    SAO_FILTER_FUNC(48,  bitd, opt)     \
+    SAO_FILTER_FUNC(64,  bitd, opt)     \
+    SAO_FILTER_FUNC(80,  bitd, opt)     \
+    SAO_FILTER_FUNC(96,  bitd, opt)     \
+    SAO_FILTER_FUNC(112, bitd, opt)     \
+    SAO_FILTER_FUNC(128, bitd, opt)     \
+
+SAO_FILTER_FUNCS(8,  avx2) /* only the avx2 variants are declared, for 8-, 10- and 12-bit */
+SAO_FILTER_FUNCS(10, avx2)
+SAO_FILTER_FUNCS(12, avx2)
+
+#define SAO_FILTER_INIT(type, bitd, opt) do { /* fill c->sao.<type>_filter[]: slot 0 = width 8, slot k (1..8) = width 16*k; needs a pointer named c in scope */ \
+    c->sao.type##_filter[0] = ff_vvc_sao_##type##_filter_8_##bitd##_##opt;    \
+    c->sao.type##_filter[1] = ff_vvc_sao_##type##_filter_16_##bitd##_##opt;   \
+    c->sao.type##_filter[2] = ff_vvc_sao_##type##_filter_32_##bitd##_##opt;   \
+    c->sao.type##_filter[3] = ff_vvc_sao_##type##_filter_48_##bitd##_##opt;   \
+    c->sao.type##_filter[4] = ff_vvc_sao_##type##_filter_64_##bitd##_##opt;   \
+    c->sao.type##_filter[5] = ff_vvc_sao_##type##_filter_80_##bitd##_##opt;   \
+    c->sao.type##_filter[6] = ff_vvc_sao_##type##_filter_96_##bitd##_##opt;   \
+    c->sao.type##_filter[7] = ff_vvc_sao_##type##_filter_112_##bitd##_##opt;  \
+    c->sao.type##_filter[8] = ff_vvc_sao_##type##_filter_128_##bitd##_##opt;  \
+} while (0)
+
+#define SAO_INIT(bitd, opt) do { /* install both SAO tables (band, then edge) for one bit depth and ISA suffix */ \
+    SAO_FILTER_INIT(band, bitd, opt);                                \
+    SAO_FILTER_INIT(edge, bitd, opt);                                \
+} while (0)
+
 #define AVG_INIT(bd, opt) do {                                       \
     c->inter.avg    = bf(vvc_avg, bd, opt);                          \
     c->inter.w_avg  = bf(vvc_w_avg, bd, opt);                        \
@@ -329,6 +367,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
 
             // filter
             ALF_INIT(8);
+            SAO_INIT(8, avx2);
         }
 #endif
         break;
@@ -350,6 +389,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
 
             // filter
             ALF_INIT(10);
+            SAO_INIT(10, avx2);
         }
 #endif
         break;
@@ -371,6 +411,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
 
             // filter
             ALF_INIT(12);
+            SAO_INIT(12, avx2);
         }
 #endif
         break;
diff --git a/libavcodec/x86/vvc/sao.asm b/libavcodec/x86/vvc/sao.asm
new file mode 100644
index 0000000000..5f7d7e5358
--- /dev/null
+++ b/libavcodec/x86/vvc/sao.asm
@@ -0,0 +1,73 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for VVC 8bit decoding
+;*
+;* Copyright (c) 2024 Shaun Loo
+;* Copyright (c) 2024 Nuo Mi
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%define MAX_PB_SIZE  128
+%include "libavcodec/x86/h26x/h2656_sao.asm"
+
+%macro VVC_SAO_BAND_FILTER 2 ; %1 = block width, %2 = count (presumably the shared macro's %rep count - confirm in h2656_sao.asm)
+    H2656_SAO_BAND_FILTER vvc, %1, %2
+%endmacro ; thin wrapper: only prepends the vvc name argument
+
+%macro VVC_SAO_BAND_FILTER_FUNCS 0 ; NOTE(review): never expanded in this file - only the explicit AVX2 list below emits code
+VVC_SAO_BAND_FILTER   8, 0
+VVC_SAO_BAND_FILTER  16, 1
+VVC_SAO_BAND_FILTER  32, 2
+VVC_SAO_BAND_FILTER  48, 2
+VVC_SAO_BAND_FILTER  64, 4
+VVC_SAO_BAND_FILTER  80, 4
+VVC_SAO_BAND_FILTER  96, 6
+VVC_SAO_BAND_FILTER 112, 6
+VVC_SAO_BAND_FILTER 128, 8
+%endmacro ; width/count table presumably intended for 16-byte registers (compare the YMM counts below) - confirm before use
+
+%if HAVE_AVX2_EXTERNAL ; 8-bit band filters: widths 8 and 16 are built under INIT_XMM, widths 32..128 under INIT_YMM
+INIT_XMM avx2
+VVC_SAO_BAND_FILTER   8, 0
+VVC_SAO_BAND_FILTER  16, 1
+INIT_YMM avx2
+VVC_SAO_BAND_FILTER  32, 1
+VVC_SAO_BAND_FILTER  48, 1
+VVC_SAO_BAND_FILTER  64, 2
+VVC_SAO_BAND_FILTER  80, 2
+VVC_SAO_BAND_FILTER  96, 3
+VVC_SAO_BAND_FILTER 112, 3
+VVC_SAO_BAND_FILTER 128, 4
+%endif ; HAVE_AVX2_EXTERNAL - widths 48/80/112 reuse the count of the next lower multiple of 32; the shared macro special-cases those widths
+
+%macro VVC_SAO_EDGE_FILTER 2-3 ; %1 = block width, %2 = count, optional %3 = a/u (presumably the mova/movu store suffix - confirm in h2656_sao.asm)
+    H2656_SAO_EDGE_FILTER vvc, %{1:-1}
+%endmacro ; %{1:-1} forwards every argument that was supplied, after the vvc name argument
+
+%if HAVE_AVX2_EXTERNAL ; 8-bit edge filters: widths 8 and 16 are built under INIT_XMM, widths 32..128 under INIT_YMM
+INIT_XMM avx2
+VVC_SAO_EDGE_FILTER  8, 0
+VVC_SAO_EDGE_FILTER 16, 1, a
+INIT_YMM avx2
+VVC_SAO_EDGE_FILTER  32, 1, a
+VVC_SAO_EDGE_FILTER  48, 1, u
+VVC_SAO_EDGE_FILTER  64, 2, a
+VVC_SAO_EDGE_FILTER  80, 2, u
+VVC_SAO_EDGE_FILTER  96, 3, a
+VVC_SAO_EDGE_FILTER 112, 3, u
+VVC_SAO_EDGE_FILTER 128, 4, a
+%endif ; HAVE_AVX2_EXTERNAL - widths that are not a multiple of 32 (48/80/112) pass u, all other YMM widths pass a
diff --git a/libavcodec/x86/vvc/sao_10bit.asm b/libavcodec/x86/vvc/sao_10bit.asm
new file mode 100644
index 0000000000..b7d3d08008
--- /dev/null
+++ b/libavcodec/x86/vvc/sao_10bit.asm
@@ -0,0 +1,113 @@
+;******************************************************************************
+;* SIMD optimized SAO functions for VVC 10/12bit decoding
+;*
+;* Copyright (c) 2024 Shaun Loo
+;* Copyright (c) 2024 Nuo Mi
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%define MAX_PB_SIZE  128
+%include "libavcodec/x86/h26x/h2656_sao_10bit.asm"
+
+%macro VVC_SAO_BAND_FILTER 3 ; %1 = bit depth, %2 = block width, %3 = count (presumably the shared macro's %rep count - confirm in h2656_sao_10bit.asm)
+    H2656_SAO_BAND_FILTER vvc, %1, %2, %3
+%endmacro ; thin wrapper: only prepends the vvc name argument
+
+%macro VVC_SAO_BAND_FILTER_FUNCS 1 ; %1 = bit depth; emits band filters for widths 8..128, third argument = width/8
+    VVC_SAO_BAND_FILTER %1,   8,  1
+    VVC_SAO_BAND_FILTER %1,  16,  2
+    VVC_SAO_BAND_FILTER %1,  32,  4
+    VVC_SAO_BAND_FILTER %1,  48,  6
+    VVC_SAO_BAND_FILTER %1,  64,  8
+    VVC_SAO_BAND_FILTER %1,  80, 10
+    VVC_SAO_BAND_FILTER %1,  96, 12
+    VVC_SAO_BAND_FILTER %1, 112, 14
+    VVC_SAO_BAND_FILTER %1, 128, 16
+%endmacro ; expanded only through the zero-argument overload, under INIT_XMM
+
+%macro VVC_SAO_BAND_FILTER_FUNCS 0 ; zero-argument overload of the same name: emits both the 10-bit and the 12-bit sets
+    VVC_SAO_BAND_FILTER_FUNCS 10
+    VVC_SAO_BAND_FILTER_FUNCS 12
+%endmacro
+
+INIT_XMM sse2 ; NOTE(review): the dsp_init.c hunk of this patch declares and installs only the avx2 functions - confirm the sse2/avx builds are wanted
+VVC_SAO_BAND_FILTER_FUNCS
+INIT_XMM avx
+VVC_SAO_BAND_FILTER_FUNCS
+
+%if HAVE_AVX2_EXTERNAL ; 10/12-bit AVX2 band filters
+
+%macro VVC_SAO_BAND_FILTER_FUNCS_AVX2 1 ; %1 = bit depth; width 8 under INIT_XMM, widths 16..128 under INIT_YMM with third argument = width/16
+    INIT_XMM avx2
+    VVC_SAO_BAND_FILTER %1,   8, 1
+    INIT_YMM avx2
+    VVC_SAO_BAND_FILTER %1,  16, 1
+    VVC_SAO_BAND_FILTER %1,  32, 2
+    VVC_SAO_BAND_FILTER %1,  48, 3
+    VVC_SAO_BAND_FILTER %1,  64, 4
+    VVC_SAO_BAND_FILTER %1,  80, 5
+    VVC_SAO_BAND_FILTER %1,  96, 6
+    VVC_SAO_BAND_FILTER %1, 112, 7
+    VVC_SAO_BAND_FILTER %1, 128, 8
+%endmacro
+
+VVC_SAO_BAND_FILTER_FUNCS_AVX2 10
+VVC_SAO_BAND_FILTER_FUNCS_AVX2 12
+
+%endif ; HAVE_AVX2_EXTERNAL
+
+%macro VVC_SAO_EDGE_FILTER 3 ; %1 = bit depth, %2 = block width, %3 = count (presumably the shared macro's %rep count - confirm in h2656_sao_10bit.asm)
+    H2656_SAO_EDGE_FILTER vvc, %1, %2, %3
+%endmacro ; thin wrapper: only prepends the vvc name argument
+
+%macro VVC_SAO_EDGE_FILTER_FUNCS 1 ; %1 = bit depth; emits edge filters for widths 8..128, third argument = width/8
+    VVC_SAO_EDGE_FILTER %1,   8,  1
+    VVC_SAO_EDGE_FILTER %1,  16,  2
+    VVC_SAO_EDGE_FILTER %1,  32,  4
+    VVC_SAO_EDGE_FILTER %1,  48,  6
+    VVC_SAO_EDGE_FILTER %1,  64,  8
+    VVC_SAO_EDGE_FILTER %1,  80, 10
+    VVC_SAO_EDGE_FILTER %1,  96, 12
+    VVC_SAO_EDGE_FILTER %1, 112, 14
+    VVC_SAO_EDGE_FILTER %1, 128, 16
+%endmacro ; expanded below under INIT_XMM sse2 for 10 and 12 bit
+
+INIT_XMM sse2 ; NOTE(review): the dsp_init.c hunk of this patch declares and installs only the avx2 functions - confirm the sse2 build is wanted
+VVC_SAO_EDGE_FILTER_FUNCS 10
+VVC_SAO_EDGE_FILTER_FUNCS 12
+
+%if HAVE_AVX2_EXTERNAL ; 10/12-bit AVX2 edge filters
+
+%macro VVC_SAO_EDGE_FILTER_FUNCS_AVX2 1 ; %1 = bit depth; width 8 under INIT_XMM, widths 16..128 under INIT_YMM with third argument = width/16
+    INIT_XMM avx2
+    VVC_SAO_EDGE_FILTER %1,   8, 1
+    INIT_YMM avx2
+    VVC_SAO_EDGE_FILTER %1,  16, 1
+    VVC_SAO_EDGE_FILTER %1,  32, 2
+    VVC_SAO_EDGE_FILTER %1,  48, 3
+    VVC_SAO_EDGE_FILTER %1,  64, 4
+    VVC_SAO_EDGE_FILTER %1,  80, 5
+    VVC_SAO_EDGE_FILTER %1,  96, 6
+    VVC_SAO_EDGE_FILTER %1, 112, 7
+    VVC_SAO_EDGE_FILTER %1, 128, 8
+%endmacro
+
+VVC_SAO_EDGE_FILTER_FUNCS_AVX2 10
+VVC_SAO_EDGE_FILTER_FUNCS_AVX2 12
+
+%endif ; HAVE_AVX2_EXTERNAL
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 5/7] checkasm: add vvc_sao
  2025-05-03  9:13 [FFmpeg-devel] [PATCH v2 1/7] x86/vvcdec: misc, reordered functions in dsp_init for improved readability Nuo Mi
                   ` (2 preceding siblings ...)
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 4/7] x86/vvcdec: sao, add avx2 support Nuo Mi
@ 2025-05-03  9:13 ` Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless Nuo Mi
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 7/7] checkasm: hevc sao, use checkasm_check_padded Nuo Mi
  5 siblings, 0 replies; 10+ messages in thread
From: Nuo Mi @ 2025-05-03  9:13 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi, Shaun Loo

From: Shaun Loo <shaunloo10@gmail.com>

This is a part of Google Summer of Code 2023

AVX2:
 - vvc_sao.sao_band [OK]
 - vvc_sao.sao_edge [OK]
checkasm: all 54 tests passed
vvc_sao_band_8_8_c:                                    157.4 ( 1.00x)
vvc_sao_band_8_8_avx2:                                  30.7 ( 5.12x)
vvc_sao_band_8_10_c:                                   119.4 ( 1.00x)
vvc_sao_band_8_10_avx2:                                 29.2 ( 4.09x)
vvc_sao_band_8_12_c:                                   144.6 ( 1.00x)
vvc_sao_band_8_12_avx2:                                 30.0 ( 4.82x)
vvc_sao_band_16_8_c:                                   446.5 ( 1.00x)
vvc_sao_band_16_8_avx2:                                103.3 ( 4.32x)
vvc_sao_band_16_10_c:                                  399.2 ( 1.00x)
vvc_sao_band_16_10_avx2:                                64.3 ( 6.21x)
vvc_sao_band_16_12_c:                                  472.9 ( 1.00x)
vvc_sao_band_16_12_avx2:                                56.5 ( 8.37x)
vvc_sao_band_32_8_c:                                  2430.9 ( 1.00x)
vvc_sao_band_32_8_avx2:                                203.3 (11.96x)
vvc_sao_band_32_10_c:                                 1405.7 ( 1.00x)
vvc_sao_band_32_10_avx2:                               208.5 ( 6.74x)
vvc_sao_band_32_12_c:                                 2054.3 ( 1.00x)
vvc_sao_band_32_12_avx2:                               213.0 ( 9.64x)
vvc_sao_band_48_8_c:                                  3835.4 ( 1.00x)
vvc_sao_band_48_8_avx2:                                604.2 ( 6.35x)
vvc_sao_band_48_10_c:                                 3624.6 ( 1.00x)
vvc_sao_band_48_10_avx2:                               468.8 ( 7.73x)
vvc_sao_band_48_12_c:                                 3752.4 ( 1.00x)
vvc_sao_band_48_12_avx2:                               477.5 ( 7.86x)
vvc_sao_band_64_8_c:                                  6061.1 ( 1.00x)
vvc_sao_band_64_8_avx2:                                803.9 ( 7.54x)
vvc_sao_band_64_10_c:                                 6142.5 ( 1.00x)
vvc_sao_band_64_10_avx2:                               827.3 ( 7.43x)
vvc_sao_band_64_12_c:                                 6106.6 ( 1.00x)
vvc_sao_band_64_12_avx2:                               839.9 ( 7.27x)
vvc_sao_band_80_8_c:                                  9478.0 ( 1.00x)
vvc_sao_band_80_8_avx2:                               1516.7 ( 6.25x)
vvc_sao_band_80_10_c:                                10300.5 ( 1.00x)
vvc_sao_band_80_10_avx2:                              1298.7 ( 7.93x)
vvc_sao_band_80_12_c:                                 8941.1 ( 1.00x)
vvc_sao_band_80_12_avx2:                              1315.3 ( 6.80x)
vvc_sao_band_96_8_c:                                 13351.5 ( 1.00x)
vvc_sao_band_96_8_avx2:                               1815.4 ( 7.35x)
vvc_sao_band_96_10_c:                                13197.5 ( 1.00x)
vvc_sao_band_96_10_avx2:                              1872.4 ( 7.05x)
vvc_sao_band_96_12_c:                                11969.0 ( 1.00x)
vvc_sao_band_96_12_avx2:                              1895.8 ( 6.31x)
vvc_sao_band_112_8_c:                                19936.9 ( 1.00x)
vvc_sao_band_112_8_avx2:                              2802.3 ( 7.11x)
vvc_sao_band_112_10_c:                               19534.9 ( 1.00x)
vvc_sao_band_112_10_avx2:                             2635.0 ( 7.41x)
vvc_sao_band_112_12_c:                               16520.6 ( 1.00x)
vvc_sao_band_112_12_avx2:                             2591.8 ( 6.37x)
vvc_sao_band_128_8_c:                                25967.5 ( 1.00x)
vvc_sao_band_128_8_avx2:                              3155.3 ( 8.23x)
vvc_sao_band_128_10_c:                               24002.6 ( 1.00x)
vvc_sao_band_128_10_avx2:                             3374.6 ( 7.11x)
vvc_sao_band_128_12_c:                               20829.4 ( 1.00x)
vvc_sao_band_128_12_avx2:                             3377.0 ( 6.17x)
vvc_sao_edge_8_8_c:                                    174.6 ( 1.00x)
vvc_sao_edge_8_8_avx2:                                  37.0 ( 4.72x)
vvc_sao_edge_8_10_c:                                   174.4 ( 1.00x)
vvc_sao_edge_8_10_avx2:                                 58.5 ( 2.98x)
vvc_sao_edge_8_12_c:                                   171.1 ( 1.00x)
vvc_sao_edge_8_12_avx2:                                 58.5 ( 2.93x)
vvc_sao_edge_16_8_c:                                   677.7 ( 1.00x)
vvc_sao_edge_16_8_avx2:                                 72.2 ( 9.39x)
vvc_sao_edge_16_10_c:                                  724.8 ( 1.00x)
vvc_sao_edge_16_10_avx2:                               106.4 ( 6.81x)
vvc_sao_edge_16_12_c:                                  647.0 ( 1.00x)
vvc_sao_edge_16_12_avx2:                               106.6 ( 6.07x)
vvc_sao_edge_32_8_c:                                  3001.8 ( 1.00x)
vvc_sao_edge_32_8_avx2:                                157.6 (19.04x)
vvc_sao_edge_32_10_c:                                 3071.1 ( 1.00x)
vvc_sao_edge_32_10_avx2:                               404.2 ( 7.60x)
vvc_sao_edge_32_12_c:                                 2698.6 ( 1.00x)
vvc_sao_edge_32_12_avx2:                               398.8 ( 6.77x)
vvc_sao_edge_48_8_c:                                  6557.7 ( 1.00x)
vvc_sao_edge_48_8_avx2:                                380.1 (17.25x)
vvc_sao_edge_48_10_c:                                 6319.9 ( 1.00x)
vvc_sao_edge_48_10_avx2:                               896.3 ( 7.05x)
vvc_sao_edge_48_12_c:                                 6306.4 ( 1.00x)
vvc_sao_edge_48_12_avx2:                               885.5 ( 7.12x)
vvc_sao_edge_64_8_c:                                 11510.7 ( 1.00x)
vvc_sao_edge_64_8_avx2:                                504.1 (22.84x)
vvc_sao_edge_64_10_c:                                10917.4 ( 1.00x)
vvc_sao_edge_64_10_avx2:                              1608.3 ( 6.79x)
vvc_sao_edge_64_12_c:                                11499.8 ( 1.00x)
vvc_sao_edge_64_12_avx2:                              1586.4 ( 7.25x)
vvc_sao_edge_80_8_c:                                 18193.2 ( 1.00x)
vvc_sao_edge_80_8_avx2:                                930.2 (19.56x)
vvc_sao_edge_80_10_c:                                17984.3 ( 1.00x)
vvc_sao_edge_80_10_avx2:                              2420.9 ( 7.43x)
vvc_sao_edge_80_12_c:                                18289.4 ( 1.00x)
vvc_sao_edge_80_12_avx2:                              2412.1 ( 7.58x)
vvc_sao_edge_96_8_c:                                 26361.8 ( 1.00x)
vvc_sao_edge_96_8_avx2:                               1118.4 (23.57x)
vvc_sao_edge_96_10_c:                                26162.2 ( 1.00x)
vvc_sao_edge_96_10_avx2:                              3666.9 ( 7.13x)
vvc_sao_edge_96_12_c:                                25926.6 ( 1.00x)
vvc_sao_edge_96_12_avx2:                              3433.9 ( 7.55x)
vvc_sao_edge_112_8_c:                                36562.9 ( 1.00x)
vvc_sao_edge_112_8_avx2:                              1741.0 (21.00x)
vvc_sao_edge_112_10_c:                               38126.4 ( 1.00x)
vvc_sao_edge_112_10_avx2:                             5153.3 ( 7.40x)
vvc_sao_edge_112_12_c:                               36345.7 ( 1.00x)
vvc_sao_edge_112_12_avx2:                             4684.9 ( 7.76x)
vvc_sao_edge_128_8_c:                                46379.8 ( 1.00x)
vvc_sao_edge_128_8_avx2:                              2012.4 (23.05x)
vvc_sao_edge_128_10_c:                               47029.5 ( 1.00x)
vvc_sao_edge_128_10_avx2:                             6162.2 ( 7.63x)
vvc_sao_edge_128_12_c:                               49647.3 ( 1.00x)
vvc_sao_edge_128_12_avx2:                             6127.1 ( 8.10x)

Co-authored-by: Nuo Mi <nuomi2021@gmail.com>
---
 tests/checkasm/Makefile   |   2 +-
 tests/checkasm/checkasm.c |   1 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_sao.c  | 154 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/vvc_sao.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 193c1e4633..fabbf595b4 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -47,7 +47,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)      += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)      += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)    += vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)       += vp9dsp.o
-AVCODECOBJS-$(CONFIG_VVC_DECODER)       += vvc_alf.o vvc_mc.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)       += vvc_alf.o vvc_mc.o vvc_sao.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 3bb82ed0e5..0734cd26bf 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -256,6 +256,7 @@ static const struct {
     #if CONFIG_VVC_DECODER
         { "vvc_alf", checkasm_check_vvc_alf },
         { "vvc_mc",  checkasm_check_vvc_mc  },
+        { "vvc_sao", checkasm_check_vvc_sao },
     #endif
 #endif
 #if CONFIG_AVFILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index a6b5965e02..146bfdec35 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -149,6 +149,7 @@ void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
 void checkasm_check_vvc_alf(void);
 void checkasm_check_vvc_mc(void);
+void checkasm_check_vvc_sao(void);
 
 struct CheckasmPerf;
 
diff --git a/tests/checkasm/vvc_sao.c b/tests/checkasm/vvc_sao.c
new file mode 100644
index 0000000000..9e6507b2c3
--- /dev/null
+++ b/tests/checkasm/vvc_sao.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018 Yingming Fan <yingmingfan@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/vvc/dsp.h"
+#include "libavcodec/vvc/ctu.h"
+
+#include "checkasm.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
+static const uint32_t sao_size[] = {8, 16, 32, 48, 64, 80, 96, 112, 128};
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define PIXEL_STRIDE (2*MAX_CTU_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) //same with sao_edge src_stride
+#define BUF_SIZE (PIXEL_STRIDE * (MAX_CTU_SIZE+2) * 2) //+2 for top and bottom row, *2 for high bit depth
+#define OFFSET_THRESH (1 << (bit_depth - 5))
+#define OFFSET_LENGTH 5
+
+#define randomize_buffers(buf0, buf1, size)                 \
+    do {                                                    \
+        uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+        int k;                                              \
+        for (k = 0; k < size; k += 4) {                     \
+            uint32_t r = rnd() & mask;                      \
+            AV_WN32A(buf0 + k, r);                          \
+            AV_WN32A(buf1 + k, r);                          \
+        }                                                   \
+    } while (0)
+
+#define randomize_buffers2(buf, size)                       \
+    do {                                                    \
+        uint32_t max_offset = OFFSET_THRESH;                \
+        int k;                                              \
+        if (bit_depth == 8) {                               \
+            for (k = 0; k < size; k++) {                    \
+                uint8_t r = rnd() % max_offset;             \
+                buf[k] = r;                                 \
+            }                                               \
+        } else {                                            \
+            for (k = 0; k < size; k++) {                    \
+                uint16_t r = rnd() % max_offset;            \
+                buf[k] = r;                                 \
+            }                                               \
+        }                                                   \
+    } while (0)
+
+static void check_sao_band(VVCDSPContext *h, int bit_depth)
+{
+    PIXEL_RECT(dst0, MAX_CTU_SIZE, MAX_CTU_SIZE);
+    PIXEL_RECT(dst1, MAX_CTU_SIZE, MAX_CTU_SIZE);
+    LOCAL_ALIGNED_32(uint8_t, src0, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, src1, [BUF_SIZE]);
+    int16_t offset_val[OFFSET_LENGTH];
+    const int left_class = rnd()%32;
+    const int walign = 16;
+
+    for (int i = 0; i < FF_ARRAY_ELEMS(sao_size); i++) {
+        const int block_size = sao_size[i];
+        const int prev_size = i > 0 ? sao_size[i - 1] : 0;
+        ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
+        declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                     const int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+        if (check_func(h->sao.band_filter[i], "vvc_sao_band_%d_%d", block_size, bit_depth)) {
+
+            for (int w = prev_size + 4; w <= block_size; w += 4) {
+                randomize_buffers(src0, src1, BUF_SIZE);
+                randomize_buffers2(offset_val, OFFSET_LENGTH);
+                CLEAR_PIXEL_RECT(dst0);
+                CLEAR_PIXEL_RECT(dst1);
+
+                call_ref(dst0, src0, dst0_stride, stride, offset_val, left_class, w, block_size);
+                call_new(dst1, src1, dst1_stride, stride, offset_val, left_class, w, block_size);
+                checkasm_check_pixel_padded_align(dst0, dst0_stride, dst1, dst1_stride, w, block_size, "dst", walign, 1);
+            }
+            bench_new(dst1, src1, dst1_stride, stride, offset_val, left_class, block_size, block_size);
+        }
+    }
+}
+
+static void check_sao_edge(VVCDSPContext *h, int bit_depth)
+{
+    PIXEL_RECT(dst0, MAX_CTU_SIZE, MAX_CTU_SIZE);
+    PIXEL_RECT(dst1, MAX_CTU_SIZE, MAX_CTU_SIZE);
+    LOCAL_ALIGNED_32(uint8_t, src0, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, src1, [BUF_SIZE]);
+    int16_t offset_val[OFFSET_LENGTH];
+    const int eo = rnd()%4;
+    const int walign = 16;
+
+    for (int i = 0; i < FF_ARRAY_ELEMS(sao_size); i++) {
+        int block_size = sao_size[i];
+        int prev_size = i > 0 ? sao_size[i - 1] : 0;
+        int offset = (AV_INPUT_BUFFER_PADDING_SIZE + PIXEL_STRIDE)*SIZEOF_PIXEL;
+        declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
+                     const int16_t *sao_offset_val, int eo, int width, int height);
+
+        if (check_func(h->sao.edge_filter[i], "vvc_sao_edge_%d_%d", block_size, bit_depth)) {
+            for (int w = prev_size + 4; w <= block_size; w += 4) {
+                randomize_buffers(src0, src1, BUF_SIZE);
+                randomize_buffers2(offset_val, OFFSET_LENGTH);
+                CLEAR_PIXEL_RECT(dst0);
+                CLEAR_PIXEL_RECT(dst1);
+
+                call_ref(dst0, src0 + offset, dst0_stride, offset_val, eo, w, block_size);
+                call_new(dst1, src1 + offset, dst1_stride, offset_val, eo, w, block_size);
+                checkasm_check_pixel_padded_align(dst0, dst0_stride, dst1, dst1_stride, w, block_size, "dst", walign, 1);
+            }
+            bench_new(dst1, src1 + offset, dst1_stride, offset_val, eo, block_size, block_size);
+        }
+    }
+}
+
+void checkasm_check_vvc_sao(void)
+{
+    int bit_depth;
+
+    for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        VVCDSPContext h;
+
+        ff_vvc_dsp_init(&h, bit_depth);
+        check_sao_band(&h, bit_depth);
+    }
+    report("sao_band");
+
+    for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        VVCDSPContext h;
+
+        ff_vvc_dsp_init(&h, bit_depth);
+        check_sao_edge(&h, bit_depth);
+    }
+    report("sao_edge");
+}
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless
  2025-05-03  9:13 [FFmpeg-devel] [PATCH v2 1/7] x86/vvcdec: misc, reordered functions in dsp_init for improved readability Nuo Mi
                   ` (3 preceding siblings ...)
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 5/7] checkasm: add vvc_sao Nuo Mi
@ 2025-05-03  9:13 ` Nuo Mi
  2025-05-15  1:05   ` softworkz .
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 7/7] checkasm: hevc sao, use checkasm_check_padded Nuo Mi
  5 siblings, 1 reply; 10+ messages in thread
From: Nuo Mi @ 2025-05-03  9:13 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi

---
 tests/checkasm/hevc_sao.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
index ad47423f10..f597eb5254 100644
--- a/tests/checkasm/hevc_sao.c
+++ b/tests/checkasm/hevc_sao.c
@@ -119,21 +119,21 @@ static void check_sao_edge(HEVCDSPContext *h, int bit_depth)
         declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
                      const int16_t *sao_offset_val, int eo, int width, int height);
 
-        for (int w = prev_size + 4; w <= block_size; w += 4) {
-            randomize_buffers(src0, src1, BUF_SIZE);
-            randomize_buffers2(offset_val, OFFSET_LENGTH);
-            memset(dst0, 0, BUF_SIZE);
-            memset(dst1, 0, BUF_SIZE);
+        if (check_func(h->sao_edge_filter[i], "hevc_sao_edge_%d_%d", block_size, bit_depth)) {
+            for (int w = prev_size + 4; w <= block_size; w += 4) {
+                randomize_buffers(src0, src1, BUF_SIZE);
+                randomize_buffers2(offset_val, OFFSET_LENGTH);
+                memset(dst0, 0, BUF_SIZE);
+                memset(dst1, 0, BUF_SIZE);
 
-            if (check_func(h->sao_edge_filter[i], "hevc_sao_edge_%d_%d", block_size, bit_depth)) {
                 call_ref(dst0, src0 + offset, stride, offset_val, eo, w, block_size);
                 call_new(dst1, src1 + offset, stride, offset_val, eo, w, block_size);
                 for (int j = 0; j < block_size; j++) {
                     if (memcmp(dst0 + j*stride, dst1 + j*stride, w*SIZEOF_PIXEL))
                         fail();
                 }
-                bench_new(dst1, src1 + offset, stride, offset_val, eo, block_size, block_size);
             }
+            bench_new(dst1, src1 + offset, stride, offset_val, eo, block_size, block_size);
         }
     }
 }
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [FFmpeg-devel] [PATCH v2 7/7] checkasm: hevc sao, use checkasm_check_padded
  2025-05-03  9:13 [FFmpeg-devel] [PATCH v2 1/7] x86/vvcdec: misc, reordered functions in dsp_init for improved readability Nuo Mi
                   ` (4 preceding siblings ...)
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless Nuo Mi
@ 2025-05-03  9:13 ` Nuo Mi
  5 siblings, 0 replies; 10+ messages in thread
From: Nuo Mi @ 2025-05-03  9:13 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi

---
 tests/checkasm/hevc_sao.c | 45 +++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
index f597eb5254..37f6ba8c5c 100644
--- a/tests/checkasm/hevc_sao.c
+++ b/tests/checkasm/hevc_sao.c
@@ -67,12 +67,13 @@ static const uint32_t sao_size[5] = {8, 16, 32, 48, 64};
 static void check_sao_band(HEVCDSPContext *h, int bit_depth)
 {
     int i;
-    LOCAL_ALIGNED_32(uint8_t, dst0, [BUF_SIZE]);
-    LOCAL_ALIGNED_32(uint8_t, dst1, [BUF_SIZE]);
+    PIXEL_RECT(dst0, MAX_PB_SIZE, MAX_PB_SIZE);
+    PIXEL_RECT(dst1, MAX_PB_SIZE, MAX_PB_SIZE);
     LOCAL_ALIGNED_32(uint8_t, src0, [BUF_SIZE]);
     LOCAL_ALIGNED_32(uint8_t, src1, [BUF_SIZE]);
     int16_t offset_val[OFFSET_LENGTH];
     int left_class = rnd()%32;
+    const int walign = 16;
 
     for (i = 0; i <= 4; i++) {
         int block_size = sao_size[i];
@@ -86,17 +87,14 @@ static void check_sao_band(HEVCDSPContext *h, int bit_depth)
             for (int w = prev_size + 4; w <= block_size; w += 4) {
                 randomize_buffers(src0, src1, BUF_SIZE);
                 randomize_buffers2(offset_val, OFFSET_LENGTH);
-                memset(dst0, 0, BUF_SIZE);
-                memset(dst1, 0, BUF_SIZE);
-
-                call_ref(dst0, src0, stride, stride, offset_val, left_class, w, block_size);
-                call_new(dst1, src1, stride, stride, offset_val, left_class, w, block_size);
-                for (int j = 0; j < block_size; j++) {
-                    if (memcmp(dst0 + j*stride, dst1 + j*stride, w*SIZEOF_PIXEL))
-                        fail();
-                }
+                CLEAR_PIXEL_RECT(dst0);
+                CLEAR_PIXEL_RECT(dst1);
+
+                call_ref(dst0, src0, dst0_stride, stride, offset_val, left_class, w, block_size);
+                call_new(dst1, src1, dst1_stride, stride, offset_val, left_class, w, block_size);
+                checkasm_check_pixel_padded_align(dst0, dst0_stride, dst1, dst1_stride, w, block_size, "dst", walign, 1);
             }
-            bench_new(dst1, src1, stride, stride, offset_val, left_class, block_size, block_size);
+            bench_new(dst1, src1, dst1_stride, stride, offset_val, left_class, block_size, block_size);
         }
     }
 }
@@ -104,17 +102,17 @@ static void check_sao_band(HEVCDSPContext *h, int bit_depth)
 static void check_sao_edge(HEVCDSPContext *h, int bit_depth)
 {
     int i;
-    LOCAL_ALIGNED_32(uint8_t, dst0, [BUF_SIZE]);
-    LOCAL_ALIGNED_32(uint8_t, dst1, [BUF_SIZE]);
+    PIXEL_RECT(dst0, MAX_PB_SIZE, MAX_PB_SIZE);
+    PIXEL_RECT(dst1, MAX_PB_SIZE, MAX_PB_SIZE);
     LOCAL_ALIGNED_32(uint8_t, src0, [BUF_SIZE]);
     LOCAL_ALIGNED_32(uint8_t, src1, [BUF_SIZE]);
     int16_t offset_val[OFFSET_LENGTH];
     int eo = rnd()%4;
+    const int walign = 16;
 
     for (i = 0; i <= 4; i++) {
         int block_size = sao_size[i];
         int prev_size = i > 0 ? sao_size[i - 1] : 0;
-        ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
         int offset = (AV_INPUT_BUFFER_PADDING_SIZE + PIXEL_STRIDE)*SIZEOF_PIXEL;
         declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
                      const int16_t *sao_offset_val, int eo, int width, int height);
@@ -123,17 +121,14 @@ static void check_sao_edge(HEVCDSPContext *h, int bit_depth)
             for (int w = prev_size + 4; w <= block_size; w += 4) {
                 randomize_buffers(src0, src1, BUF_SIZE);
                 randomize_buffers2(offset_val, OFFSET_LENGTH);
-                memset(dst0, 0, BUF_SIZE);
-                memset(dst1, 0, BUF_SIZE);
-
-                call_ref(dst0, src0 + offset, stride, offset_val, eo, w, block_size);
-                call_new(dst1, src1 + offset, stride, offset_val, eo, w, block_size);
-                for (int j = 0; j < block_size; j++) {
-                    if (memcmp(dst0 + j*stride, dst1 + j*stride, w*SIZEOF_PIXEL))
-                        fail();
-                }
+                CLEAR_PIXEL_RECT(dst0);
+                CLEAR_PIXEL_RECT(dst1);
+
+                call_ref(dst0, src0 + offset, dst0_stride, offset_val, eo, w, block_size);
+                call_new(dst1, src1 + offset, dst1_stride, offset_val, eo, w, block_size);
+                checkasm_check_pixel_padded_align(dst0, dst0_stride, dst1, dst1_stride, w, block_size, "dst", walign, 1);
             }
-            bench_new(dst1, src1 + offset, stride, offset_val, eo, block_size, block_size);
+            bench_new(dst1, src1 + offset, dst1_stride, offset_val, eo, block_size, block_size);
         }
     }
 }
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless
  2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless Nuo Mi
@ 2025-05-15  1:05   ` softworkz .
  2025-05-15 12:49     ` Nuo Mi
  0 siblings, 1 reply; 10+ messages in thread
From: softworkz . @ 2025-05-15  1:05 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Nuo Mi



> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nuo Mi
> Sent: Samstag, 3. Mai 2025 11:13
> To: ffmpeg-devel@ffmpeg.org
> Cc: Nuo Mi <nuomi2021@gmail.com>
> Subject: [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking
> inside the width loop is meaningless
> 
> ---
>  tests/checkasm/hevc_sao.c | 14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
> index ad47423f10..f597eb5254 100644
> --- a/tests/checkasm/hevc_sao.c
> +++ b/tests/checkasm/hevc_sao.c
> @@ -119,21 +119,21 @@ static void check_sao_edge(HEVCDSPContext *h, int
> bit_depth)
>          declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t
> stride_dst,
>                       const int16_t *sao_offset_val, int eo, int width, int
> height);
> 
> -        for (int w = prev_size + 4; w <= block_size; w += 4) {
> -            randomize_buffers(src0, src1, BUF_SIZE);
> -            randomize_buffers2(offset_val, OFFSET_LENGTH);
> -            memset(dst0, 0, BUF_SIZE);
> -            memset(dst1, 0, BUF_SIZE);
> +        if (check_func(h->sao_edge_filter[i], "hevc_sao_edge_%d_%d",
> block_size, bit_depth)) {
> +            for (int w = prev_size + 4; w <= block_size; w += 4) {
> +                randomize_buffers(src0, src1, BUF_SIZE);
> +                randomize_buffers2(offset_val, OFFSET_LENGTH);
> +                memset(dst0, 0, BUF_SIZE);
> +                memset(dst1, 0, BUF_SIZE);
> 
> -            if (check_func(h->sao_edge_filter[i], "hevc_sao_edge_%d_%d",
> block_size, bit_depth)) {
>                  call_ref(dst0, src0 + offset, stride, offset_val, eo, w,
> block_size);
>                  call_new(dst1, src1 + offset, stride, offset_val, eo, w,
> block_size);
>                  for (int j = 0; j < block_size; j++) {
>                      if (memcmp(dst0 + j*stride, dst1 + j*stride,
> w*SIZEOF_PIXEL))
>                          fail();
>                  }
> -                bench_new(dst1, src1 + offset, stride, offset_val, eo,
> block_size, block_size);
>              }
> +            bench_new(dst1, src1 + offset, stride, offset_val, eo,
> block_size, block_size);
>          }
>      }
>  }
> --

Hi Nuo,

since you have applied this patch (or 7/7) today, both FATE builds on Windows
(MSVC + GCC) are failing - for all submitted patches.

https://patchwork.ffmpeg.org/project/ffmpeg 


Could you please take a look?




With MSVC:

D:\a\1\s\libavcodec\get_bits.h(366): warning C4101: 're_cache': unreferenced local variable
vvc_alf.c
CC	tests/checkasm/vp9dsp.o
vp9dsp.c
D:\a\1\s\libavcodec\get_bits.h(366): warning C4101: 're_cache': unreferenced local variable
STRIP	tests/checkasm/x86/checkasm.o
skipping strip -x tests/checkasm/x86/checkasm.o
CC	tests/checkasm/vvc_sao.o
vvc_sao.c
D:\a\1\s\libavcodec\get_bits.h(366): warning C4101: 're_cache': unreferenced local variable
D:\a\1\s\libavcodec\get_bits.h(366): warning C4101: 're_cache': unreferenced local variable
C:\Program Files (x86)\Windows Kits\10\\include\10.0.26100.0\\um\winnt.h(21227): error C2143: syntax error: missing ':' before 'constant'
C:\Program Files (x86)\Windows Kits\10\\include\10.0.26100.0\\um\winnt.h(21227): error C2143: syntax error: missing ';' before ':'
C:\Program Files (x86)\Windows Kits\10\\include\10.0.26100.0\\um\winnt.h(21227): error C2059: syntax error: ':'
C:\Program Files (x86)\Windows Kits\10\\include\10.0.26100.0\\um\winnt.h(21228): error C2143: syntax error: missing '{' before ':'
C:\Program Files (x86)\Windows Kits\10\\include\10.0.26100.0\\um\winnt.h(21228): error C2059: syntax error: ':'
make: *** [ffbuild/common.mak:81: tests/checkasm/vvc_sao.o] Error 2
C:\Program Files (x86)\Windows Kits\10\\include\10.0.26100.0\\um\winnt.h(21229): error C2059: syntax error: '}'
C:\Program Files (x86)\Windows Kits\10\\include\10.0.26100.0\\um\winnt.h(21230): error C2059: syntax error: '}'
C:\Program Files (x86)\Windows Kits\10\\include\10.0.26100.0\\um\winnt.h(21231): error C2059: syntax error: '}'

https://dev.azure.com/githubsync/ffmpeg/_build/results?buildId=87858&view=logs


With GCC:

CC	tests/checkasm/vf_threshold.o
CC	tests/checkasm/videodsp.o
CC	tests/checkasm/vorbisdsp.o
CC	tests/checkasm/vp8dsp.o
CC	tests/checkasm/vp9dsp.o
CC	tests/checkasm/vvc_alf.o
CC	tests/checkasm/vvc_mc.o
CC	tests/checkasm/vvc_sao.o
X86ASM	tests/checkasm/x86/checkasm.o
CC	tests/api/api-threadmessage-test.o
In file included from ./libavcodec/vvc/ctu.h:31,
                 from tests/checkasm/vvc_sao.c:27:
./libavcodec/vvc/dec.h:36:33: error: expected identifier or '(' before numeric constant
   36 | #define CR                      2
      |                                 ^
make: *** [ffbuild/common.mak:81: tests/checkasm/vvc_sao.o] Error 1
CC	tests/api/api-flac-test.o
CC	tests/api/api-seek-test.o
HOSTCC	tests/audiogen.o
STRIP	tests/checkasm/x86/checkasm.o
HOSTCC	tests/videogen.o
CC	libavcodec/tests/apv.o
CC	libavcodec/tests/avpacket.o
CC	libavcodec/tests/bitstream_be.o

https://dev.azure.com/githubsync/ffmpeg/_build/results?buildId=87859&view=logs


Thanks,
sw
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless
  2025-05-15  1:05   ` softworkz .
@ 2025-05-15 12:49     ` Nuo Mi
  2025-05-15 20:29       ` softworkz .
  0 siblings, 1 reply; 10+ messages in thread
From: Nuo Mi @ 2025-05-15 12:49 UTC (permalink / raw)
  To: softworkz .; +Cc: FFmpeg development discussions and patches

On Thu, May 15, 2025 at 9:05 AM softworkz . <softworkz@hotmail.com> wrote:

>
>
> > -----Original Message-----
> > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of Nuo Mi
> > Sent: Samstag, 3. Mai 2025 11:13
> > To: ffmpeg-devel@ffmpeg.org
> > Cc: Nuo Mi <nuomi2021@gmail.com>
> > Subject: [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge,
> benchmarking
> > inside the width loop is meaningless
> >
> > ---
> >  tests/checkasm/hevc_sao.c | 14 +++++++-------
> >  1 file changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
> > index ad47423f10..f597eb5254 100644
> > --- a/tests/checkasm/hevc_sao.c
> > +++ b/tests/checkasm/hevc_sao.c
> > @@ -119,21 +119,21 @@ static void check_sao_edge(HEVCDSPContext *h, int
> > bit_depth)
> >          declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t
> > stride_dst,
> >                       const int16_t *sao_offset_val, int eo, int width,
> int
> > height);
> >
> > -        for (int w = prev_size + 4; w <= block_size; w += 4) {
> > -            randomize_buffers(src0, src1, BUF_SIZE);
> > -            randomize_buffers2(offset_val, OFFSET_LENGTH);
> > -            memset(dst0, 0, BUF_SIZE);
> > -            memset(dst1, 0, BUF_SIZE);
> > +        if (check_func(h->sao_edge_filter[i], "hevc_sao_edge_%d_%d",
> > block_size, bit_depth)) {
> > +            for (int w = prev_size + 4; w <= block_size; w += 4) {
> > +                randomize_buffers(src0, src1, BUF_SIZE);
> > +                randomize_buffers2(offset_val, OFFSET_LENGTH);
> > +                memset(dst0, 0, BUF_SIZE);
> > +                memset(dst1, 0, BUF_SIZE);
> >
> > -            if (check_func(h->sao_edge_filter[i], "hevc_sao_edge_%d_%d",
> > block_size, bit_depth)) {
> >                  call_ref(dst0, src0 + offset, stride, offset_val, eo, w,
> > block_size);
> >                  call_new(dst1, src1 + offset, stride, offset_val, eo, w,
> > block_size);
> >                  for (int j = 0; j < block_size; j++) {
> >                      if (memcmp(dst0 + j*stride, dst1 + j*stride,
> > w*SIZEOF_PIXEL))
> >                          fail();
> >                  }
> > -                bench_new(dst1, src1 + offset, stride, offset_val, eo,
> > block_size, block_size);
> >              }
> > +            bench_new(dst1, src1 + offset, stride, offset_val, eo,
> > block_size, block_size);
> >          }
> >      }
> >  }
> > --
>
> Hi Nuo,
>
> since you have applied this patch (or 7/7) today, both FATE builds on
> Windows
> (MSVC + GCC) are failing - for all submitted patches.
>
> https://patchwork.ffmpeg.org/project/ffmpeg
>
>
> Could you please take a look?
>

Your CI is great!👍
Should be fixed by
https://patchwork.ffmpeg.org/project/ffmpeg/patch/20250515124603.42691-1-nuomi2021@gmail.com/


>
>
>
>
> With MSVC:
>
> D:\a\1\s\libavcodec\get_bits.h(366): warning C4101: 're_cache':
> unreferenced local variable
> vvc_alf.c
> CC      tests/checkasm/vp9dsp.o
> vp9dsp.c
> D:\a\1\s\libavcodec\get_bits.h(366): warning C4101: 're_cache':
> unreferenced local variable
> STRIP   tests/checkasm/x86/checkasm.o
> skipping strip -x tests/checkasm/x86/checkasm.o
> CC      tests/checkasm/vvc_sao.o
> vvc_sao.c
> D:\a\1\s\libavcodec\get_bits.h(366): warning C4101: 're_cache':
> unreferenced local variable
> D:\a\1\s\libavcodec\get_bits.h(366): warning C4101: 're_cache':
> unreferenced local variable
> C:\Program Files (x86)\Windows
> Kits\10\\include\10.0.26100.0\\um\winnt.h(21227): error C2143: syntax
> error: missing ':' before 'constant'
> C:\Program Files (x86)\Windows
> Kits\10\\include\10.0.26100.0\\um\winnt.h(21227): error C2143: syntax
> error: missing ';' before ':'
> C:\Program Files (x86)\Windows
> Kits\10\\include\10.0.26100.0\\um\winnt.h(21227): error C2059: syntax
> error: ':'
> C:\Program Files (x86)\Windows
> Kits\10\\include\10.0.26100.0\\um\winnt.h(21228): error C2143: syntax
> error: missing '{' before ':'
> C:\Program Files (x86)\Windows
> Kits\10\\include\10.0.26100.0\\um\winnt.h(21228): error C2059: syntax
> error: ':'
> make: *** [ffbuild/common.mak:81: tests/checkasm/vvc_sao.o] Error 2
> C:\Program Files (x86)\Windows
> Kits\10\\include\10.0.26100.0\\um\winnt.h(21229): error C2059: syntax
> error: '}'
> C:\Program Files (x86)\Windows
> Kits\10\\include\10.0.26100.0\\um\winnt.h(21230): error C2059: syntax
> error: '}'
> C:\Program Files (x86)\Windows
> Kits\10\\include\10.0.26100.0\\um\winnt.h(21231): error C2059: syntax
> error: '}'
>
>
> https://dev.azure.com/githubsync/ffmpeg/_build/results?buildId=87858&view=logs
>
>
> With GCC:
>
> CC      tests/checkasm/vf_threshold.o
> CC      tests/checkasm/videodsp.o
> CC      tests/checkasm/vorbisdsp.o
> CC      tests/checkasm/vp8dsp.o
> CC      tests/checkasm/vp9dsp.o
> CC      tests/checkasm/vvc_alf.o
> CC      tests/checkasm/vvc_mc.o
> CC      tests/checkasm/vvc_sao.o
> X86ASM  tests/checkasm/x86/checkasm.o
> CC      tests/api/api-threadmessage-test.o
> In file included from ./libavcodec/vvc/ctu.h:31,
>                  from tests/checkasm/vvc_sao.c:27:
> ./libavcodec/vvc/dec.h:36:33: error: expected identifier or '(' before
> numeric constant
>    36 | #define CR                      2
>       |                                 ^
> make: *** [ffbuild/common.mak:81: tests/checkasm/vvc_sao.o] Error 1
> CC      tests/api/api-flac-test.o
> CC      tests/api/api-seek-test.o
> HOSTCC  tests/audiogen.o
> STRIP   tests/checkasm/x86/checkasm.o
> HOSTCC  tests/videogen.o
> CC      libavcodec/tests/apv.o
> CC      libavcodec/tests/avpacket.o
> CC      libavcodec/tests/bitstream_be.o
>
>
> https://dev.azure.com/githubsync/ffmpeg/_build/results?buildId=87859&view=logs
>
>
> Thanks,
> sw
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless
  2025-05-15 12:49     ` Nuo Mi
@ 2025-05-15 20:29       ` softworkz .
  0 siblings, 0 replies; 10+ messages in thread
From: softworkz . @ 2025-05-15 20:29 UTC (permalink / raw)
  To: Nuo Mi; +Cc: FFmpeg development discussions and patches



From: Nuo Mi <nuomi2021@gmail.com>
Sent: Donnerstag, 15. Mai 2025 14:49
To: softworkz . <softworkz@hotmail.com>
Cc: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless



On Thu, May 15, 2025 at 9:05 AM softworkz . <softworkz@hotmail.com<mailto:softworkz@hotmail.com>> wrote:


> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org<mailto:ffmpeg-devel-bounces@ffmpeg.org>> On Behalf Of Nuo Mi
> Sent: Samstag, 3. Mai 2025 11:13
> To: ffmpeg-devel@ffmpeg.org<mailto:ffmpeg-devel@ffmpeg.org>
> Cc: Nuo Mi <nuomi2021@gmail.com<mailto:nuomi2021@gmail.com>>
> Subject: [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking
> inside the width loop is meaningless
>
> ---
>  tests/checkasm/hevc_sao.c | 14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
> index ad47423f10..f597eb5254 100644
> --- a/tests/checkasm/hevc_sao.c
> +++ b/tests/checkasm/hevc_sao.c
> @@ -119,21 +119,21 @@ static void check_sao_edge(HEVCDSPContext *h, int
> bit_depth)
>          declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t
> stride_dst,
>                       const int16_t *sao_offset_val, int eo, int width, int
> height);
>
> -        for (int w = prev_size + 4; w <= block_size; w += 4) {
> -            randomize_buffers(src0, src1, BUF_SIZE);
> -            randomize_buffers2(offset_val, OFFSET_LENGTH);
> -            memset(dst0, 0, BUF_SIZE);
> -            memset(dst1, 0, BUF_SIZE);
> +        if (check_func(h->sao_edge_filter[i], "hevc_sao_edge_%d_%d",
> block_size, bit_depth)) {
> +            for (int w = prev_size + 4; w <= block_size; w += 4) {
> +                randomize_buffers(src0, src1, BUF_SIZE);
> +                randomize_buffers2(offset_val, OFFSET_LENGTH);
> +                memset(dst0, 0, BUF_SIZE);
> +                memset(dst1, 0, BUF_SIZE);
>
> -            if (check_func(h->sao_edge_filter[i], "hevc_sao_edge_%d_%d",
> block_size, bit_depth)) {
>                  call_ref(dst0, src0 + offset, stride, offset_val, eo, w,
> block_size);
>                  call_new(dst1, src1 + offset, stride, offset_val, eo, w,
> block_size);
>                  for (int j = 0; j < block_size; j++) {
>                      if (memcmp(dst0 + j*stride, dst1 + j*stride,
> w*SIZEOF_PIXEL))
>                          fail();
>                  }
> -                bench_new(dst1, src1 + offset, stride, offset_val, eo,
> block_size, block_size);
>              }
> +            bench_new(dst1, src1 + offset, stride, offset_val, eo,
> block_size, block_size);
>          }
>      }
>  }
> --

Hi Nuo,

since you have applied this patch (or 7/7) today, both FATE builds on Windows
(MSVC + GCC) are failing - for all submitted patches.

https://patchwork.ffmpeg.org/project/ffmpeg


Could you please take a look?

Your CI is great!👍

😊

Should be fixed by https://patchwork.ffmpeg.org/project/ffmpeg/patch/20250515124603.42691-1-nuomi2021@gmail.com/

Yup, that does the trick, thanks for the quick fix.

If you could apply it quickly (it’s only affecting tests anyway), then I can try to retrigger
the failed builds for the other patches, so that they get proper check results
on Patchwork.

Thanks
sw





_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2025-05-15 20:29 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-03  9:13 [FFmpeg-devel] [PATCH v2 1/7] x86/vvcdec: misc, reordered functions in dsp_init for improved readability Nuo Mi
2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 2/7] x86/hevcdec: sao, refact out h26x macros Nuo Mi
2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 3/7] x86/hevcdec: refact, remove duplicate code in HEVC_SAO_{BAND, EDGE}_FILTER Nuo Mi
2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 4/7] x86/vvcdec: sao, add avx2 support Nuo Mi
2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 5/7] checkasm: add vvc_sao Nuo Mi
2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 6/7] checkasm: hevc sao_edge, benchmarking inside the width loop is meaningless Nuo Mi
2025-05-15  1:05   ` softworkz .
2025-05-15 12:49     ` Nuo Mi
2025-05-15 20:29       ` softworkz .
2025-05-03  9:13 ` [FFmpeg-devel] [PATCH v2 7/7] checkasm: hevc sao, use checkasm_check_padded Nuo Mi

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git