Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width
@ 2024-05-23 12:27 James Almer
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
  To: ffmpeg-devel

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/vvc/dsp.h             |  2 +-
 libavcodec/vvc/inter.c           |  6 ++++--
 libavcodec/vvc/inter_template.c  |  6 +++++-
 libavcodec/x86/vvc/vvc_sad.asm   | 32 ++++++++++++++++++++++++++------
 libavcodec/x86/vvc/vvcdsp_init.c | 22 +++++++++++++++++-----
 tests/checkasm/vvc_mc.c          |  3 ++-
 6 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 1f14096c41..55c4c81f53 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -99,7 +99,7 @@ typedef struct VVCInterDSPContext {
 
     void (*apply_bdof)(uint8_t *dst, ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, int block_w, int block_h);
 
-    int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+    int (*sad[5])(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
     void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
         intptr_t mx, intptr_t my, int width);
 } VVCInterDSPContext;
diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c
index e1011b4fa1..0214e46634 100644
--- a/libavcodec/vvc/inter.c
+++ b/libavcodec/vvc/inter.c
@@ -740,6 +740,8 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
     const AVFrame *ref0, const AVFrame *ref1, const int x_off, const int y_off, const int block_w, const int block_h)
 {
     const VVCFrameContext *fc   = lc->fc;
+    static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
+    const int tab               = sad_tab[(FFALIGN(block_w, 8) >> 3) - 1];
     const int sr_range          = 2;
     const AVFrame *ref[]        = { ref0, ref1 };
     int16_t *tmp[]              = { lc->tmp, lc->tmp1 };
@@ -763,7 +765,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
         fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w);
     }
 
-    min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h);
+    min_sad = fc->vvcdsp.inter.sad[tab](tmp[L0], tmp[L1], dx, dy, block_w, block_h);
     min_sad -= min_sad >> 2;
     sad[dy][dx] = min_sad;
 
@@ -773,7 +775,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
         for (dy = 0; dy < SAD_ARRAY_SIZE; dy++) {
             for (dx = 0; dx < SAD_ARRAY_SIZE; dx++) {
                 if (dx != sr_range || dy != sr_range) {
-                    sad[dy][dx] = fc->vvcdsp.inter.sad(lc->tmp, lc->tmp1, dx, dy, block_w, block_h);
+                    sad[dy][dx] = fc->vvcdsp.inter.sad[tab](lc->tmp, lc->tmp1, dx, dy, block_w, block_h);
                     if (sad[dy][dx] < min_sad) {
                         min_sad = sad[dy][dx];
                         min_dx = dx;
diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c
index a8068f4ba8..34485321d3 100644
--- a/libavcodec/vvc/inter_template.c
+++ b/libavcodec/vvc/inter_template.c
@@ -626,7 +626,11 @@ static void FUNC(ff_vvc_inter_dsp_init)(VVCInterDSPContext *const inter)
     inter->apply_prof_uni_w     = FUNC(apply_prof_uni_w);
     inter->apply_bdof           = FUNC(apply_bdof);
     inter->prof_grad_filter     = FUNC(prof_grad_filter);
-    inter->sad                  = vvc_sad;
+    inter->sad[0]               =
+    inter->sad[1]               =
+    inter->sad[2]               =
+    inter->sad[3]               =
+    inter->sad[4]               = vvc_sad;
 }
 
 #undef FUNCS
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index b468d89ac2..a20818530f 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -51,7 +51,7 @@ SECTION .text
 
 INIT_YMM avx2
 
-cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
     movsxdifnidn    dxq, dxd
     movsxdifnidn    dyq, dyd
 
@@ -76,10 +76,6 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_
     pxor               m3, m3
     vpbroadcastd       m4, [pw_1]
 
-    cmp          block_wd, 16
-    jge    vvc_sad_16_128
-
-    vvc_sad_8:
         .loop_height:
         movu              xm0, [src1q]
         vinserti128        m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
@@ -100,7 +96,31 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_
         movd          eax, xm0
     RET
 
-    vvc_sad_16_128:
+cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+    movsxdifnidn    dxq, dxd
+    movsxdifnidn    dyq, dyd
+
+    sub             dxq, 2
+    sub             dyq, 2
+
+    mov             off1q, 2
+    mov             off2q, 2
+
+    add             off1q, dyq
+    sub             off2q, dyq
+
+    shl             off1q, 7
+    shl             off2q, 7
+
+    add             off1q, dxq
+    sub             off2q, dxq
+
+    lea             src1q, [src1q + off1q * 2 + 2 * 2]
+    lea             src2q, [src2q + off2q * 2 + 2 * 2]
+
+    pxor               m3, m3
+    vpbroadcastd       m4, [pw_1]
+
         sar      block_wd, 4
         .loop_height:
         mov         off1q, src1q
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 4b4a2aa937..bd60963432 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -312,8 +312,20 @@ ALF_FUNCS(16, 12, avx2)
     c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
 } while (0)
 
-int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
-#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
+#define SAD_PROTOTYPE(w, opt)                                        \
+int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
+                           int dx, int dy, int block_w, int block_h) \
+
+SAD_PROTOTYPE(8,   avx2);
+SAD_PROTOTYPE(16,  avx2);
+
+#define SAD_INIT(opt) do {                   \
+    c->inter.sad[0] = ff_vvc_sad_8_##opt;    \
+    c->inter.sad[1] =                        \
+    c->inter.sad[2] =                        \
+    c->inter.sad[3] =                        \
+    c->inter.sad[4] = ff_vvc_sad_16_##opt;   \
+} while (0)
 #endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -330,7 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
-            SAD_INIT();
+            SAD_INIT(avx2);
         }
         break;
     case 10:
@@ -342,7 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
-            SAD_INIT();
+            SAD_INIT(avx2);
         }
         break;
     case 12:
@@ -354,7 +366,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
-            SAD_INIT();
+            SAD_INIT(avx2);
         }
         break;
     default:
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 1e889e2cff..deae1014d2 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -327,6 +327,7 @@ static void check_avg(void)
 static void check_vvc_sad(void)
 {
     const int bit_depth = 10;
+    static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
     VVCDSPContext c;
     LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
     LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
@@ -341,7 +342,7 @@ static void check_vvc_sad(void)
         for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
             for(int offy = 0; offy <= 4; offy++) {
                 for(int offx = 0; offx <= 4; offx++) {
-                    if(check_func(c.inter.sad, "sad_%dx%d", w, h)) {
+                    if(check_func(c.inter.sad[sad_tab[(w >> 3) - 1]], "sad_%dx%d", w, h)) {
                         int result0;
                         int result1;
 
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16
  2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
@ 2024-05-23 12:27 ` James Almer
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions James Almer
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
  To: ffmpeg-devel

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/vvc/vvc_sad.asm | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index a20818530f..829dbce489 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -96,7 +96,7 @@ cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, ro
         movd          eax, xm0
     RET
 
-cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
     movsxdifnidn    dxq, dxd
     movsxdifnidn    dyq, dyd
 
@@ -121,26 +121,27 @@ cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, r
     pxor               m3, m3
     vpbroadcastd       m4, [pw_1]
 
-        sar      block_wd, 4
+        shl      block_wd, 1
+        add         src1q, block_wq
+        add         src2q, block_wq
+        neg      block_wq
+
+DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
         .loop_height:
-        mov         off1q, src1q
-        mov         off2q, src2q
-        mov      row_idxd, block_wd
+        mov      row_idxq, block_wq
 
         .loop_width:
-            movu               m0, [src1q]
-            movu               m1, [src2q]
+            movu               m0, [src1q+row_idxq]
+            movu               m1, [src2q+row_idxq]
             MIN_MAX_SAD        m1, m0, m2
             pmaddwd            m1, m4
             paddd              m3, m1
 
-            add             src1q, 32
-            add             src2q, 32
-            dec          row_idxd
-            jg        .loop_width
+            add          row_idxq, mmsize
+            jl        .loop_width
 
-        lea         src1q, [off1q + ROWS * MAX_PB_SIZE * 2]
-        lea         src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+        add         src1q, ROWS * MAX_PB_SIZE * 2
+        add         src2q, ROWS * MAX_PB_SIZE * 2
 
         sub      block_hd, 2
         jg   .loop_height
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions
  2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer
@ 2024-05-23 12:27 ` James Almer
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions James Almer
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
  To: ffmpeg-devel

And remove sad_8x8_avx2, as it's not faster than sad_8x8_sse4.

sad_8x8_c: 54.8
sad_8x8_sse4: 14.3
sad_16x16_c: 200.8
sad_16x16_sse4: 34.8
sad_16x16_avx2: 29.8
sad_32x32_c: 826.3
sad_32x32_sse4: 113.8
sad_32x32_avx2: 69.3
sad_64x64_c: 3679.8
sad_64x64_sse4: 392.8
sad_64x64_avx2: 257.3
sad_128x128_c: 12581.3
sad_128x128_sse4: 1560.8
sad_128x128_avx2: 1151.8

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/vvc/vvc_sad.asm   | 53 +++++++++++++++++++++-----------
 libavcodec/x86/vvc/vvcdsp_init.c | 42 +++++++++++++++++--------
 2 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index 829dbce489..26df25ec66 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -26,7 +26,7 @@
 
 SECTION_RODATA
 
-pw_1: times 2 dw 1
+cextern pw_1
 
 ; DMVR SAD is only calculated on even rows to reduce complexity
 SECTION .text
@@ -38,20 +38,21 @@ SECTION .text
 %endmacro
 
 %macro HORIZ_ADD 3  ; xm0, xm1, m1
+%if mmsize == 32
     vextracti128     %1, %3, q0001  ;        3        2      1          0
-    paddd            %1, %2         ; xm0 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
-    pshufd           %2, %1, q0032  ; xm1    -      -     (7 + 3)   (6 + 2)
+    paddd            %2, %1         ; xm1 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
+%endif
+    pshufd           %1, %2, q0032  ; xm0    -      -     (7 + 3)   (6 + 2)
     paddd            %1, %1, %2     ; xm0    _      _     (5 1 7 3) (4 0 6 2)
     pshufd           %2, %1, q0001  ; xm1    _      _     (5 1 7 3) (5 1 7 3)
     paddd            %1, %1, %2     ;                               (01234567)
 %endmacro
 
-%if ARCH_X86_64
-%if HAVE_AVX2_EXTERNAL
-
-INIT_YMM avx2
-
-cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+%macro VVC_SAD 1
+cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
+%if UNIX64 == 0
+    mov             block_hd, dword r5m
+%endif
     movsxdifnidn    dxq, dxd
     movsxdifnidn    dyq, dyd
 
@@ -74,29 +75,32 @@ cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, ro
     lea             src2q, [src2q + off2q * 2 + 2 * 2]
 
     pxor               m3, m3
+%if mmsize == 32
     vpbroadcastd       m4, [pw_1]
+%else
+    mova               m4, [pw_1]
+%endif
 
         .loop_height:
-        movu              xm0, [src1q]
-        vinserti128        m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
-        movu              xm1, [src2q]
-        vinserti128        m1, m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
-
+        movu               m0, [src1q]
+        movu               m1, [src2q]
         MIN_MAX_SAD        m1, m0, m2
         pmaddwd            m1, m4
         paddd              m3, m1
 
-        add         src1q, 2 * MAX_PB_SIZE * ROWS * 2
-        add         src2q, 2 * MAX_PB_SIZE * ROWS * 2
+        add         src1q, ROWS * MAX_PB_SIZE * 2
+        add         src2q, ROWS * MAX_PB_SIZE * 2
 
-        sub      block_hd, 4
+        sub      block_hd, 2
         jg   .loop_height
 
         HORIZ_ADD     xm0, xm3, m3
         movd          eax, xm0
     RET
+%endmacro
 
-cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
+%macro VVC_SAD_LOOP 1
+cglobal vvc_sad_%1, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
     movsxdifnidn    dxq, dxd
     movsxdifnidn    dyq, dyd
 
@@ -119,7 +123,11 @@ cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
     lea             src2q, [src2q + off2q * 2 + 2 * 2]
 
     pxor               m3, m3
+%if mmsize == 32
     vpbroadcastd       m4, [pw_1]
+%else
+    mova               m4, [pw_1]
+%endif
 
         shl      block_wd, 1
         add         src1q, block_wq
@@ -149,6 +157,15 @@ DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
         HORIZ_ADD     xm0, xm3, m3
         movd          eax, xm0
     RET
+%endmacro
 
+%if ARCH_X86_64
+INIT_XMM sse4
+VVC_SAD 8
+VVC_SAD_LOOP 16
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+VVC_SAD 16
+VVC_SAD_LOOP 32
 %endif
 %endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index bd60963432..cdf0e36b62 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -316,16 +316,10 @@ ALF_FUNCS(16, 12, avx2)
 int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
                            int dx, int dy, int block_w, int block_h) \
 
-SAD_PROTOTYPE(8,   avx2);
+SAD_PROTOTYPE(8,   sse4);
+SAD_PROTOTYPE(16,  sse4);
 SAD_PROTOTYPE(16,  avx2);
-
-#define SAD_INIT(opt) do {                   \
-    c->inter.sad[0] = ff_vvc_sad_8_##opt;    \
-    c->inter.sad[1] =                        \
-    c->inter.sad[2] =                        \
-    c->inter.sad[3] =                        \
-    c->inter.sad[4] = ff_vvc_sad_16_##opt;   \
-} while (0)
+SAD_PROTOTYPE(32,  avx2);
 #endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -337,36 +331,60 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
     case 8:
         if (EXTERNAL_SSE4(cpu_flags)) {
             MC_LINK_SSE4(8);
+            c->inter.sad[0] = ff_vvc_sad_8_sse4;
+            c->inter.sad[1] =
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
-            SAD_INIT(avx2);
+            c->inter.sad[1] = ff_vvc_sad_16_avx2;
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_32_avx2;
         }
         break;
     case 10:
         if (EXTERNAL_SSE4(cpu_flags)) {
             MC_LINK_SSE4(10);
+            c->inter.sad[0] = ff_vvc_sad_8_sse4;
+            c->inter.sad[1] =
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
             ALF_INIT(10);
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
-            SAD_INIT(avx2);
+            c->inter.sad[1] = ff_vvc_sad_16_avx2;
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_32_avx2;
         }
         break;
     case 12:
         if (EXTERNAL_SSE4(cpu_flags)) {
             MC_LINK_SSE4(12);
+            c->inter.sad[0] = ff_vvc_sad_8_sse4;
+            c->inter.sad[1] =
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
             ALF_INIT(12);
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
-            SAD_INIT(avx2);
+            c->inter.sad[1] = ff_vvc_sad_16_avx2;
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_32_avx2;
         }
         break;
     default:
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions
  2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions James Almer
@ 2024-05-23 12:27 ` James Almer
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes James Almer
  2024-05-23 12:35 ` [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width Andreas Rheinhardt
  4 siblings, 0 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
  To: ffmpeg-devel

This way they can be assembled on x86_32 targets.

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/vvc/vvc_sad.asm   | 22 ++++++++++------------
 libavcodec/x86/vvc/vvcdsp_init.c | 16 +++++++++++++---
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index 26df25ec66..9881b1180d 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -49,7 +49,7 @@ SECTION .text
 %endmacro
 
 %macro VVC_SAD 1
-cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
+cglobal vvc_sad_%1, 4, 6, 5, src1, src2, dx, dy, off, block_h
 %if UNIX64 == 0
     mov             block_hd, dword r5m
 %endif
@@ -59,12 +59,12 @@ cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
     sub             dxq, 2
     sub             dyq, 2
 
-    mov             off1q, 2
-    mov             off2q, 2
+    mov             offq, 2
 
-    add             off1q, dyq
-    sub             off2q, dyq
+    sub             offq, dyq
+    add             dyq, 2
 
+DEFINE_ARGS src1, src2, dx, off1, off2, block_h
     shl             off1q, 7
     shl             off2q, 7
 
@@ -100,19 +100,19 @@ cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
 %endmacro
 
 %macro VVC_SAD_LOOP 1
-cglobal vvc_sad_%1, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
+cglobal vvc_sad_%1, 6, 7, 5, src1, src2, dx, dy, block_w, block_h, off
     movsxdifnidn    dxq, dxd
     movsxdifnidn    dyq, dyd
 
     sub             dxq, 2
     sub             dyq, 2
 
-    mov             off1q, 2
-    mov             off2q, 2
+    mov             offq, 2
 
-    add             off1q, dyq
-    sub             off2q, dyq
+    sub             offq, dyq
+    add             dyq, 2
 
+DEFINE_ARGS src1, src2, dx, off1, block_w, block_h, off2
     shl             off1q, 7
     shl             off2q, 7
 
@@ -159,7 +159,6 @@ DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
     RET
 %endmacro
 
-%if ARCH_X86_64
 INIT_XMM sse4
 VVC_SAD 8
 VVC_SAD_LOOP 16
@@ -168,4 +167,3 @@ INIT_YMM avx2
 VVC_SAD 16
 VVC_SAD_LOOP 32
 %endif
-%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index cdf0e36b62..c0bd145191 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -311,6 +311,7 @@ ALF_FUNCS(16, 12, avx2)
     c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;    \
     c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
 } while (0)
+#endif
 
 #define SAD_PROTOTYPE(w, opt)                                        \
 int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
@@ -320,17 +321,17 @@ SAD_PROTOTYPE(8,   sse4);
 SAD_PROTOTYPE(16,  sse4);
 SAD_PROTOTYPE(16,  avx2);
 SAD_PROTOTYPE(32,  avx2);
-#endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
 {
-#if ARCH_X86_64
     const int cpu_flags = av_get_cpu_flags();
 
     switch (bd) {
     case 8:
         if (EXTERNAL_SSE4(cpu_flags)) {
+#if ARCH_X86_64
             MC_LINK_SSE4(8);
+#endif
             c->inter.sad[0] = ff_vvc_sad_8_sse4;
             c->inter.sad[1] =
             c->inter.sad[2] =
@@ -338,9 +339,11 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if ARCH_X86_64
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
+#endif
             c->inter.sad[1] = ff_vvc_sad_16_avx2;
             c->inter.sad[2] =
             c->inter.sad[3] =
@@ -349,7 +352,9 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
         break;
     case 10:
         if (EXTERNAL_SSE4(cpu_flags)) {
+#if ARCH_X86_64
             MC_LINK_SSE4(10);
+#endif
             c->inter.sad[0] = ff_vvc_sad_8_sse4;
             c->inter.sad[1] =
             c->inter.sad[2] =
@@ -357,10 +362,12 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if ARCH_X86_64
             ALF_INIT(10);
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
+#endif
             c->inter.sad[1] = ff_vvc_sad_16_avx2;
             c->inter.sad[2] =
             c->inter.sad[3] =
@@ -369,7 +376,9 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
         break;
     case 12:
         if (EXTERNAL_SSE4(cpu_flags)) {
+#if ARCH_X86_64
             MC_LINK_SSE4(12);
+#endif
             c->inter.sad[0] = ff_vvc_sad_8_sse4;
             c->inter.sad[1] =
             c->inter.sad[2] =
@@ -377,10 +386,12 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if ARCH_X86_64
             ALF_INIT(12);
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
+#endif
             c->inter.sad[1] = ff_vvc_sad_16_avx2;
             c->inter.sad[2] =
             c->inter.sad[3] =
@@ -390,5 +401,4 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
     default:
         break;
     }
-#endif
 }
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes
  2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
                   ` (2 preceding siblings ...)
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions James Almer
@ 2024-05-23 12:27 ` James Almer
  2024-05-23 12:35 ` [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width Andreas Rheinhardt
  4 siblings, 0 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
  To: ffmpeg-devel

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/vvc/vvc_sad.asm | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index 9881b1180d..14f7ce230e 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -81,7 +81,7 @@ DEFINE_ARGS src1, src2, dx, off1, off2, block_h
     mova               m4, [pw_1]
 %endif
 
-        .loop_height:
+    .loop_height:
         movu               m0, [src1q]
         movu               m1, [src2q]
         MIN_MAX_SAD        m1, m0, m2
@@ -94,8 +94,8 @@ DEFINE_ARGS src1, src2, dx, off1, off2, block_h
         sub      block_hd, 2
         jg   .loop_height
 
-        HORIZ_ADD     xm0, xm3, m3
-        movd          eax, xm0
+    HORIZ_ADD     xm0, xm3, m3
+    movd          eax, xm0
     RET
 %endmacro
 
@@ -129,13 +129,13 @@ DEFINE_ARGS src1, src2, dx, off1, block_w, block_h, off2
     mova               m4, [pw_1]
 %endif
 
-        shl      block_wd, 1
-        add         src1q, block_wq
-        add         src2q, block_wq
-        neg      block_wq
+    shl      block_wd, 1
+    add         src1q, block_wq
+    add         src2q, block_wq
+    neg      block_wq
 
 DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
-        .loop_height:
+    .loop_height:
         mov      row_idxq, block_wq
 
         .loop_width:
@@ -154,8 +154,8 @@ DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
         sub      block_hd, 2
         jg   .loop_height
 
-        HORIZ_ADD     xm0, xm3, m3
-        movd          eax, xm0
+    HORIZ_ADD     xm0, xm3, m3
+    movd          eax, xm0
     RET
 %endmacro
 
-- 
2.45.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width
  2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
                   ` (3 preceding siblings ...)
  2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes James Almer
@ 2024-05-23 12:35 ` Andreas Rheinhardt
  4 siblings, 0 replies; 6+ messages in thread
From: Andreas Rheinhardt @ 2024-05-23 12:35 UTC (permalink / raw)
  To: ffmpeg-devel

James Almer:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---

The commit message should explain what the advantage of this is.
In particular, what is the advantage of selecting an appropriate function
in the generic code (even when these functions all turn out to be the
same, as is the case for the C version) over jumping within the function
based upon blocksize?

>  libavcodec/vvc/dsp.h             |  2 +-
>  libavcodec/vvc/inter.c           |  6 ++++--
>  libavcodec/vvc/inter_template.c  |  6 +++++-
>  libavcodec/x86/vvc/vvc_sad.asm   | 32 ++++++++++++++++++++++++++------
>  libavcodec/x86/vvc/vvcdsp_init.c | 22 +++++++++++++++++-----
>  tests/checkasm/vvc_mc.c          |  3 ++-
>  6 files changed, 55 insertions(+), 16 deletions(-)
> 
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 1f14096c41..55c4c81f53 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -99,7 +99,7 @@ typedef struct VVCInterDSPContext {
>  
>      void (*apply_bdof)(uint8_t *dst, ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, int block_w, int block_h);
>  
> -    int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
> +    int (*sad[5])(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
>      void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
>          intptr_t mx, intptr_t my, int width);
>  } VVCInterDSPContext;
> diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c
> index e1011b4fa1..0214e46634 100644
> --- a/libavcodec/vvc/inter.c
> +++ b/libavcodec/vvc/inter.c
> @@ -740,6 +740,8 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
>      const AVFrame *ref0, const AVFrame *ref1, const int x_off, const int y_off, const int block_w, const int block_h)
>  {
>      const VVCFrameContext *fc   = lc->fc;
> +    static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
> +    const int tab               = sad_tab[(FFALIGN(block_w, 8) >> 3) - 1];
>      const int sr_range          = 2;
>      const AVFrame *ref[]        = { ref0, ref1 };
>      int16_t *tmp[]              = { lc->tmp, lc->tmp1 };
> @@ -763,7 +765,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
>          fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w);
>      }
>  
> -    min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h);
> +    min_sad = fc->vvcdsp.inter.sad[tab](tmp[L0], tmp[L1], dx, dy, block_w, block_h);
>      min_sad -= min_sad >> 2;
>      sad[dy][dx] = min_sad;
>  
> @@ -773,7 +775,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
>          for (dy = 0; dy < SAD_ARRAY_SIZE; dy++) {
>              for (dx = 0; dx < SAD_ARRAY_SIZE; dx++) {
>                  if (dx != sr_range || dy != sr_range) {
> -                    sad[dy][dx] = fc->vvcdsp.inter.sad(lc->tmp, lc->tmp1, dx, dy, block_w, block_h);
> +                    sad[dy][dx] = fc->vvcdsp.inter.sad[tab](lc->tmp, lc->tmp1, dx, dy, block_w, block_h);
>                      if (sad[dy][dx] < min_sad) {
>                          min_sad = sad[dy][dx];
>                          min_dx = dx;
> diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c
> index a8068f4ba8..34485321d3 100644
> --- a/libavcodec/vvc/inter_template.c
> +++ b/libavcodec/vvc/inter_template.c
> @@ -626,7 +626,11 @@ static void FUNC(ff_vvc_inter_dsp_init)(VVCInterDSPContext *const inter)
>      inter->apply_prof_uni_w     = FUNC(apply_prof_uni_w);
>      inter->apply_bdof           = FUNC(apply_bdof);
>      inter->prof_grad_filter     = FUNC(prof_grad_filter);
> -    inter->sad                  = vvc_sad;
> +    inter->sad[0]               =
> +    inter->sad[1]               =
> +    inter->sad[2]               =
> +    inter->sad[3]               =
> +    inter->sad[4]               = vvc_sad;
>  }
>  
>  #undef FUNCS
> diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
> index b468d89ac2..a20818530f 100644
> --- a/libavcodec/x86/vvc/vvc_sad.asm
> +++ b/libavcodec/x86/vvc/vvc_sad.asm
> @@ -51,7 +51,7 @@ SECTION .text
>  
>  INIT_YMM avx2
>  
> -cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
> +cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
>      movsxdifnidn    dxq, dxd
>      movsxdifnidn    dyq, dyd
>  
> @@ -76,10 +76,6 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_
>      pxor               m3, m3
>      vpbroadcastd       m4, [pw_1]
>  
> -    cmp          block_wd, 16
> -    jge    vvc_sad_16_128
> -
> -    vvc_sad_8:
>          .loop_height:
>          movu              xm0, [src1q]
>          vinserti128        m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
> @@ -100,7 +96,31 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_
>          movd          eax, xm0
>      RET
>  
> -    vvc_sad_16_128:
> +cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
> +    movsxdifnidn    dxq, dxd
> +    movsxdifnidn    dyq, dyd
> +
> +    sub             dxq, 2
> +    sub             dyq, 2
> +
> +    mov             off1q, 2
> +    mov             off2q, 2
> +
> +    add             off1q, dyq
> +    sub             off2q, dyq
> +
> +    shl             off1q, 7
> +    shl             off2q, 7
> +
> +    add             off1q, dxq
> +    sub             off2q, dxq
> +
> +    lea             src1q, [src1q + off1q * 2 + 2 * 2]
> +    lea             src2q, [src2q + off2q * 2 + 2 * 2]
> +
> +    pxor               m3, m3
> +    vpbroadcastd       m4, [pw_1]
> +
>          sar      block_wd, 4
>          .loop_height:
>          mov         off1q, src1q
> diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
> index 4b4a2aa937..bd60963432 100644
> --- a/libavcodec/x86/vvc/vvcdsp_init.c
> +++ b/libavcodec/x86/vvc/vvcdsp_init.c
> @@ -312,8 +312,20 @@ ALF_FUNCS(16, 12, avx2)
>      c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
>  } while (0)
>  
> -int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
> -#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
> +#define SAD_PROTOTYPE(w, opt)                                        \
> +int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
> +                           int dx, int dy, int block_w, int block_h) \
> +
> +SAD_PROTOTYPE(8,   avx2);
> +SAD_PROTOTYPE(16,  avx2);
> +
> +#define SAD_INIT(opt) do {                   \
> +    c->inter.sad[0] = ff_vvc_sad_8_##opt;    \
> +    c->inter.sad[1] =                        \
> +    c->inter.sad[2] =                        \
> +    c->inter.sad[3] =                        \
> +    c->inter.sad[4] = ff_vvc_sad_16_##opt;   \
> +} while (0)
>  #endif
>  
>  void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
> @@ -330,7 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
>              ALF_INIT(8);
>              AVG_INIT(8, avx2);
>              MC_LINKS_AVX2(8);
> -            SAD_INIT();
> +            SAD_INIT(avx2);
>          }
>          break;
>      case 10:
> @@ -342,7 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
>              AVG_INIT(10, avx2);
>              MC_LINKS_AVX2(10);
>              MC_LINKS_16BPC_AVX2(10);
> -            SAD_INIT();
> +            SAD_INIT(avx2);
>          }
>          break;
>      case 12:
> @@ -354,7 +366,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
>              AVG_INIT(12, avx2);
>              MC_LINKS_AVX2(12);
>              MC_LINKS_16BPC_AVX2(12);
> -            SAD_INIT();
> +            SAD_INIT(avx2);
>          }
>          break;
>      default:
> diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
> index 1e889e2cff..deae1014d2 100644
> --- a/tests/checkasm/vvc_mc.c
> +++ b/tests/checkasm/vvc_mc.c
> @@ -327,6 +327,7 @@ static void check_avg(void)
>  static void check_vvc_sad(void)
>  {
>      const int bit_depth = 10;
> +    static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
>      VVCDSPContext c;
>      LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
>      LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
> @@ -341,7 +342,7 @@ static void check_vvc_sad(void)
>          for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
>              for(int offy = 0; offy <= 4; offy++) {
>                  for(int offx = 0; offx <= 4; offx++) {
> -                    if(check_func(c.inter.sad, "sad_%dx%d", w, h)) {
> +                    if(check_func(c.inter.sad[sad_tab[(w >> 3) - 1]], "sad_%dx%d", w, h)) {
>                          int result0;
>                          int result1;
>  

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-05-23 12:36 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes James Almer
2024-05-23 12:35 ` [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width Andreas Rheinhardt

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git