Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
@ 2024-05-20  0:42 Stone Chen
  2024-05-20  0:42 ` [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Stone Chen @ 2024-05-20  0:42 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Stone Chen

Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bitdepths, but the values passed to the function are always 16bit (even if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.

Benchmarks ( AMD 7940HS )
Before:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
NovosobornayaSquare_1920x1080.bin | 197.3 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |

After:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
NovosobornayaSquare_1920x1080.bin | 204.0 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
---
 libavcodec/x86/vvc/Makefile      |   3 +-
 libavcodec/x86/vvc/vvc_sad.asm   | 138 +++++++++++++++++++++++++++++++
 libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/x86/vvc/vvc_sad.asm

diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER)             += x86/vvc/vvcdsp_init.o \
                                           x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)      += x86/vvc/vvc_alf.o      \
                                           x86/vvc/vvc_mc.o       \
-                                          x86/h26x/h2656_inter.o
+                                          x86/vvc/vvc_sad.o      \
+                                          x86/h26x/h2656_inter.o 
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 0000000000..58a24635d2
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,138 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+%define MAX_PB_SIZE 128
+%define ROWS 2    
+
+SECTION_RODATA
+
+pw_1: dw 1
+
+; DMVR SAD is only calculated on even rows to reduce complexity
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; 
+    pminuw           %3, %2, %1
+    pmaxuw           %1, %2, %1
+    psubusw          %1, %1, %3
+%endmacro
+
+%macro HORIZ_ADD 3  ; xm0, xm1, m1
+    vextracti128      %1, %3, q0001  ;        3        2      1          0
+    paddd            %1, %2         ; xm0 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
+    pshufd           %2, %1, q0032  ; xm1    -      -     (7 + 3)   (6 + 2)
+    paddd            %1, %1, %2     ; xm0    _      _     (5 1 7 3) (4 0 6 2)
+    pshufd           %2, %1, q0001  ; xm1    _      _     (5 1 7 3) (5 1 7 3)
+    paddd            %1, %1, %2     ;                               (01234567)
+%endmacro
+
+%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
+    sub             %3, 2
+    sub             %4, 2
+
+    mov             %5, 2
+    mov             %6, 2
+
+    add             %5, %4   
+    sub             %6, %4
+
+    imul            %5, 128
+    imul            %6, 128
+
+    add             %5, 2
+    add             %6, 2
+    
+    add             %5, %3
+    sub             %6, %3
+
+    lea             %1, [%1 + %5 * 2]
+    lea             %2, [%2 + %6 * 2]
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+
+cglobal vvc_sad, 6, 11, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx, dx2, dy2
+    movsxd           dx2q, dxd
+    movsxd           dy2q, dyd
+    INIT_OFFSET     src1q, src2q, dx2q, dy2q, off1q, off2q
+    pxor               m3, m3
+    vpbroadcastw       m4, [pw_1]
+
+    cmp          block_wd, 16
+    jge    vvc_sad_16_128
+
+    vvc_sad_8:
+        .loop_height:
+        movu              xm0, [src1q]
+        vinserti128        m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
+        movu              xm1, [src2q]
+        vinserti128        m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
+
+        MIN_MAX_SAD        m1, m0, m2
+        pmaddwd            m1, m4
+        paddd              m3, m1
+
+        add         src1q, 2 * MAX_PB_SIZE * ROWS * 2 
+        add         src2q, 2 * MAX_PB_SIZE * ROWS * 2
+
+        sub      block_hd, 4
+        jg   .loop_height
+
+        HORIZ_ADD     xm0, xm3, m3
+        movd          eax, xm0
+    RET
+
+    vvc_sad_16_128:
+        .loop_height:
+        mov         off1q, src1q
+        mov         off2q, src2q
+        mov      row_idxd, block_wd
+        sar      row_idxd, 4
+
+        .loop_width:
+            movu               m0, [src1q]
+            movu               m1, [src2q]
+            MIN_MAX_SAD        m1, m0, m2
+            pmaddwd            m1, m4
+            paddd              m3, m1
+
+            add             src1q, 32
+            add             src2q, 32
+            dec          row_idxd
+            jg        .loop_width
+
+        lea         src1q, [off1q + ROWS * MAX_PB_SIZE * 2] 
+        lea         src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+
+        sub      block_hd, 2
+        jg   .loop_height
+
+        HORIZ_ADD     xm0, xm3, m3
+        movd          eax, xm0
+    RET
+
+%endif
+%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 0e68971b2c..4b4a2aa937 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -311,6 +311,9 @@ ALF_FUNCS(16, 12, avx2)
     c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;    \
     c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
 } while (0)
+
+int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 #endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -327,6 +330,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
+            SAD_INIT();
         }
         break;
     case 10:
@@ -338,6 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
+            SAD_INIT();
         }
         break;
     case 12:
@@ -349,6 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
+            SAD_INIT();
         }
         break;
     default:
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c
  2024-05-20  0:42 [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
@ 2024-05-20  0:42 ` Stone Chen
  2024-05-21  5:12   ` Rémi Denis-Courmont
  2024-05-21 14:35   ` Ronald S. Bultje
  2024-05-20 11:23 ` [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Ronald S. Bultje
  2024-05-20 15:52 ` Ronald S. Bultje
  2 siblings, 2 replies; 11+ messages in thread
From: Stone Chen @ 2024-05-20  0:42 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Stone Chen

Adds checkasm for DMVR SAD AVX2 implementation.

Benchmarks ( AMD 7940HS )
vvc_sad_8x8_c: 70.0
vvc_sad_8x8_avx2: 10.0
vvc_sad_16x16_c: 280.0
vvc_sad_16x16_avx2: 20.0
vvc_sad_32x32_c: 1020.0
vvc_sad_32x32_avx2: 70.0
vvc_sad_64x64_c: 3560.0
vvc_sad_64x64_avx2: 270.0
vvc_sad_128x128_c: 13760.0
vvc_sad_128x128_avx2: 1070.0
---
 tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 97f57cb401..e251400bfc 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -322,8 +322,46 @@ static void check_avg(void)
     report("avg");
 }
 
+static void check_vvc_sad(void)
+{
+    const int bit_depth = 10;
+    VVCDSPContext c;
+    LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+    LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+    declare_func(int, const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+
+    ff_vvc_dsp_init(&c, bit_depth);
+    memset(src0, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+    memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
+
+    randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 2);
+     for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
+        for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
+            for(int offy = 0; offy <= 4; offy++) {
+                for(int offx = 0; offx <= 4; offx++) {
+                    if(check_func(c.inter.sad, "vvc_sad_%dx%d", w, h)) {
+                        int result0;
+                        int result1;
+
+                        result0 =  call_ref(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+                        result1 =  call_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+
+                        if (result1 != result0)
+                            fail();
+                        if(w == h && offx == 0 && offy == 0)
+                            bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+                    }
+                }
+            }
+        }
+     }
+
+    report("check_vvc_sad");
+}
+
 void checkasm_check_vvc_mc(void)
 {
+    check_vvc_sad();
     check_put_vvc_luma();
     check_put_vvc_luma_uni();
     check_put_vvc_chroma();
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
  2024-05-20  0:42 [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
  2024-05-20  0:42 ` [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
@ 2024-05-20 11:23 ` Ronald S. Bultje
  2024-05-20 15:52 ` Ronald S. Bultje
  2 siblings, 0 replies; 11+ messages in thread
From: Ronald S. Bultje @ 2024-05-20 11:23 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Stone Chen

Hi,

This is mostly good; the following are tiny nitpicks.

On Sun, May 19, 2024 at 8:46 PM Stone Chen <chen.stonechen@gmail.com> wrote:

> +%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
>

The macro is only used once, so you could inline it in the calling function.

>
> +    imul            %5, 128
> +    imul            %6, 128
>

I believe shl is typically preferred over imul for powers of two.


> +    add             %5, 2
> +    add             %6, 2
>

And these can be integrated as a constant offset in the lea below (lea %1,
[%1 + %5 * 2 + 2 * 2], same for %2).


> +    add             %5, %3
> +    sub             %6, %3
> +
> +    lea             %1, [%1 + %5 * 2]
> +    lea             %2, [%2 + %6 * 2]

[..]

> +cglobal vvc_sad, 6, 11, 5, src1, src2, dx, dy, block_w, block_h, off1,
> off2, row_idx, dx2, dy2
> +    movsxd           dx2q, dxd
> +    movsxd           dy2q, dyd
>

If you change the argument type from int to intptr_t, this is not necessary
anymore.


> +    vvc_sad_16_128:
> +        .loop_height:
> +        mov         off1q, src1q
> +        mov         off2q, src2q
> +        mov      row_idxd, block_wd
> +        sar      row_idxd, 4
>

You could right-shift block_wd by 4 outside the loop (before .loop_height).

Ronald
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
  2024-05-20  0:42 [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
  2024-05-20  0:42 ` [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
  2024-05-20 11:23 ` [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Ronald S. Bultje
@ 2024-05-20 15:52 ` Ronald S. Bultje
  2024-05-22  0:05   ` Stone Chen
  2 siblings, 1 reply; 11+ messages in thread
From: Ronald S. Bultje @ 2024-05-20 15:52 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Stone Chen

Hi,

one more, I forgot.

On Sun, May 19, 2024 at 8:46 PM Stone Chen <chen.stonechen@gmail.com> wrote:

> +pw_1: dw 1
>
[..]

> +    vpbroadcastw       m4, [pw_1]
>

We typically suggest using vpbroadcastd, not w (and then pw_1: times 2 dw
1). Agner shows that on e.g. Haswell, the former (d) is 1 uop with 5
cycles latency, whereas the latter (w) is 3 uops with 7 cycles latency; more
generally, d is faster than w.

Ronald
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c
  2024-05-20  0:42 ` [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
@ 2024-05-21  5:12   ` Rémi Denis-Courmont
  2024-05-21  6:37     ` Martin Storsjö
  2024-05-21 14:35   ` Ronald S. Bultje
  1 sibling, 1 reply; 11+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-21  5:12 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Hi,

Le 20 mai 2024 03:42:03 GMT+03:00, Stone Chen <chen.stonechen@gmail.com> a écrit :
>Adds checkasm for DMVR SAD AVX2 implementation.
>
>Benchmarks ( AMD 7940HS )
>vvc_sad_8x8_c: 70.0
>vvc_sad_8x8_avx2: 10.0
>vvc_sad_16x16_c: 280.0
>vvc_sad_16x16_avx2: 20.0
>vvc_sad_32x32_c: 1020.0
>vvc_sad_32x32_avx2: 70.0
>vvc_sad_64x64_c: 3560.0
>vvc_sad_64x64_avx2: 270.0
>vvc_sad_128x128_c: 13760.0
>vvc_sad_128x128_avx2: 1070.0
>---
> tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
> 1 file changed, 38 insertions(+)

VVC benchmarks have increased checkasm runtime by at least an order of magnitude. It's become so prohibitively slow that I could not even get to the end.

This is not an acceptable situation and impedes non-VVC assembler work.

Please fix this before you add any new VVC tests. In the meantime:

-1 / Nack on all VVC checkasm additions on my behalf.


>diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
>index 97f57cb401..e251400bfc 100644
>--- a/tests/checkasm/vvc_mc.c
>+++ b/tests/checkasm/vvc_mc.c
>@@ -322,8 +322,46 @@ static void check_avg(void)
>     report("avg");
> }
> 
>+static void check_vvc_sad(void)
>+{
>+    const int bit_depth = 10;
>+    VVCDSPContext c;
>+    LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
>+    LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
>+    declare_func(int, const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
>+
>+    ff_vvc_dsp_init(&c, bit_depth);
>+    memset(src0, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
>+    memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4);
>+
>+    randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 2);
>+     for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
>+        for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
>+            for(int offy = 0; offy <= 4; offy++) {
>+                for(int offx = 0; offx <= 4; offx++) {
>+                    if(check_func(c.inter.sad, "vvc_sad_%dx%d", w, h)) {
>+                        int result0;
>+                        int result1;
>+
>+                        result0 =  call_ref(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
>+                        result1 =  call_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
>+
>+                        if (result1 != result0)
>+                            fail();
>+                        if(w == h && offx == 0 && offy == 0)
>+                            bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
>+                    }
>+                }
>+            }
>+        }
>+     }
>+
>+    report("check_vvc_sad");
>+}
>+
> void checkasm_check_vvc_mc(void)
> {
>+    check_vvc_sad();
>     check_put_vvc_luma();
>     check_put_vvc_luma_uni();
>     check_put_vvc_chroma();
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c
  2024-05-21  5:12   ` Rémi Denis-Courmont
@ 2024-05-21  6:37     ` Martin Storsjö
  2024-05-21  8:47       ` Rémi Denis-Courmont
  0 siblings, 1 reply; 11+ messages in thread
From: Martin Storsjö @ 2024-05-21  6:37 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Tue, 21 May 2024, Rémi Denis-Courmont wrote:

> Hi,
>
> Le 20 mai 2024 03:42:03 GMT+03:00, Stone Chen <chen.stonechen@gmail.com> a écrit :
>> Adds checkasm for DMVR SAD AVX2 implementation.
>>
>> Benchmarks ( AMD 7940HS )
>> vvc_sad_8x8_c: 70.0
>> vvc_sad_8x8_avx2: 10.0
>> vvc_sad_16x16_c: 280.0
>> vvc_sad_16x16_avx2: 20.0
>> vvc_sad_32x32_c: 1020.0
>> vvc_sad_32x32_avx2: 70.0
>> vvc_sad_64x64_c: 3560.0
>> vvc_sad_64x64_avx2: 270.0
>> vvc_sad_128x128_c: 13760.0
>> vvc_sad_128x128_avx2: 1070.0
>> ---
>> tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 38 insertions(+)
>
> VVC benchmarks have increased checksam runtime by at least an order of 
> magnitude. It's become so prohibitively slow that I could not even get 
> to the end.
>
> This is not an acceptable situation and impedes non-VVC assembler work

I don't quite understand; whenever benchmarking anything in checkasm, I 
would always run e.g. "checkasm --test=ac3dsp 
--bench=ac3_sum_square_bufferfly_float", limiting the total running of 
tests to a specific module, and only benchmarking a subset of the run 
functions. (The --bench parameter specifies a prefix; only functions 
matching that prefix get benchmarked.)

Without limiting the scope with a --test parameter, checkasm benchmarking 
has always been prohibitively slow for me - so I don't think there's 
anything new here? If you were lucky enough to be able to do a full run of 
checkasm with benchmarks of all functions before, that sounds like an 
exception to me, not a reason to limit adding new tests?

That said I'm not familiar with the VVC tests in checkasm, perhaps they 
benchmark things excessively. But I don't see how that would impede work 
on other DSP functions in any way?

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c
  2024-05-21  6:37     ` Martin Storsjö
@ 2024-05-21  8:47       ` Rémi Denis-Courmont
  2024-05-21 10:12         ` Martin Storsjö
  0 siblings, 1 reply; 11+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-21  8:47 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



Le 21 mai 2024 09:37:18 GMT+03:00, "Martin Storsjö" <martin@martin.st> a écrit :
>On Tue, 21 May 2024, Rémi Denis-Courmont wrote:
>
>> Hi,
>> 
>> Le 20 mai 2024 03:42:03 GMT+03:00, Stone Chen <chen.stonechen@gmail.com> a écrit :
>>> Adds checkasm for DMVR SAD AVX2 implementation.
>>> 
>>> Benchmarks ( AMD 7940HS )
>>> vvc_sad_8x8_c: 70.0
>>> vvc_sad_8x8_avx2: 10.0
>>> vvc_sad_16x16_c: 280.0
>>> vvc_sad_16x16_avx2: 20.0
>>> vvc_sad_32x32_c: 1020.0
>>> vvc_sad_32x32_avx2: 70.0
>>> vvc_sad_64x64_c: 3560.0
>>> vvc_sad_64x64_avx2: 270.0
>>> vvc_sad_128x128_c: 13760.0
>>> vvc_sad_128x128_avx2: 1070.0
>>> ---
>>> tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 38 insertions(+)
>> 
>> VVC benchmarks have increased checksam runtime by at least an order of magnitude. It's become so prohibitively slow that I could not even get to the end.
>> 
>> This is not an acceptable situation and impedes non-VVC assembler work
>
>I don't quite understand; whenever benchmarking anything in checkasm, I would always run e.g. "checkasm --test=ac3dsp --bench=ac3_sum_square_bufferfly_float", limiting the total running of tests to a specific module, and only benchmarking a subset of the run functions. (The --bench parameter specifies a prefix; only functions matching that prefix gets benchmarked.)

Sure that's how you do it when you're working on a specific new optimisation. Now we're trying to compare 128-bit and 256-bit vectors for *all* existing functions to see which ones need to be reworked.

That used to work (in 30 minutes on K230, 5 minutes on Zen 2, IIRC). Now it's effectively broken, and that's not acceptable.

>
>Without limiting the scope with a --test parameter, checkasm benchmarking has always been prohibitively slow for me - so I don't think there's anything new here?

As said, it seems to be literally an order of magnitude slower than before if not worse.

>That said I'm not familiar with the VVC tests in checkasm, perhaps they benchmark things excessively. But I don't see how that would impede work on other DSP functions in any way?

James also complained about the same thing before I did.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c
  2024-05-21  8:47       ` Rémi Denis-Courmont
@ 2024-05-21 10:12         ` Martin Storsjö
  0 siblings, 0 replies; 11+ messages in thread
From: Martin Storsjö @ 2024-05-21 10:12 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Tue, 21 May 2024, Rémi Denis-Courmont wrote:

>
>
> Le 21 mai 2024 09:37:18 GMT+03:00, "Martin Storsjö" <martin@martin.st> a écrit :
>> On Tue, 21 May 2024, Rémi Denis-Courmont wrote:
>>
>>> Hi,
>>> 
>>> VVC benchmarks have increased checksam runtime by at least an order of 
>>> magnitude. It's become so prohibitively slow that I could not even get 
>>> to the end.
>>> 
>>> This is not an acceptable situation and impedes non-VVC assembler work
>>
>> I don't quite understand; whenever benchmarking anything in checkasm, I 
>> would always run e.g. "checkasm --test=ac3dsp 
>> --bench=ac3_sum_square_bufferfly_float", limiting the total running of 
>> tests to a specific module, and only benchmarking a subset of the run 
>> functions. (The --bench parameter specifies a prefix; only functions 
>> matching that prefix gets benchmarked.)
>
> Sure that's how you do it when you're working on a specific new 
> optimisation. Now we're trying to compare 128-bit and 256-bit vectors 
> for *all* existing functions to see which ones need to be reworked.
>
> That used to work (in 30 minutes on K230, 5 minutes on Zen 2, IIRC). Now 
> it's effectively broken and that's not acceptable'

Ah, I see. Ok, that's a reasonable thing to do I guess.

(It's of course possible to speed it up further by only testing specific 
--test=foo cases where you know you have riscv assembly worth 
benchmarking, but if it was doable in a tolerable amount of time before, 
that shouldn't be needed.)

>> Without limiting the scope with a --test parameter, checkasm 
>> benchmarking has always been prohibitively slow for me - so I don't 
>> think there's anything new here?
>
> As said, it seems to be literally an order of magnitude slower than 
> before if not worse.
>
>> That said I'm not familiar with the VVC tests in checkasm, perhaps they 
>> benchmark things excessively. But I don't see how that would impede 
>> work on other DSP functions in any way?
>
> James also complained about the same thing before I.

Indeed, the tests in vvc_alf group seem to do excessive benchmarking 
(benchmarking every width/height combination between 4 and 128, in 
increments of 4). I sent a patch to cut this down to a reasonable amount.

Overall, I would expect the vvc checkasm tests to take a notable amount of 
time. Dav1d's checkasm takes twice as long to run as ffmpeg's, and it's 
probably reasonable to assume that vvc is roughly of the same level of 
complexity as av1, so it's probably expected that ffmpeg's checkasm 
runtime at least doubles, once all vvc routines are integrated in 
checkasm.

But the tests in vvc_alf indeed had an entirely unreasonable amount of 
benchmarking hooked up, and that should indeed be fixed, e.g. with the 
patch I just sent.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c
  2024-05-20  0:42 ` [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
  2024-05-21  5:12   ` Rémi Denis-Courmont
@ 2024-05-21 14:35   ` Ronald S. Bultje
  1 sibling, 0 replies; 11+ messages in thread
From: Ronald S. Bultje @ 2024-05-21 14:35 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Stone Chen

Hi,

On Sun, May 19, 2024 at 8:55 PM Stone Chen <chen.stonechen@gmail.com> wrote:

> Adds checkasm for DMVR SAD AVX2 implementation.
>
> Benchmarks ( AMD 7940HS )
> vvc_sad_8x8_c: 70.0
> vvc_sad_8x8_avx2: 10.0
> vvc_sad_16x16_c: 280.0
> vvc_sad_16x16_avx2: 20.0
> vvc_sad_32x32_c: 1020.0
> vvc_sad_32x32_avx2: 70.0
> vvc_sad_64x64_c: 3560.0
> vvc_sad_64x64_avx2: 270.0
> vvc_sad_128x128_c: 13760.0
> vvc_sad_128x128_avx2: 1070.0
> ---
>  tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
>

It appears Remi's performance concerns have been addressed separately, so
this patch is good to go.

Ronald
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
  2024-05-20 15:52 ` Ronald S. Bultje
@ 2024-05-22  0:05   ` Stone Chen
  0 siblings, 0 replies; 11+ messages in thread
From: Stone Chen @ 2024-05-22  0:05 UTC (permalink / raw)
  To: Ronald S. Bultje; +Cc: FFmpeg development discussions and patches

On Mon, May 20, 2024 at 7:23 AM Ronald S. Bultje <rsbultje@gmail.com> wrote:

> Hi,
>
> This is mostly good, the following is tiny nitpicks.
>
> On Sun, May 19, 2024 at 8:46 PM Stone Chen <chen.stonechen@gmail.com>
> wrote:
>
>> +%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
>>
>
> The macro is only used once, so you could inline it in the calling
> function.
>
>>
>> +    imul            %5, 128
>> +    imul            %6, 128
>>
>
> I believe shl is typically preferred over imul for powers of two.
>
>
>> +    add             %5, 2
>> +    add             %6, 2
>>
>
> And these can be integrated as a constant offset in the lea below (lea %1,
> [%1 + %5 * 2 + 2 * 2], same for %2).
>
>
>> +    add             %5, %3
>> +    sub             %6, %3
>> +
>> +    lea             %1, [%1 + %5 * 2]
>> +    lea             %2, [%2 + %6 * 2]
>
> [..]
>
>> +cglobal vvc_sad, 6, 11, 5, src1, src2, dx, dy, block_w, block_h, off1,
>> off2, row_idx, dx2, dy2
>> +    movsxd           dx2q, dxd
>> +    movsxd           dy2q, dyd
>>
>
> If you change the argument type from int to intptr_t, this is not
> necessary anymore.
>
>
>> +    vvc_sad_16_128:
>> +        .loop_height:
>> +        mov         off1q, src1q
>> +        mov         off2q, src2q
>> +        mov      row_idxd, block_wd
>> +        sar      row_idxd, 4
>>
>
> You could right-shift block_wd by 4 outside the loop (before .loop_height).
>
> Ronald
>

On Mon, May 20, 2024 at 11:53 AM Ronald S. Bultje <rsbultje@gmail.com>
wrote:

> Hi,
>
> one more, I forgot.
>
> On Sun, May 19, 2024 at 8:46 PM Stone Chen <chen.stonechen@gmail.com>
> wrote:
>
>> +pw_1: dw 1
>>
> [..]
>
>> +    vpbroadcastw       m4, [pw_1]
>>
>
> We typically suggest to use vpbroadcastd, not w (and then pw_1: times 2 dw
> 1). agner shows that on e.g. Haswell, the former (d) is 1 uops with 5
> cycles latency, whereas the latter (w) is 3 uops with 7 cycles latency, or
> more generally d is faster then w.
>
> Ronald
>

Hi Ronald,

I've sent a v5 incorporating all the above, thank you for the feedback!

-Stone
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
@ 2024-05-20  0:37 Stone Chen
  0 siblings, 0 replies; 11+ messages in thread
From: Stone Chen @ 2024-05-20  0:37 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Stone Chen

Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bitdepths, but the values passed to the function are always 16bit (even if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.

Benchmarks ( AMD 7940HS )
Before:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
NovosobornayaSquare_1920x1080.bin | 197.3 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |

After:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
NovosobornayaSquare_1920x1080.bin | 204.0 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
---
 libavcodec/x86/vvc/Makefile      |   3 +-
 libavcodec/x86/vvc/vvc_sad.asm   | 138 +++++++++++++++++++++++++++++++
 libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/x86/vvc/vvc_sad.asm

diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER)             += x86/vvc/vvcdsp_init.o \
                                           x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)      += x86/vvc/vvc_alf.o      \
                                           x86/vvc/vvc_mc.o       \
-                                          x86/h26x/h2656_inter.o
+                                          x86/vvc/vvc_sad.o      \
+                                          x86/h26x/h2656_inter.o 
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 0000000000..58a24635d2
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,138 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+%define MAX_PB_SIZE 128
+%define ROWS 2    
+
+SECTION_RODATA
+
+pw_1: dw 1
+
+; DMVR SAD is only calculated on even rows to reduce complexity
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; 
+    pminuw           %3, %2, %1
+    pmaxuw           %1, %2, %1
+    psubusw          %1, %1, %3
+%endmacro
+
+%macro HORIZ_ADD 3  ; xm0, xm1, m1
+    vextracti128      %1, %3, q0001  ;        3        2      1          0
+    paddd            %1, %2         ; xm0 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
+    pshufd           %2, %1, q0032  ; xm1    -      -     (7 + 3)   (6 + 2)
+    paddd            %1, %1, %2     ; xm0    _      _     (5 1 7 3) (4 0 6 2)
+    pshufd           %2, %1, q0001  ; xm1    _      _     (5 1 7 3) (5 1 7 3)
+    paddd            %1, %1, %2     ;                               (01234567)
+%endmacro
+
+%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
+    sub             %3, 2
+    sub             %4, 2
+
+    mov             %5, 2
+    mov             %6, 2
+
+    add             %5, %4   
+    sub             %6, %4
+
+    imul            %5, 128
+    imul            %6, 128
+
+    add             %5, 2
+    add             %6, 2
+    
+    add             %5, %3
+    sub             %6, %3
+
+    lea             %1, [%1 + %5 * 2]
+    lea             %2, [%2 + %6 * 2]
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+
+cglobal vvc_sad, 6, 11, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx, dx2, dy2
+    movsxd           dx2q, dxd
+    movsxd           dy2q, dyd
+    INIT_OFFSET     src1q, src2q, dx2q, dy2q, off1q, off2q
+    pxor               m3, m3
+    vpbroadcastw       m4, [pw_1]
+
+    cmp          block_wd, 16
+    jge    vvc_sad_16_128
+
+    vvc_sad_8:
+        .loop_height:
+        movu              xm0, [src1q]
+        vinserti128        m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
+        movu              xm1, [src2q]
+        vinserti128        m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
+
+        MIN_MAX_SAD        m1, m0, m2
+        pmaddwd            m1, m4
+        paddd              m3, m1
+
+        add         src1q, 2 * MAX_PB_SIZE * ROWS * 2 
+        add         src2q, 2 * MAX_PB_SIZE * ROWS * 2
+
+        sub      block_hd, 4
+        jg   .loop_height
+
+        HORIZ_ADD     xm0, xm3, m3
+        movd          eax, xm0
+    RET
+
+    vvc_sad_16_128:
+        .loop_height:
+        mov         off1q, src1q
+        mov         off2q, src2q
+        mov      row_idxd, block_wd
+        sar      row_idxd, 4
+
+        .loop_width:
+            movu               m0, [src1q]
+            movu               m1, [src2q]
+            MIN_MAX_SAD        m1, m0, m2
+            pmaddwd            m1, m4
+            paddd              m3, m1
+
+            add             src1q, 32
+            add             src2q, 32
+            dec          row_idxd
+            jg        .loop_width
+
+        lea         src1q, [off1q + ROWS * MAX_PB_SIZE * 2] 
+        lea         src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+
+        sub      block_hd, 2
+        jg   .loop_height
+
+        HORIZ_ADD     xm0, xm3, m3
+        movd          eax, xm0
+    RET
+
+%endif
+%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 0e68971b2c..4b4a2aa937 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -311,6 +311,9 @@ ALF_FUNCS(16, 12, avx2)
     c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;    \
     c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
 } while (0)
+
+int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 #endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -327,6 +330,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
+            SAD_INIT();
         }
         break;
     case 10:
@@ -338,6 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
+            SAD_INIT();
         }
         break;
     case 12:
@@ -349,6 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
+            SAD_INIT();
         }
         break;
     default:
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2024-05-22  0:05 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-20  0:42 [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
2024-05-20  0:42 ` [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
2024-05-21  5:12   ` Rémi Denis-Courmont
2024-05-21  6:37     ` Martin Storsjö
2024-05-21  8:47       ` Rémi Denis-Courmont
2024-05-21 10:12         ` Martin Storsjö
2024-05-21 14:35   ` Ronald S. Bultje
2024-05-20 11:23 ` [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Ronald S. Bultje
2024-05-20 15:52 ` Ronald S. Bultje
2024-05-22  0:05   ` Stone Chen
  -- strict thread matches above, loose matches on Subject: below --
2024-05-20  0:37 Stone Chen

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git