Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 2/3] x86/vvcdec: add dmvr avx2 code
       [not found] <20240725133546.19125-1-nuomi2021@gmail.com>
@ 2024-07-25 13:35 ` Nuo Mi
  2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr Nuo Mi
  1 sibling, 0 replies; 6+ messages in thread
From: Nuo Mi @ 2024-07-25 13:35 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi

Decoder-Side Motion Vector Refinement (DMVR) accounts for about 4~8% of CPU usage on some clips.

Here are the results of a single test run:
clips                                     | before| after | delta
------------------------------------------|-------|-------|------
RitualDance_1920x1080_60_10_420_37_RA.266 | 338.7 | 354.3 |4.61%
NovosobornayaSquare_1920x1080.bin         | 320.3 | 329.3 |2.81%
Tango2_3840x2160_60_10_420_27_LD.266      | 83.3  | 83.7  |0.48%
RitualDance_1920x1080_60_10_420_32_LD.266 | 320.7 | 327.3 |2.06%
Chimera_8bit_1080P_1000_frames.vvc        | 360.7 | 381.0 |5.63%
BQTerrace_1920x1080_60_10_420_22_RA.vvc   | 161.7 | 163.0 |0.80%
---
 libavcodec/x86/vvc/Makefile      |   1 +
 libavcodec/x86/vvc/vvc_dmvr.asm  | 373 +++++++++++++++++++++++++++++++
 libavcodec/x86/vvc/vvcdsp_init.c |  25 +++
 3 files changed, 399 insertions(+)
 create mode 100644 libavcodec/x86/vvc/vvc_dmvr.asm

diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index 832d802daf..04f16bc10c 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -4,6 +4,7 @@ clean::
 OBJS-$(CONFIG_VVC_DECODER)             += x86/vvc/vvcdsp_init.o \
                                           x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)      += x86/vvc/vvc_alf.o      \
+                                          x86/vvc/vvc_dmvr.o     \
                                           x86/vvc/vvc_mc.o       \
                                           x86/vvc/vvc_sad.o      \
                                           x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvc_dmvr.asm b/libavcodec/x86/vvc/vvc_dmvr.asm
new file mode 100644
index 0000000000..4c971f970b
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_dmvr.asm
@@ -0,0 +1,373 @@
+; /*
+; * Provide AVX2 luma dmvr functions for VVC decoding
+; * Copyright (c) 2024 Nuo Mi
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE             128
+
+SECTION_RODATA 32
+
+shift_12   times 2  dw 1 << (15 - (12 - 10))              ; pmulhrsw by this gives (x + 2) >> 2
+shift3_8   times 2  dw 1 << (15 - (8 - 6))                ; pmulhrsw by this gives (x + 2) >> 2
+shift3_10  times 2  dw 1 << (15 - (10 - 6))               ; pmulhrsw by this gives (x + 8) >> 4
+shift3_12  times 2  dw 1 << (15 - (12 - 6))               ; (x + 32) >> 6; LOAD_SHIFT doubles it, BILINEAR halves x
+pw_16      times 2  dw 16                                 ; sum of the two bilinear weights
+
+%if ARCH_X86_64
+
+%if HAVE_AVX2_EXTERNAL
+
+SECTION .text
+; pstride: bytes per source pixel (1 when bd == 8, 2 when bd is 10 or 12)
+%define pstride (bd / 10 + 1)
+
+; LOAD_W16(dst, src): load one row of 16 pixels into dst as 16-bit words
+%macro LOAD_W16 2
+%if bd == 8
+    pmovzxbw               %1, %2                       ; 16 bytes, zero-extended to words
+%else
+    movu                   %1, %2                       ; 16 words, no alignment required
+%endif
+%endmacro
+; SHIFT_W16(reg, mul): rescale the samples in reg to 10-bit precision
+%macro SHIFT_W16 2
+%if bd == 8
+    psllw                  %1, (10 - bd)                ; 8-bit input: multiply by 4
+%elif bd == 10
+    ; nothing: 10-bit samples are stored unchanged
+%else
+    pmulhrsw               %1, %2                       ; 12-bit input: mul holds shift_12, i.e. (x + 2) >> 2
+%endif
+%endmacro
+; SAVE_W16(dst, src): store one row of 16 int16_t results, no alignment required
+%macro SAVE_W16 2
+    movu                   %1, %2
+%endmacro
+
+; NEXT_4_LINES(is_h): move dst and src down four rows; if is_h, re-derive src1 from src
+%macro NEXT_4_LINES 1
+    lea                 dstq, [dstq + dsq*4]
+    lea                 srcq, [srcq + ssq*4]
+%if %1
+    lea                src1q, [srcq + pstride]          ; src1 = src advanced by one pixel
+%endif
+%endmacro
+
+; Copy a 4-row tile, 16 pixels wide, from src to dst through SHIFT_W16.
+; DMVR_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3)
+%macro DMVR_4xW16 6
+    LOAD_W16               m0, [%4]
+    LOAD_W16               m1, [%4 + %5]
+    LOAD_W16               m2, [%4 + 2 * %5]
+    LOAD_W16               m3, [%4 + %6]                ; row 3 uses the precomputed 3 * stride
+
+    SHIFT_W16              m0, m4                       ; m4 is only read when bd == 12
+    SHIFT_W16              m1, m4
+    SHIFT_W16              m2, m4
+    SHIFT_W16              m3, m4
+
+    SAVE_W16    [%1]         , m0
+    SAVE_W16    [%1 + %2]    , m1
+    SAVE_W16    [%1 + 2 * %2], m2
+    SAVE_W16    [%1 + %3]    , m3
+%endmacro
+
+; buf += -stride * h + off  (go back h rows, then off bytes to the right); clobbers i
+; OFFSET_TO_W4(buf, stride, off)
+%macro OFFSET_TO_W4 3
+    mov                    id, hd                       ; i = h, the 32-bit move clears the upper half of iq
+    imul                   iq, %2                       ; i = h * stride
+    sub                    %1, iq
+    lea                    %1, [%1 + %3]
+%endmacro
+; OFFSET_TO_W4(): rewind src/dst by h rows and skip the 16 columns already done
+%macro OFFSET_TO_W4 0
+    OFFSET_TO_W4         srcq, ssq, 16 * (bd / 10 + 1)  ; 16 source pixels, in bytes
+    OFFSET_TO_W4         dstq, dsq, 16 * 2              ; 16 int16_t outputs, in bytes
+%endmacro
+
+; void ff_vvc_dmvr_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+;     int height, intptr_t mx, intptr_t my, int width);
+%macro DMVR_AVX2 1
+cglobal vvc_dmvr_%1, 4, 9, 5, dst, src, ss, h, ds, ds3, w, ss3, i
+%define bd %1
+
+    LOAD_STRIDES                                        ; mx/my are not read by this variant
+
+%if %1 > 10
+    vpbroadcastd          m4, [shift_%1]                ; 12-bit only: multiplier used by SHIFT_W16
+%endif
+
+    mov                   wd, wm                        ; fetch width (7th argument)
+    mov                   id, hd                        ; i = rows left in the first 16 columns
+.w16:
+    sub                   id, 4
+    jl              .w16_end                            ; fewer than 4 rows left
+    DMVR_4xW16          dstq, dsq, ds3q, srcq, ssq, ss3q
+    NEXT_4_LINES           0
+    jmp                 .w16
+.w16_end:
+
+    sub                   wd, 16
+    jl               .w4_end                            ; width < 16: done. NOTE(review): width == 16 falls through
+
+    OFFSET_TO_W4                                        ; assumes h is a multiple of 4 - TODO confirm
+.w4:
+    sub                   hd, 4                         ; h itself is the counter for the second column
+    jl               .w4_end
+    DMVR_4xW16          dstq, dsq, ds3q, srcq, ssq, ss3q ; handles a full 16 columns from x = 16 on
+    NEXT_4_LINES           0
+    jmp                 .w4
+.w4_end:
+
+    RET
+%endmacro
+
+; LOAD_COEFFS(coeffs0, coeffs1, src): coeffs1 = low word of src in every lane, coeffs0 = 16 - coeffs1
+%macro LOAD_COEFFS 3
+    movd                xm%2, %3                        ; low 32 bits of src
+    vpbroadcastw         m%2, xm%2                      ; replicate its low 16 bits to all words
+    vpbroadcastd         m%1, [pw_16]
+    psubw                m%1, m%2                       ; complementary weight
+%endmacro
+
+; LOAD_SHIFT(shift, src): broadcast the pmulhrsw rounding multiplier stored at src
+%macro LOAD_SHIFT 2
+    vpbroadcastd           %1, [%2]
+%if bd == 12
+    psllw                  %1, 1                        ; doubled: BILINEAR halves the sum first so pmulhrsw sees no negative word
+%endif
+%endmacro
+
+; LOAD_STRIDES(): set ds (dst row stride in bytes) and the tripled strides ss3/ds3
+%macro LOAD_STRIDES 0
+    mov                  dsq, MAX_PB_SIZE * 2           ; dst rows are MAX_PB_SIZE 16-bit words apart
+    lea                 ss3q, [ssq*3]
+    lea                 ds3q, [dsq*3]                   ; NOTE(review): ds/ds3 are the mx/my argument slots, run LOAD_COEFFS first
+%endmacro
+; Two-tap filter: dst = rounded shift of (src0 * coeff0 + src1 * coeff1); tmp may alias src1.
+; BILINEAR(dst/src0, src1, coeff0, coeff1, round, tmp)
+%macro BILINEAR 6
+    pmullw                 %1, %3
+    pmullw                 %6, %2, %4
+    paddw                  %1, %6
+%if bd == 12
+    psrlw                  %1, 1                        ; halve as unsigned so the signed pmulhrsw input stays non-negative
+%endif
+    pmulhrsw               %1, %5                       ; rounding right shift, see LOAD_SHIFT
+%endmacro
+
+; DMVR_H_1xW16(dst, src0, src1, offset, tmp): horizontal filter of one 16-pixel row at byte offset
+%macro DMVR_H_1xW16 5
+    LOAD_W16               %1, [%2 + %4]                ; left samples
+    LOAD_W16               %5, [%3 + %4]                ; samples one pixel to the right (src1)
+    BILINEAR               %1, %5, m10, m11, m12, %5    ; m10/m11 = weights, m12 = multiplier, set by the caller
+%endmacro
+
+; DMVR_H_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3, src1): filter and store 4 rows
+%macro DMVR_H_4xW16 7
+    DMVR_H_1xW16           m0, %4, %7,      0, m4       ; row 0
+    DMVR_H_1xW16           m1, %4, %7,     %5, m5       ; row 1
+    DMVR_H_1xW16           m2, %4, %7, 2 * %5, m6       ; row 2
+    DMVR_H_1xW16           m3, %4, %7,     %6, m7       ; row 3, via the precomputed 3 * stride
+
+    SAVE_W16    [%1]         , m0
+    SAVE_W16    [%1 + %2]    , m1
+    SAVE_W16    [%1 + 2 * %2], m2
+    SAVE_W16    [%1 + %3]    , m3
+%endmacro
+
+; void ff_vvc_dmvr_h_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+;     int height, intptr_t mx, intptr_t my, int width);
+%macro DMVR_H_AVX2 1
+cglobal vvc_dmvr_h_%1, 4, 10, 13, dst, src, ss, h, ds, ds3, w, ss3, src1, i
+%define bd %1
+
+    LOAD_COEFFS           10, 11, dsm                   ; m11 = mx, m10 = 16 - mx (ds is the mx slot)
+    LOAD_SHIFT           m12, shift3_%1                 ; multiplier for the rounded shift by bd - 6
+
+    LOAD_STRIDES                                        ; overwrites ds/ds3, mx was read above
+    lea                src1q, [srcq + pstride]          ; src1 = src advanced by one pixel
+
+    mov                   wd, wm                        ; fetch width (7th argument)
+    mov                   id, hd                        ; i = rows left in the first 16 columns
+.w16:
+    sub                   id, 4
+    jl              .w16_end                            ; fewer than 4 rows left
+    DMVR_H_4xW16        dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
+    NEXT_4_LINES           1
+    jmp                 .w16
+.w16_end:
+
+    sub                   wd, 16
+    jl               .w4_end                            ; width < 16: done. NOTE(review): width == 16 falls through
+
+    OFFSET_TO_W4                                        ; assumes h is a multiple of 4 - TODO confirm
+    lea                src1q, [srcq + pstride]          ; src moved, so re-derive src1
+.w4:
+    sub                   hd, 4                         ; h itself is the counter for the second column
+    jl               .w4_end
+    DMVR_H_4xW16        dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
+    NEXT_4_LINES           1
+    jmp                 .w4
+.w4_end:
+
+    RET
+%endmacro
+; Vertical filter of 4 rows. In: m0 = row 0, m8/m9 = weights, m10 = multiplier. Out: m0 = row 4.
+; DMVR_V_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3)
+%macro DMVR_V_4xW16 6
+    LOAD_W16               m1, [%4 + %5]
+    LOAD_W16               m2, [%4 + 2 * %5]
+    LOAD_W16               m3, [%4 + %6]
+    LOAD_W16               m4, [%4 + 4 * %5]            ; row 4 is also row 0 of the next tile
+
+    BILINEAR               m0, m1, m8, m9, m10, m11     ; out row n = filter(row n, row n + 1)
+    BILINEAR               m1, m2, m8, m9, m10, m12
+    BILINEAR               m2, m3, m8, m9, m10, m13
+    BILINEAR               m3, m4, m8, m9, m10, m14
+
+    SAVE_W16    [%1]         , m0
+    SAVE_W16    [%1 + %2]    , m1
+    SAVE_W16    [%1 + 2 * %2], m2
+    SAVE_W16    [%1 + %3]    , m3
+    ; Carry row 4 into the next iteration. SWAP m0, m4 cannot be used: it only
+    ; renames registers at assembly time, and the loop jumps back to code using the old names.
+    mova                   m0, m4                       ; integer-domain move for integer data
+%endmacro
+
+; void ff_vvc_dmvr_v_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+;     int height, intptr_t mx, intptr_t my, int width);
+%macro DMVR_V_AVX2 1
+cglobal vvc_dmvr_v_%1, 4, 9, 15, dst, src, ss, h, ds, ds3, w, ss3, i
+%define bd %1
+
+    LOAD_COEFFS            8, 9, ds3m                   ; m9 = my, m8 = 16 - my (ds3 is the my slot)
+    LOAD_SHIFT           m10, shift3_%1                 ; multiplier for the rounded shift by bd - 6
+
+    LOAD_STRIDES                                        ; overwrites ds/ds3, my was read above
+
+    mov                   wd, wm                        ; fetch width (7th argument)
+    mov                   id, hd                        ; i = rows left in the first 16 columns
+    LOAD_W16              m0, [srcq]                    ; prime m0 with row 0 for the first tile
+.w16:
+    sub                   id, 4
+    jl              .w16_end                            ; fewer than 4 rows left
+    DMVR_V_4xW16        dstq, dsq, ds3q, srcq, ssq, ss3q
+    NEXT_4_LINES           0
+    jmp                 .w16
+.w16_end:
+
+    sub                   wd, 16
+    jl               .w4_end                            ; width < 16: done. NOTE(review): width == 16 falls through
+
+    OFFSET_TO_W4                                        ; assumes h is a multiple of 4 - TODO confirm
+    LOAD_W16              m0, [srcq]                    ; prime m0 again for the second column
+.w4:
+    sub                   hd, 4                         ; h itself is the counter for the second column
+    jl               .w4_end
+    DMVR_V_4xW16        dstq, dsq, ds3q, srcq, ssq, ss3q
+    NEXT_4_LINES           0
+    jmp                 .w4
+.w4_end:
+
+    RET
+%endmacro
+
+; DMVR_HV_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3, src1)
+%macro DMVR_HV_4xW16 7
+    DMVR_H_1xW16           m1, %4, %7,     %5, m6       ; horizontally filtered rows 1..4
+    DMVR_H_1xW16           m2, %4, %7, 2 * %5, m7
+    DMVR_H_1xW16           m3, %4, %7,     %6, m8
+    DMVR_H_1xW16           m4, %4, %7, 4 * %5, m9       ; row 4 is also row 0 of the next tile
+
+    BILINEAR               m0, m1, m13, m14, m15, m6    ; vertical pass, m0 = filtered row 0 from the caller
+    BILINEAR               m1, m2, m13, m14, m15, m7
+    BILINEAR               m2, m3, m13, m14, m15, m8
+    BILINEAR               m3, m4, m13, m14, m15, m9
+
+    SAVE_W16    [%1]         , m0
+    SAVE_W16    [%1 + %2]    , m1
+    SAVE_W16    [%1 + 2 * %2], m2
+    SAVE_W16    [%1 + %3]    , m3
+    ; Carry row 4 into the next iteration. SWAP m0, m4 cannot be used: it only
+    ; renames registers at assembly time, and the loop jumps back to code using the old names.
+    mova                   m0, m4                       ; integer-domain move for integer data
+%endmacro
+
+; void ff_vvc_dmvr_hv_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+;     int height, intptr_t mx, intptr_t my, int width);
+%macro DMVR_HV_AVX2 1
+cglobal vvc_dmvr_hv_%1, 7, 10, 16, dst, src, ss, h, ds, ds3, w, ss3, src1, i
+%define bd %1
+
+    LOAD_COEFFS           10, 11, dsm                   ; horizontal weights: m11 = mx, m10 = 16 - mx
+    LOAD_SHIFT           m12, shift3_%1                 ; first stage: rounded shift by bd - 6
+
+    LOAD_COEFFS           13, 14, ds3m                  ; vertical weights: m14 = my, m13 = 16 - my
+    LOAD_SHIFT           m15, shift3_10                 ; second stage: rounded shift by 4 for every bd
+
+    LOAD_STRIDES                                        ; overwrites ds/ds3, mx/my were read above
+    lea                src1q, [srcq + pstride]          ; src1 = src advanced by one pixel
+
+    mov                   id, hd                        ; i = rows left in the first 16 columns
+    DMVR_H_1xW16          m0, srcq, src1q, 0, m5        ; prime m0 with the filtered row 0
+.w16:
+    sub                   id, 4
+    jl              .w16_end                            ; fewer than 4 rows left
+    DMVR_HV_4xW16       dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
+    NEXT_4_LINES           1
+    jmp                 .w16
+.w16_end:
+
+    sub                   wd, 16                        ; no "mov wd, wm" here: cglobal requests 7 args, presumably loading w
+    jl               .w4_end                            ; width < 16: done. NOTE(review): width == 16 falls through
+
+    OFFSET_TO_W4                                        ; assumes h is a multiple of 4 - TODO confirm
+    lea                src1q, [srcq + pstride]          ; src moved, so re-derive src1
+
+    DMVR_H_1xW16          m0, srcq, src1q, 0, m5        ; prime m0 again for the second column
+.w4:
+    sub                   hd, 4                         ; h itself is the counter for the second column
+    jl               .w4_end
+    DMVR_HV_4xW16       dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
+    NEXT_4_LINES           1
+    jmp                 .w4
+.w4_end:
+
+    RET
+%endmacro
+; VVC_DMVR_AVX2(bd): emit the copy, h, v and hv kernels for one bit depth
+%macro VVC_DMVR_AVX2 1
+    DMVR_AVX2    %1
+    DMVR_H_AVX2  %1
+    DMVR_V_AVX2  %1
+    DMVR_HV_AVX2 %1
+%endmacro
+; Instantiate every kernel for 8-, 10- and 12-bit input, using ymm registers.
+INIT_YMM avx2
+
+VVC_DMVR_AVX2 8
+VVC_DMVR_AVX2 10
+VVC_DMVR_AVX2 12
+
+%endif ; HAVE_AVX2_EXTERNAL
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 4b4a2aa937..d5b4f4f8a5 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -87,6 +87,21 @@ AVG_PROTOTYPES( 8, avx2)
 AVG_PROTOTYPES(10, avx2)
 AVG_PROTOTYPES(12, avx2)
 
+/* Declarations of the four DMVR kernels (copy, h, v, hv) for one bit depth and instruction set. */
+#define DMVR_PROTOTYPES(bd, opt)                                                                    \
+void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,               \
+     int height, intptr_t mx, intptr_t my, int width);                                              \
+void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
+     int height, intptr_t mx, intptr_t my, int width);                                              \
+void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
+     int height, intptr_t mx, intptr_t my, int width);                                              \
+void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,            \
+     int height, intptr_t mx, intptr_t my, int width);                                              \
+
+DMVR_PROTOTYPES( 8, avx2)
+DMVR_PROTOTYPES(10, avx2)
+DMVR_PROTOTYPES(12, avx2)
+
 #define ALF_BPC_PROTOTYPES(bpc, opt)                                                                                     \
 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                            \
     const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height,                                         \
@@ -306,6 +321,13 @@ ALF_FUNCS(16, 12, avx2)
     c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);                     \
 } while (0)
 
+#define DMVR_INIT(bd) do { /* dmvr[vertical][horizontal] */          \
+    c->inter.dmvr[0][0]   = ff_vvc_dmvr_##bd##_avx2;                 \
+    c->inter.dmvr[0][1]   = ff_vvc_dmvr_h_##bd##_avx2;               \
+    c->inter.dmvr[1][0]   = ff_vvc_dmvr_v_##bd##_avx2;               \
+    c->inter.dmvr[1][1]   = ff_vvc_dmvr_hv_##bd##_avx2;              \
+} while (0)
+
 #define ALF_INIT(bd) do {                                            \
     c->alf.filter[LUMA]   = ff_vvc_alf_filter_luma_##bd##_avx2;      \
     c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;    \
@@ -330,6 +352,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
+            DMVR_INIT(8);
             SAD_INIT();
         }
         break;
@@ -342,6 +365,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
+            DMVR_INIT(10);
             SAD_INIT();
         }
         break;
@@ -354,6 +378,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
+            DMVR_INIT(12);
             SAD_INIT();
         }
         break;
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
       [not found] <20240725133546.19125-1-nuomi2021@gmail.com>
  2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 2/3] x86/vvcdec: add dmvr avx2 code Nuo Mi
@ 2024-07-25 13:35 ` Nuo Mi
  2024-07-26  1:20   ` James Almer
  1 sibling, 1 reply; 6+ messages in thread
From: Nuo Mi @ 2024-07-25 13:35 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Nuo Mi

dmvr_8_12x20_c: 186.2
dmvr_8_12x20_avx2: 25.7
dmvr_8_20x12_c: 181.7
dmvr_8_20x12_avx2: 25.2
dmvr_8_20x20_c: 283.2
dmvr_8_20x20_avx2: 32.0
dmvr_10_12x20_c: 90.0
dmvr_10_12x20_avx2: 15.7
dmvr_10_20x12_c: 41.0
dmvr_10_20x12_avx2: 14.7
dmvr_10_20x20_c: 81.5
dmvr_10_20x20_avx2: 26.7
dmvr_12_12x20_c: 190.7
dmvr_12_12x20_avx2: 20.2
dmvr_12_20x12_c: 187.2
dmvr_12_20x12_avx2: 20.2
dmvr_12_20x20_c: 292.7
dmvr_12_20x20_avx2: 27.2
dmvr_h_8_12x20_c: 317.0
dmvr_h_8_12x20_avx2: 37.0
dmvr_h_8_20x12_c: 340.0
dmvr_h_8_20x12_avx2: 41.0
dmvr_h_8_20x20_c: 540.7
dmvr_h_8_20x20_avx2: 64.0
dmvr_h_10_12x20_c: 322.7
dmvr_h_10_12x20_avx2: 30.7
dmvr_h_10_20x12_c: 344.2
dmvr_h_10_20x12_avx2: 34.0
dmvr_h_10_20x20_c: 529.0
dmvr_h_10_20x20_avx2: 51.5
dmvr_h_12_12x20_c: 326.7
dmvr_h_12_12x20_avx2: 33.5
dmvr_h_12_20x12_c: 331.7
dmvr_h_12_20x12_avx2: 51.2
dmvr_h_12_20x20_c: 534.0
dmvr_h_12_20x20_avx2: 62.7
dmvr_hv_8_12x20_c: 650.0
dmvr_hv_8_12x20_avx2: 57.2
dmvr_hv_8_20x12_c: 676.2
dmvr_hv_8_20x12_avx2: 70.0
dmvr_hv_8_20x20_c: 1068.5
dmvr_hv_8_20x20_avx2: 103.2
dmvr_hv_10_12x20_c: 649.0
dmvr_hv_10_12x20_avx2: 48.2
dmvr_hv_10_20x12_c: 677.7
dmvr_hv_10_20x12_avx2: 59.7
dmvr_hv_10_20x20_c: 1093.5
dmvr_hv_10_20x20_avx2: 91.7
dmvr_hv_12_12x20_c: 660.0
dmvr_hv_12_12x20_avx2: 58.7
dmvr_hv_12_20x12_c: 682.7
dmvr_hv_12_20x12_avx2: 72.0
dmvr_hv_12_20x20_c: 1094.0
dmvr_hv_12_20x20_avx2: 113.2
dmvr_v_8_12x20_c: 325.7
dmvr_v_8_12x20_avx2: 31.2
dmvr_v_8_20x12_c: 326.2
dmvr_v_8_20x12_avx2: 38.5
dmvr_v_8_20x20_c: 538.5
dmvr_v_8_20x20_avx2: 54.2
dmvr_v_10_12x20_c: 318.5
dmvr_v_10_12x20_avx2: 23.7
dmvr_v_10_20x12_c: 330.7
dmvr_v_10_20x12_avx2: 40.5
dmvr_v_10_20x20_c: 567.5
dmvr_v_10_20x20_avx2: 48.0
dmvr_v_12_12x20_c: 335.2
dmvr_v_12_12x20_avx2: 30.0
dmvr_v_12_20x12_c: 330.2
dmvr_v_12_20x12_avx2: 39.5
dmvr_v_12_20x20_c: 535.2
dmvr_v_12_20x20_avx2: 60.0
---
 tests/checkasm/vvc_mc.c | 59 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index bc6b580f42..62fa6aa7d0 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -324,6 +324,64 @@ static void check_avg(void)
     report("avg");
 }
 
+#define SR_RANGE 2 // rows/columns added on each side of the w x h block
+static void check_dmvr(void)
+{
+    LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t,  src0, [SRC_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t,  src1, [SRC_BUF_SIZE]);
+    const int dst_stride = MAX_PB_SIZE; // in elements, not bytes: it is added to uint16_t pointers below
+
+    VVCDSPContext c;
+    declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
+        intptr_t mx, intptr_t my, int width);
+    // Check every dmvr[j][i] kernel against the reference for each bit depth and valid block size.
+    for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        ff_vvc_dsp_init(&c, bit_depth);
+        randomize_pixels(src0, src1, SRC_BUF_SIZE);
+        for (int i = 0; i < 2; i++) {
+            for (int j = 0; j < 2; j++) {
+                for (int h = 8; h <= 16; h *= 2) {
+                    for (int w = 8; w <= 16; w *= 2) {
+                        const int pred_w = w + 2 * SR_RANGE;
+                        const int pred_h = h + 2 * SR_RANGE;
+                        const int mx     = rnd() % VVC_INTER_LUMA_DMVR_FACTS;
+                        const int my     = rnd() % VVC_INTER_LUMA_DMVR_FACTS;
+                        const char *type;
+
+                        if (w * h < 128) // DMVR needs w * h >= 128, so only 8x16, 16x8 and 16x16 are valid
+                            continue;
+
+                        switch ((j << 1) | i) {
+                            case 0: type = "dmvr";    break; // 0 0
+                            case 1: type = "dmvr_h";  break; // 0 1
+                            case 2: type = "dmvr_v";  break; // 1 0
+                            case 3: type = "dmvr_hv"; break; // 1 1
+                        }
+
+                        if (check_func(c.inter.dmvr[j][i], "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
+                            memset(dst0, 0, DST_BUF_SIZE * sizeof(*dst0)); // clear the whole uint16_t buffer
+                            memset(dst1, 0, DST_BUF_SIZE * sizeof(*dst1));
+                            call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
+                            call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
+                            for (int k = 0; k < pred_h; k++) { // compare each output row, pred_w samples wide
+                                if (memcmp(dst0 + k * dst_stride, dst1 + k * dst_stride, pred_w * sizeof(int16_t))) {
+                                    fail();
+                                    break;
+                                }
+                            }
+
+                            bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    report("dmvr");
+}
+
 static void check_vvc_sad(void)
 {
     const int bit_depth = 10;
@@ -363,6 +421,7 @@ static void check_vvc_sad(void)
 
 void checkasm_check_vvc_mc(void)
 {
+    check_dmvr();
     check_vvc_sad();
     check_put_vvc_luma();
     check_put_vvc_luma_uni();
-- 
2.34.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
  2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr Nuo Mi
@ 2024-07-26  1:20   ` James Almer
  2024-07-26  7:42     ` Nuo Mi
  0 siblings, 1 reply; 6+ messages in thread
From: James Almer @ 2024-07-26  1:20 UTC (permalink / raw)
  To: ffmpeg-devel

On 7/25/2024 10:35 AM, Nuo Mi wrote:
> dmvr_8_12x20_c: 186.2
> dmvr_8_12x20_avx2: 25.7
> dmvr_8_20x12_c: 181.7
> dmvr_8_20x12_avx2: 25.2
> dmvr_8_20x20_c: 283.2
> dmvr_8_20x20_avx2: 32.0
> dmvr_10_12x20_c: 90.0
> dmvr_10_12x20_avx2: 15.7
> dmvr_10_20x12_c: 41.0
> dmvr_10_20x12_avx2: 14.7
> dmvr_10_20x20_c: 81.5
> dmvr_10_20x20_avx2: 26.7
> dmvr_12_12x20_c: 190.7
> dmvr_12_12x20_avx2: 20.2
> dmvr_12_20x12_c: 187.2
> dmvr_12_20x12_avx2: 20.2
> dmvr_12_20x20_c: 292.7
> dmvr_12_20x20_avx2: 27.2
> dmvr_h_8_12x20_c: 317.0
> dmvr_h_8_12x20_avx2: 37.0
> dmvr_h_8_20x12_c: 340.0
> dmvr_h_8_20x12_avx2: 41.0
> dmvr_h_8_20x20_c: 540.7
> dmvr_h_8_20x20_avx2: 64.0
> dmvr_h_10_12x20_c: 322.7
> dmvr_h_10_12x20_avx2: 30.7
> dmvr_h_10_20x12_c: 344.2
> dmvr_h_10_20x12_avx2: 34.0
> dmvr_h_10_20x20_c: 529.0
> dmvr_h_10_20x20_avx2: 51.5
> dmvr_h_12_12x20_c: 326.7
> dmvr_h_12_12x20_avx2: 33.5
> dmvr_h_12_20x12_c: 331.7
> dmvr_h_12_20x12_avx2: 51.2
> dmvr_h_12_20x20_c: 534.0
> dmvr_h_12_20x20_avx2: 62.7
> dmvr_hv_8_12x20_c: 650.0
> dmvr_hv_8_12x20_avx2: 57.2
> dmvr_hv_8_20x12_c: 676.2
> dmvr_hv_8_20x12_avx2: 70.0
> dmvr_hv_8_20x20_c: 1068.5
> dmvr_hv_8_20x20_avx2: 103.2
> dmvr_hv_10_12x20_c: 649.0
> dmvr_hv_10_12x20_avx2: 48.2
> dmvr_hv_10_20x12_c: 677.7
> dmvr_hv_10_20x12_avx2: 59.7
> dmvr_hv_10_20x20_c: 1093.5
> dmvr_hv_10_20x20_avx2: 91.7
> dmvr_hv_12_12x20_c: 660.0
> dmvr_hv_12_12x20_avx2: 58.7
> dmvr_hv_12_20x12_c: 682.7
> dmvr_hv_12_20x12_avx2: 72.0
> dmvr_hv_12_20x20_c: 1094.0
> dmvr_hv_12_20x20_avx2: 113.2
> dmvr_v_8_12x20_c: 325.7
> dmvr_v_8_12x20_avx2: 31.2
> dmvr_v_8_20x12_c: 326.2
> dmvr_v_8_20x12_avx2: 38.5
> dmvr_v_8_20x20_c: 538.5
> dmvr_v_8_20x20_avx2: 54.2
> dmvr_v_10_12x20_c: 318.5
> dmvr_v_10_12x20_avx2: 23.7
> dmvr_v_10_20x12_c: 330.7
> dmvr_v_10_20x12_avx2: 40.5
> dmvr_v_10_20x20_c: 567.5
> dmvr_v_10_20x20_avx2: 48.0
> dmvr_v_12_12x20_c: 335.2
> dmvr_v_12_12x20_avx2: 30.0
> dmvr_v_12_20x12_c: 330.2
> dmvr_v_12_20x12_avx2: 39.5
> dmvr_v_12_20x20_c: 535.2
> dmvr_v_12_20x20_avx2: 60.0
> ---
>   tests/checkasm/vvc_mc.c | 59 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 59 insertions(+)
> 
> diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
> index bc6b580f42..62fa6aa7d0 100644
> --- a/tests/checkasm/vvc_mc.c
> +++ b/tests/checkasm/vvc_mc.c
> @@ -324,6 +324,64 @@ static void check_avg(void)
>       report("avg");
>   }
>   
> +#define SR_RANGE 2
> +static void check_dmvr(void)
> +{
> +    LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
> +    LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
> +    LOCAL_ALIGNED_32(uint8_t,  src0, [SRC_BUF_SIZE]);
> +    LOCAL_ALIGNED_32(uint8_t,  src1, [SRC_BUF_SIZE]);
> +    const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
> +
> +    VVCDSPContext c;
> +    declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
> +        intptr_t mx, intptr_t my, int width);
> +
> +    for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
> +        ff_vvc_dsp_init(&c, bit_depth);
> +        randomize_pixels(src0, src1, SRC_BUF_SIZE);
> +        for (int i = 0; i < 2; i++) {
> +            for (int j = 0; j < 2; j++) {
> +                for (int h = 8; h <= 16; h *= 2) {
> +                    for (int w = 8; w <= 16; w *= 2) {
> +                        const int pred_w = w + 2 * SR_RANGE;
> +                        const int pred_h = h + 2 * SR_RANGE;
> +                        const int mx     = rnd() % VVC_INTER_LUMA_DMVR_FACTS;
> +                        const int my     = rnd() % VVC_INTER_LUMA_DMVR_FACTS;
> +                        const char *type;
> +
> +                        if (w * h < 128)
> +                            continue;

So h == 8 && w == 8 is not tested?

> +
> +                        switch ((j << 1) | i) {
> +                            case 0: type = "dmvr";    break; // 0 0
> +                            case 1: type = "dmvr_h";  break; // 0 1
> +                            case 2: type = "dmvr_v";  break; // 1 0
> +                            case 3: type = "dmvr_hv"; break; // 1 1
> +                        }
> +
> +                        if (check_func(c.inter.dmvr[j][i], "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
> +                            memset(dst0, 0, DST_BUF_SIZE);
> +                            memset(dst1, 0, DST_BUF_SIZE);
> +                            call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> +                            call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> +                            for (int k = 0; k < pred_h; k++) {
> +                                if (memcmp(dst0 + k * dst_stride, dst1 + k * dst_stride, pred_w * sizeof(int16_t))) {
> +                                    fail();
> +                                    break;
> +                                }
> +                            }
> +
> +                            bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> +                        }
> +                    }
> +                }
> +            }
> +        }
> +    }
> +    report("dmvr");
> +}
> +
>   static void check_vvc_sad(void)
>   {
>       const int bit_depth = 10;
> @@ -363,6 +421,7 @@ static void check_vvc_sad(void)
>   
>   void checkasm_check_vvc_mc(void)
>   {
> +    check_dmvr();
>       check_vvc_sad();
>       check_put_vvc_luma();
>       check_put_vvc_luma_uni();
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
  2024-07-26  1:20   ` James Almer
@ 2024-07-26  7:42     ` Nuo Mi
  2024-08-11 14:00       ` Nuo Mi
  0 siblings, 1 reply; 6+ messages in thread
From: Nuo Mi @ 2024-07-26  7:42 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Fri, Jul 26, 2024 at 9:36 AM James Almer <jamrial@gmail.com> wrote:

> On 7/25/2024 10:35 AM, Nuo Mi wrote:
> > dmvr_8_12x20_c: 186.2
> > dmvr_8_12x20_avx2: 25.7
> > dmvr_8_20x12_c: 181.7
> > dmvr_8_20x12_avx2: 25.2
> > dmvr_8_20x20_c: 283.2
> > dmvr_8_20x20_avx2: 32.0
> > dmvr_10_12x20_c: 90.0
> > dmvr_10_12x20_avx2: 15.7
> > dmvr_10_20x12_c: 41.0
> > dmvr_10_20x12_avx2: 14.7
> > dmvr_10_20x20_c: 81.5
> > dmvr_10_20x20_avx2: 26.7
> > dmvr_12_12x20_c: 190.7
> > dmvr_12_12x20_avx2: 20.2
> > dmvr_12_20x12_c: 187.2
> > dmvr_12_20x12_avx2: 20.2
> > dmvr_12_20x20_c: 292.7
> > dmvr_12_20x20_avx2: 27.2
> > dmvr_h_8_12x20_c: 317.0
> > dmvr_h_8_12x20_avx2: 37.0
> > dmvr_h_8_20x12_c: 340.0
> > dmvr_h_8_20x12_avx2: 41.0
> > dmvr_h_8_20x20_c: 540.7
> > dmvr_h_8_20x20_avx2: 64.0
> > dmvr_h_10_12x20_c: 322.7
> > dmvr_h_10_12x20_avx2: 30.7
> > dmvr_h_10_20x12_c: 344.2
> > dmvr_h_10_20x12_avx2: 34.0
> > dmvr_h_10_20x20_c: 529.0
> > dmvr_h_10_20x20_avx2: 51.5
> > dmvr_h_12_12x20_c: 326.7
> > dmvr_h_12_12x20_avx2: 33.5
> > dmvr_h_12_20x12_c: 331.7
> > dmvr_h_12_20x12_avx2: 51.2
> > dmvr_h_12_20x20_c: 534.0
> > dmvr_h_12_20x20_avx2: 62.7
> > dmvr_hv_8_12x20_c: 650.0
> > dmvr_hv_8_12x20_avx2: 57.2
> > dmvr_hv_8_20x12_c: 676.2
> > dmvr_hv_8_20x12_avx2: 70.0
> > dmvr_hv_8_20x20_c: 1068.5
> > dmvr_hv_8_20x20_avx2: 103.2
> > dmvr_hv_10_12x20_c: 649.0
> > dmvr_hv_10_12x20_avx2: 48.2
> > dmvr_hv_10_20x12_c: 677.7
> > dmvr_hv_10_20x12_avx2: 59.7
> > dmvr_hv_10_20x20_c: 1093.5
> > dmvr_hv_10_20x20_avx2: 91.7
> > dmvr_hv_12_12x20_c: 660.0
> > dmvr_hv_12_12x20_avx2: 58.7
> > dmvr_hv_12_20x12_c: 682.7
> > dmvr_hv_12_20x12_avx2: 72.0
> > dmvr_hv_12_20x20_c: 1094.0
> > dmvr_hv_12_20x20_avx2: 113.2
> > dmvr_v_8_12x20_c: 325.7
> > dmvr_v_8_12x20_avx2: 31.2
> > dmvr_v_8_20x12_c: 326.2
> > dmvr_v_8_20x12_avx2: 38.5
> > dmvr_v_8_20x20_c: 538.5
> > dmvr_v_8_20x20_avx2: 54.2
> > dmvr_v_10_12x20_c: 318.5
> > dmvr_v_10_12x20_avx2: 23.7
> > dmvr_v_10_20x12_c: 330.7
> > dmvr_v_10_20x12_avx2: 40.5
> > dmvr_v_10_20x20_c: 567.5
> > dmvr_v_10_20x20_avx2: 48.0
> > dmvr_v_12_12x20_c: 335.2
> > dmvr_v_12_12x20_avx2: 30.0
> > dmvr_v_12_20x12_c: 330.2
> > dmvr_v_12_20x12_avx2: 39.5
> > dmvr_v_12_20x20_c: 535.2
> > dmvr_v_12_20x20_avx2: 60.0
> > ---
> >   tests/checkasm/vvc_mc.c | 59 +++++++++++++++++++++++++++++++++++++++++
> >   1 file changed, 59 insertions(+)
> >
> > diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
> > index bc6b580f42..62fa6aa7d0 100644
> > --- a/tests/checkasm/vvc_mc.c
> > +++ b/tests/checkasm/vvc_mc.c
> > @@ -324,6 +324,64 @@ static void check_avg(void)
> >       report("avg");
> >   }
> >
> > +#define SR_RANGE 2
> > +static void check_dmvr(void)
> > +{
> > +    LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
> > +    LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
> > +    LOCAL_ALIGNED_32(uint8_t,  src0, [SRC_BUF_SIZE]);
> > +    LOCAL_ALIGNED_32(uint8_t,  src1, [SRC_BUF_SIZE]);
> > +    const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
> > +
> > +    VVCDSPContext c;
> > +    declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t
> src_stride, int height,
> > +        intptr_t mx, intptr_t my, int width);
> > +
> > +    for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
> > +        ff_vvc_dsp_init(&c, bit_depth);
> > +        randomize_pixels(src0, src1, SRC_BUF_SIZE);
> > +        for (int i = 0; i < 2; i++) {
> > +            for (int j = 0; j < 2; j++) {
> > +                for (int h = 8; h <= 16; h *= 2) {
> > +                    for (int w = 8; w <= 16; w *= 2) {
> > +                        const int pred_w = w + 2 * SR_RANGE;
> > +                        const int pred_h = h + 2 * SR_RANGE;
> > +                        const int mx     = rnd() %
> VVC_INTER_LUMA_DMVR_FACTS;
> > +                        const int my     = rnd() %
> VVC_INTER_LUMA_DMVR_FACTS;
> > +                        const char *type;
> > +
> > +                        if (w * h < 128)
> > +                            continue;
>
> So h == 8 && w == 8 is not tested?
>
Hi James,
thank you for the review.

Yes, DMVR operates on subblocks with a maximum size of 16x16, and it also
requires that the width multiplied by the height be at least 128.
Therefore, only block sizes of 8x16, 16x8, and 16x16 are valid.

see:
8.5.1 General decoding process for coding units coded in inter prediction
mode
and
https://vicuesoft.com/blog/titles/DMVR_in_VVC/


>
> > +
> > +                        switch ((j << 1) | i) {
> > +                            case 0: type = "dmvr";    break; // 0 0
> > +                            case 1: type = "dmvr_h";  break; // 0 1
> > +                            case 2: type = "dmvr_v";  break; // 1 0
> > +                            case 3: type = "dmvr_hv"; break; // 1 1
> > +                        }
> > +
> > +                        if (check_func(c.inter.dmvr[j][i],
> "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
> > +                            memset(dst0, 0, DST_BUF_SIZE);
> > +                            memset(dst1, 0, DST_BUF_SIZE);
> > +                            call_ref(dst0, src0 + SRC_OFFSET,
> PIXEL_STRIDE, pred_h, mx, my, pred_w);
> > +                            call_new(dst1, src1 + SRC_OFFSET,
> PIXEL_STRIDE, pred_h, mx, my, pred_w);
> > +                            for (int k = 0; k < pred_h; k++) {
> > +                                if (memcmp(dst0 + k * dst_stride, dst1
> + k * dst_stride, pred_w * sizeof(int16_t))) {
> > +                                    fail();
> > +                                    break;
> > +                                }
> > +                            }
> > +
> > +                            bench_new(dst1, src1 + SRC_OFFSET,
> PIXEL_STRIDE, pred_h, mx, my, pred_w);
> > +                        }
> > +                    }
> > +                }
> > +            }
> > +        }
> > +    }
> > +    report("dmvr");
> > +}
> > +
> >   static void check_vvc_sad(void)
> >   {
> >       const int bit_depth = 10;
> > @@ -363,6 +421,7 @@ static void check_vvc_sad(void)
> >
> >   void checkasm_check_vvc_mc(void)
> >   {
> > +    check_dmvr();
> >       check_vvc_sad();
> >       check_put_vvc_luma();
> >       check_put_vvc_luma_uni();
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
  2024-07-26  7:42     ` Nuo Mi
@ 2024-08-11 14:00       ` Nuo Mi
  2024-08-15 12:45         ` Nuo Mi
  0 siblings, 1 reply; 6+ messages in thread
From: Nuo Mi @ 2024-08-11 14:00 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Will apply next week if there are no objections.

On Fri, Jul 26, 2024 at 3:42 PM Nuo Mi <nuomi2021@gmail.com> wrote:

>
>
> On Fri, Jul 26, 2024 at 9:36 AM James Almer <jamrial@gmail.com> wrote:
>
>> On 7/25/2024 10:35 AM, Nuo Mi wrote:
>> > dmvr_8_12x20_c: 186.2
>> > dmvr_8_12x20_avx2: 25.7
>> > dmvr_8_20x12_c: 181.7
>> > dmvr_8_20x12_avx2: 25.2
>> > dmvr_8_20x20_c: 283.2
>> > dmvr_8_20x20_avx2: 32.0
>> > dmvr_10_12x20_c: 90.0
>> > dmvr_10_12x20_avx2: 15.7
>> > dmvr_10_20x12_c: 41.0
>> > dmvr_10_20x12_avx2: 14.7
>> > dmvr_10_20x20_c: 81.5
>> > dmvr_10_20x20_avx2: 26.7
>> > dmvr_12_12x20_c: 190.7
>> > dmvr_12_12x20_avx2: 20.2
>> > dmvr_12_20x12_c: 187.2
>> > dmvr_12_20x12_avx2: 20.2
>> > dmvr_12_20x20_c: 292.7
>> > dmvr_12_20x20_avx2: 27.2
>> > dmvr_h_8_12x20_c: 317.0
>> > dmvr_h_8_12x20_avx2: 37.0
>> > dmvr_h_8_20x12_c: 340.0
>> > dmvr_h_8_20x12_avx2: 41.0
>> > dmvr_h_8_20x20_c: 540.7
>> > dmvr_h_8_20x20_avx2: 64.0
>> > dmvr_h_10_12x20_c: 322.7
>> > dmvr_h_10_12x20_avx2: 30.7
>> > dmvr_h_10_20x12_c: 344.2
>> > dmvr_h_10_20x12_avx2: 34.0
>> > dmvr_h_10_20x20_c: 529.0
>> > dmvr_h_10_20x20_avx2: 51.5
>> > dmvr_h_12_12x20_c: 326.7
>> > dmvr_h_12_12x20_avx2: 33.5
>> > dmvr_h_12_20x12_c: 331.7
>> > dmvr_h_12_20x12_avx2: 51.2
>> > dmvr_h_12_20x20_c: 534.0
>> > dmvr_h_12_20x20_avx2: 62.7
>> > dmvr_hv_8_12x20_c: 650.0
>> > dmvr_hv_8_12x20_avx2: 57.2
>> > dmvr_hv_8_20x12_c: 676.2
>> > dmvr_hv_8_20x12_avx2: 70.0
>> > dmvr_hv_8_20x20_c: 1068.5
>> > dmvr_hv_8_20x20_avx2: 103.2
>> > dmvr_hv_10_12x20_c: 649.0
>> > dmvr_hv_10_12x20_avx2: 48.2
>> > dmvr_hv_10_20x12_c: 677.7
>> > dmvr_hv_10_20x12_avx2: 59.7
>> > dmvr_hv_10_20x20_c: 1093.5
>> > dmvr_hv_10_20x20_avx2: 91.7
>> > dmvr_hv_12_12x20_c: 660.0
>> > dmvr_hv_12_12x20_avx2: 58.7
>> > dmvr_hv_12_20x12_c: 682.7
>> > dmvr_hv_12_20x12_avx2: 72.0
>> > dmvr_hv_12_20x20_c: 1094.0
>> > dmvr_hv_12_20x20_avx2: 113.2
>> > dmvr_v_8_12x20_c: 325.7
>> > dmvr_v_8_12x20_avx2: 31.2
>> > dmvr_v_8_20x12_c: 326.2
>> > dmvr_v_8_20x12_avx2: 38.5
>> > dmvr_v_8_20x20_c: 538.5
>> > dmvr_v_8_20x20_avx2: 54.2
>> > dmvr_v_10_12x20_c: 318.5
>> > dmvr_v_10_12x20_avx2: 23.7
>> > dmvr_v_10_20x12_c: 330.7
>> > dmvr_v_10_20x12_avx2: 40.5
>> > dmvr_v_10_20x20_c: 567.5
>> > dmvr_v_10_20x20_avx2: 48.0
>> > dmvr_v_12_12x20_c: 335.2
>> > dmvr_v_12_12x20_avx2: 30.0
>> > dmvr_v_12_20x12_c: 330.2
>> > dmvr_v_12_20x12_avx2: 39.5
>> > dmvr_v_12_20x20_c: 535.2
>> > dmvr_v_12_20x20_avx2: 60.0
>> > ---
>> >   tests/checkasm/vvc_mc.c | 59 +++++++++++++++++++++++++++++++++++++++++
>> >   1 file changed, 59 insertions(+)
>> >
>> > diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
>> > index bc6b580f42..62fa6aa7d0 100644
>> > --- a/tests/checkasm/vvc_mc.c
>> > +++ b/tests/checkasm/vvc_mc.c
>> > @@ -324,6 +324,64 @@ static void check_avg(void)
>> >       report("avg");
>> >   }
>> >
>> > +#define SR_RANGE 2
>> > +static void check_dmvr(void)
>> > +{
>> > +    LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
>> > +    LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
>> > +    LOCAL_ALIGNED_32(uint8_t,  src0, [SRC_BUF_SIZE]);
>> > +    LOCAL_ALIGNED_32(uint8_t,  src1, [SRC_BUF_SIZE]);
>> > +    const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
>> > +
>> > +    VVCDSPContext c;
>> > +    declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t
>> src_stride, int height,
>> > +        intptr_t mx, intptr_t my, int width);
>> > +
>> > +    for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
>> > +        ff_vvc_dsp_init(&c, bit_depth);
>> > +        randomize_pixels(src0, src1, SRC_BUF_SIZE);
>> > +        for (int i = 0; i < 2; i++) {
>> > +            for (int j = 0; j < 2; j++) {
>> > +                for (int h = 8; h <= 16; h *= 2) {
>> > +                    for (int w = 8; w <= 16; w *= 2) {
>> > +                        const int pred_w = w + 2 * SR_RANGE;
>> > +                        const int pred_h = h + 2 * SR_RANGE;
>> > +                        const int mx     = rnd() %
>> VVC_INTER_LUMA_DMVR_FACTS;
>> > +                        const int my     = rnd() %
>> VVC_INTER_LUMA_DMVR_FACTS;
>> > +                        const char *type;
>> > +
>> > +                        if (w * h < 128)
>> > +                            continue;
>>
>> So h == 8 && w == 8 is not tested?
>>
> Hi James,
> Thank you for the review.
>
> Correct, 8x8 is intentionally not tested. DMVR operates on subblocks with a
> maximum size of 16x16, and it also requires that the width multiplied by
> the height be at least 128.
> Therefore, only block sizes of 8x16, 16x8, and 16x16 are valid.
>
> See:
> 8.5.1 General decoding process for coding units coded in inter prediction
> mode
> and
> https://vicuesoft.com/blog/titles/DMVR_in_VVC/
>
Will apply this next week.
Thank you

>
>
>
>>
>> > +
>> > +                        switch ((j << 1) | i) {
>> > +                            case 0: type = "dmvr";    break; // 0 0
>> > +                            case 1: type = "dmvr_h";  break; // 0 1
>> > +                            case 2: type = "dmvr_v";  break; // 1 0
>> > +                            case 3: type = "dmvr_hv"; break; // 1 1
>> > +                        }
>> > +
>> > +                        if (check_func(c.inter.dmvr[j][i],
>> "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
>> > +                            memset(dst0, 0, DST_BUF_SIZE);
>> > +                            memset(dst1, 0, DST_BUF_SIZE);
>> > +                            call_ref(dst0, src0 + SRC_OFFSET,
>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>> > +                            call_new(dst1, src1 + SRC_OFFSET,
>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>> > +                            for (int k = 0; k < pred_h; k++) {
>> > +                                if (memcmp(dst0 + k * dst_stride, dst1
>> + k * dst_stride, pred_w * sizeof(int16_t))) {
>> > +                                    fail();
>> > +                                    break;
>> > +                                }
>> > +                            }
>> > +
>> > +                            bench_new(dst1, src1 + SRC_OFFSET,
>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>> > +                        }
>> > +                    }
>> > +                }
>> > +            }
>> > +        }
>> > +    }
>> > +    report("dmvr");
>> > +}
>> > +
>> >   static void check_vvc_sad(void)
>> >   {
>> >       const int bit_depth = 10;
>> > @@ -363,6 +421,7 @@ static void check_vvc_sad(void)
>> >
>> >   void checkasm_check_vvc_mc(void)
>> >   {
>> > +    check_dmvr();
>> >       check_vvc_sad();
>> >       check_put_vvc_luma();
>> >       check_put_vvc_luma_uni();
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
  2024-08-11 14:00       ` Nuo Mi
@ 2024-08-15 12:45         ` Nuo Mi
  0 siblings, 0 replies; 6+ messages in thread
From: Nuo Mi @ 2024-08-15 12:45 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Sun, Aug 11, 2024 at 10:00 PM Nuo Mi <nuomi2021@gmail.com> wrote:

> Will apply next week if there are no objections.
>
Done.
Thank you James for reviewing.

>
> On Fri, Jul 26, 2024 at 3:42 PM Nuo Mi <nuomi2021@gmail.com> wrote:
>
>>
>>
>> On Fri, Jul 26, 2024 at 9:36 AM James Almer <jamrial@gmail.com> wrote:
>>
>>> On 7/25/2024 10:35 AM, Nuo Mi wrote:
>>> > dmvr_8_12x20_c: 186.2
>>> > dmvr_8_12x20_avx2: 25.7
>>> > dmvr_8_20x12_c: 181.7
>>> > dmvr_8_20x12_avx2: 25.2
>>> > dmvr_8_20x20_c: 283.2
>>> > dmvr_8_20x20_avx2: 32.0
>>> > dmvr_10_12x20_c: 90.0
>>> > dmvr_10_12x20_avx2: 15.7
>>> > dmvr_10_20x12_c: 41.0
>>> > dmvr_10_20x12_avx2: 14.7
>>> > dmvr_10_20x20_c: 81.5
>>> > dmvr_10_20x20_avx2: 26.7
>>> > dmvr_12_12x20_c: 190.7
>>> > dmvr_12_12x20_avx2: 20.2
>>> > dmvr_12_20x12_c: 187.2
>>> > dmvr_12_20x12_avx2: 20.2
>>> > dmvr_12_20x20_c: 292.7
>>> > dmvr_12_20x20_avx2: 27.2
>>> > dmvr_h_8_12x20_c: 317.0
>>> > dmvr_h_8_12x20_avx2: 37.0
>>> > dmvr_h_8_20x12_c: 340.0
>>> > dmvr_h_8_20x12_avx2: 41.0
>>> > dmvr_h_8_20x20_c: 540.7
>>> > dmvr_h_8_20x20_avx2: 64.0
>>> > dmvr_h_10_12x20_c: 322.7
>>> > dmvr_h_10_12x20_avx2: 30.7
>>> > dmvr_h_10_20x12_c: 344.2
>>> > dmvr_h_10_20x12_avx2: 34.0
>>> > dmvr_h_10_20x20_c: 529.0
>>> > dmvr_h_10_20x20_avx2: 51.5
>>> > dmvr_h_12_12x20_c: 326.7
>>> > dmvr_h_12_12x20_avx2: 33.5
>>> > dmvr_h_12_20x12_c: 331.7
>>> > dmvr_h_12_20x12_avx2: 51.2
>>> > dmvr_h_12_20x20_c: 534.0
>>> > dmvr_h_12_20x20_avx2: 62.7
>>> > dmvr_hv_8_12x20_c: 650.0
>>> > dmvr_hv_8_12x20_avx2: 57.2
>>> > dmvr_hv_8_20x12_c: 676.2
>>> > dmvr_hv_8_20x12_avx2: 70.0
>>> > dmvr_hv_8_20x20_c: 1068.5
>>> > dmvr_hv_8_20x20_avx2: 103.2
>>> > dmvr_hv_10_12x20_c: 649.0
>>> > dmvr_hv_10_12x20_avx2: 48.2
>>> > dmvr_hv_10_20x12_c: 677.7
>>> > dmvr_hv_10_20x12_avx2: 59.7
>>> > dmvr_hv_10_20x20_c: 1093.5
>>> > dmvr_hv_10_20x20_avx2: 91.7
>>> > dmvr_hv_12_12x20_c: 660.0
>>> > dmvr_hv_12_12x20_avx2: 58.7
>>> > dmvr_hv_12_20x12_c: 682.7
>>> > dmvr_hv_12_20x12_avx2: 72.0
>>> > dmvr_hv_12_20x20_c: 1094.0
>>> > dmvr_hv_12_20x20_avx2: 113.2
>>> > dmvr_v_8_12x20_c: 325.7
>>> > dmvr_v_8_12x20_avx2: 31.2
>>> > dmvr_v_8_20x12_c: 326.2
>>> > dmvr_v_8_20x12_avx2: 38.5
>>> > dmvr_v_8_20x20_c: 538.5
>>> > dmvr_v_8_20x20_avx2: 54.2
>>> > dmvr_v_10_12x20_c: 318.5
>>> > dmvr_v_10_12x20_avx2: 23.7
>>> > dmvr_v_10_20x12_c: 330.7
>>> > dmvr_v_10_20x12_avx2: 40.5
>>> > dmvr_v_10_20x20_c: 567.5
>>> > dmvr_v_10_20x20_avx2: 48.0
>>> > dmvr_v_12_12x20_c: 335.2
>>> > dmvr_v_12_12x20_avx2: 30.0
>>> > dmvr_v_12_20x12_c: 330.2
>>> > dmvr_v_12_20x12_avx2: 39.5
>>> > dmvr_v_12_20x20_c: 535.2
>>> > dmvr_v_12_20x20_avx2: 60.0
>>> > ---
>>> >   tests/checkasm/vvc_mc.c | 59
>>> +++++++++++++++++++++++++++++++++++++++++
>>> >   1 file changed, 59 insertions(+)
>>> >
>>> > diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
>>> > index bc6b580f42..62fa6aa7d0 100644
>>> > --- a/tests/checkasm/vvc_mc.c
>>> > +++ b/tests/checkasm/vvc_mc.c
>>> > @@ -324,6 +324,64 @@ static void check_avg(void)
>>> >       report("avg");
>>> >   }
>>> >
>>> > +#define SR_RANGE 2
>>> > +static void check_dmvr(void)
>>> > +{
>>> > +    LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
>>> > +    LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
>>> > +    LOCAL_ALIGNED_32(uint8_t,  src0, [SRC_BUF_SIZE]);
>>> > +    LOCAL_ALIGNED_32(uint8_t,  src1, [SRC_BUF_SIZE]);
>>> > +    const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
>>> > +
>>> > +    VVCDSPContext c;
>>> > +    declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t
>>> src_stride, int height,
>>> > +        intptr_t mx, intptr_t my, int width);
>>> > +
>>> > +    for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
>>> > +        ff_vvc_dsp_init(&c, bit_depth);
>>> > +        randomize_pixels(src0, src1, SRC_BUF_SIZE);
>>> > +        for (int i = 0; i < 2; i++) {
>>> > +            for (int j = 0; j < 2; j++) {
>>> > +                for (int h = 8; h <= 16; h *= 2) {
>>> > +                    for (int w = 8; w <= 16; w *= 2) {
>>> > +                        const int pred_w = w + 2 * SR_RANGE;
>>> > +                        const int pred_h = h + 2 * SR_RANGE;
>>> > +                        const int mx     = rnd() %
>>> VVC_INTER_LUMA_DMVR_FACTS;
>>> > +                        const int my     = rnd() %
>>> VVC_INTER_LUMA_DMVR_FACTS;
>>> > +                        const char *type;
>>> > +
>>> > +                        if (w * h < 128)
>>> > +                            continue;
>>>
>>> So h == 8 && w == 8 is not tested?
>>>
>> Hi James,
>> Thank you for the review.
>>
>> Correct, 8x8 is intentionally not tested. DMVR operates on subblocks with a
>> maximum size of 16x16, and it also requires that the width multiplied by
>> the height be at least 128.
>> Therefore, only block sizes of 8x16, 16x8, and 16x16 are valid.
>>
>> See:
>> 8.5.1 General decoding process for coding units coded in inter prediction
>> mode
>> and
>> https://vicuesoft.com/blog/titles/DMVR_in_VVC/
>>
> Will apply this next week.
> Thank you
>
>>
>>
>>
>>>
>>> > +
>>> > +                        switch ((j << 1) | i) {
>>> > +                            case 0: type = "dmvr";    break; // 0 0
>>> > +                            case 1: type = "dmvr_h";  break; // 0 1
>>> > +                            case 2: type = "dmvr_v";  break; // 1 0
>>> > +                            case 3: type = "dmvr_hv"; break; // 1 1
>>> > +                        }
>>> > +
>>> > +                        if (check_func(c.inter.dmvr[j][i],
>>> "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
>>> > +                            memset(dst0, 0, DST_BUF_SIZE);
>>> > +                            memset(dst1, 0, DST_BUF_SIZE);
>>> > +                            call_ref(dst0, src0 + SRC_OFFSET,
>>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>>> > +                            call_new(dst1, src1 + SRC_OFFSET,
>>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>>> > +                            for (int k = 0; k < pred_h; k++) {
>>> > +                                if (memcmp(dst0 + k * dst_stride,
>>> dst1 + k * dst_stride, pred_w * sizeof(int16_t))) {
>>> > +                                    fail();
>>> > +                                    break;
>>> > +                                }
>>> > +                            }
>>> > +
>>> > +                            bench_new(dst1, src1 + SRC_OFFSET,
>>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>>> > +                        }
>>> > +                    }
>>> > +                }
>>> > +            }
>>> > +        }
>>> > +    }
>>> > +    report("dmvr");
>>> > +}
>>> > +
>>> >   static void check_vvc_sad(void)
>>> >   {
>>> >       const int bit_depth = 10;
>>> > @@ -363,6 +421,7 @@ static void check_vvc_sad(void)
>>> >
>>> >   void checkasm_check_vvc_mc(void)
>>> >   {
>>> > +    check_dmvr();
>>> >       check_vvc_sad();
>>> >       check_put_vvc_luma();
>>> >       check_put_vvc_luma_uni();
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>>
>>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-08-15 12:45 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20240725133546.19125-1-nuomi2021@gmail.com>
2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 2/3] x86/vvcdec: add dmvr avx2 code Nuo Mi
2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr Nuo Mi
2024-07-26  1:20   ` James Almer
2024-07-26  7:42     ` Nuo Mi
2024-08-11 14:00       ` Nuo Mi
2024-08-15 12:45         ` Nuo Mi

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git