* [FFmpeg-devel] [PATCH 2/3] x86/vvcdec: add dmvr avx2 code
[not found] <20240725133546.19125-1-nuomi2021@gmail.com>
@ 2024-07-25 13:35 ` Nuo Mi
2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr Nuo Mi
1 sibling, 0 replies; 6+ messages in thread
From: Nuo Mi @ 2024-07-25 13:35 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Nuo Mi
Decoder-Side Motion Vector Refinement (DMVR) accounts for about 4~8% of CPU usage for some clips.
Here are the results of a single test run:
clips | before| after | delta
------------------------------------------|-------|-------|------
RitualDance_1920x1080_60_10_420_37_RA.266 | 338.7 | 354.3 |4.61%
NovosobornayaSquare_1920x1080.bin | 320.3 | 329.3 |2.81%
Tango2_3840x2160_60_10_420_27_LD.266 | 83.3 | 83.7 |0.48%
RitualDance_1920x1080_60_10_420_32_LD.266 | 320.7 | 327.3 |2.06%
Chimera_8bit_1080P_1000_frames.vvc | 360.7 | 381.0 |5.63%
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 161.7 | 163.0 |0.80%
---
libavcodec/x86/vvc/Makefile | 1 +
libavcodec/x86/vvc/vvc_dmvr.asm | 373 +++++++++++++++++++++++++++++++
libavcodec/x86/vvc/vvcdsp_init.c | 25 +++
3 files changed, 399 insertions(+)
create mode 100644 libavcodec/x86/vvc/vvc_dmvr.asm
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index 832d802daf..04f16bc10c 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -4,6 +4,7 @@ clean::
OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o \
x86/h26x/h2656dsp.o
X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_alf.o \
+ x86/vvc/vvc_dmvr.o \
x86/vvc/vvc_mc.o \
x86/vvc/vvc_sad.o \
x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvc_dmvr.asm b/libavcodec/x86/vvc/vvc_dmvr.asm
new file mode 100644
index 0000000000..4c971f970b
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_dmvr.asm
@@ -0,0 +1,373 @@
+; /*
+; * Provide AVX2 luma dmvr functions for VVC decoding
+; * Copyright (c) 2024 Nuo Mi
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 128
+
+SECTION_RODATA 32
+
+; Multipliers for pmulhrsw, which computes (a * b + (1 << 14)) >> 15 per word:
+; multiplying by 1 << (15 - n) is therefore a rounding right shift by n.
+; shift_12 : n = 12 - 10, used by SHIFT_W16 for 12-bit input
+; shift3_<bd>: n = bd - 6, passed to BILINEAR as its shift multiplier
+shift_12 times 2 dw 1 << (15 - (12 - 10))
+shift3_8 times 2 dw 1 << (15 - (8 - 6))
+shift3_10 times 2 dw 1 << (15 - (10 - 6))
+shift3_12 times 2 dw 1 << (15 - (12 - 6))
+; words of 16; LOAD_COEFFS subtracts the fractional position from it
+pw_16 times 2 dw 16
+
+%if ARCH_X86_64
+
+%if HAVE_AVX2_EXTERNAL
+
+SECTION .text
+
+%define pstride (bd / 10 + 1)
+
+; LOAD_W16(dst, src)
+; Load one vector of 16-bit words from memory operand src into dst.
+; For bd == 8 the source bytes are zero-extended to words (pmovzxbw);
+; for other bit depths the source already holds words and is loaded unaligned.
+%macro LOAD_W16 2
+%if bd == 8
+ pmovzxbw %1, %2
+%else
+ movu %1, %2
+%endif
+%endmacro
+
+; SHIFT_W16(dst/src, shift)
+; Bring the words in %1 from bd bits to 10 bits:
+; bd == 8 shifts left by (10 - bd), bd == 10 needs no work, and any other
+; depth multiplies by %2 with pmulhrsw (rounding right shift).
+; %2 is only read in that last case.
+%macro SHIFT_W16 2
+%if bd == 8
+ psllw %1, (10 - bd)
+%elif bd == 10
+ ; nothing
+%else
+ pmulhrsw %1, %2
+%endif
+%endmacro
+
+; SAVE_W16(dst, src)
+; Store vector register src to memory operand dst with an unaligned move.
+%macro SAVE_W16 2
+ movu %1, %2
+%endmacro
+
+; NEXT_4_LINES(is_h)
+; Advance dstq and srcq by four rows of their respective strides.
+; When is_h is non-zero, src1q is recomputed as the advanced srcq plus one
+; pixel (pstride bytes), i.e. the right-hand neighbour used by the
+; horizontal filter.
+%macro NEXT_4_LINES 1
+ lea dstq, [dstq + dsq*4]
+ lea srcq, [srcq + ssq*4]
+%if %1
+ lea src1q, [srcq + pstride]
+%endif
+%endmacro
+
+
+; DMVR_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3)
+; Convert four rows of one vector each: load, rescale with SHIFT_W16, store.
+; Uses m0-m3 for the rows. m4 is passed to SHIFT_W16 as the shift multiplier;
+; SHIFT_W16 only reads it when bd is neither 8 nor 10, so the caller has to
+; load it for those depths.
+%macro DMVR_4xW16 6
+ LOAD_W16 m0, [%4]
+ LOAD_W16 m1, [%4 + %5]
+ LOAD_W16 m2, [%4 + 2 * %5]
+ LOAD_W16 m3, [%4 + %6]
+
+ SHIFT_W16 m0, m4
+ SHIFT_W16 m1, m4
+ SHIFT_W16 m2, m4
+ SHIFT_W16 m3, m4
+
+ SAVE_W16 [%1] , m0
+ SAVE_W16 [%1 + %2] , m1
+ SAVE_W16 [%1 + 2 * %2], m2
+ SAVE_W16 [%1 + %3] , m3
+%endmacro
+
+; buf += -stride * h + off
+; OFFSET_TO_W4(buf, stride, off)
+; Rewind buf by h rows and then move it off bytes to the right, so the second
+; column pass starts again at the first row.
+; Reads hd, which must still hold the full height, and clobbers iq.
+%macro OFFSET_TO_W4 3
+ mov id, hd
+ imul iq, %2
+ sub %1, iq
+ lea %1, [%1 + %3]
+%endmacro
+
+; OFFSET_TO_W4()
+; Zero-argument overload: reposition both srcq and dstq at row 0, 16 pixels
+; to the right of where the first pass started.
+; 16 * (bd / 10 + 1) is 16 source pixels in bytes (1 byte per pixel for bd 8,
+; 2 bytes otherwise); 16 * 2 is 16 output words in bytes.
+%macro OFFSET_TO_W4 0
+ OFFSET_TO_W4 srcq, ssq, 16 * (bd / 10 + 1)
+ OFFSET_TO_W4 dstq, dsq, 16 * 2
+%endmacro
+
+; void ff_vvc_dmvr_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+; int height, intptr_t mx, intptr_t my, int width);
+;
+; Unfiltered variant (mx and my are not read): every source pixel is only
+; rescaled by SHIFT_W16 and stored as a 16-bit word.
+; Rows are handled four at a time, so a height that is not a multiple of 4
+; leaves the remaining rows unwritten.
+; The first pass covers columns 0..15; when width >= 16 a second pass covers
+; 16 more columns starting at column 16. Both passes always read and write
+; full 16-word vectors, even beyond the requested width.
+; NOTE(review): presumably safe because dst rows are MAX_PB_SIZE words wide and
+; the source is padded - confirm against the callers.
+%macro DMVR_AVX2 1
+cglobal vvc_dmvr_%1, 4, 9, 5, dst, src, ss, h, ds, ds3, w, ss3, i
+%define bd %1
+
+ LOAD_STRIDES
+
+%if %1 > 10
+ ; shift multiplier for SHIFT_W16, only needed above 10 bits
+ vpbroadcastd m4, [shift_%1]
+%endif
+
+ ; only 4 arguments are auto-loaded by cglobal, fetch width by hand
+ mov wd, wm
+ ; id is the row counter of the first pass, hd keeps the full height
+ mov id, hd
+.w16:
+ sub id, 4
+ jl .w16_end
+ DMVR_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q
+ NEXT_4_LINES 0
+ jmp .w16
+.w16_end:
+
+ ; nothing more to do when width < 16
+ sub wd, 16
+ jl .w4_end
+
+ ; second pass: back to row 0, 16 pixels to the right
+ OFFSET_TO_W4
+.w4:
+ sub hd, 4
+ jl .w4_end
+ DMVR_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q
+ NEXT_4_LINES 0
+ jmp .w4
+.w4_end:
+
+ RET
+%endmacro
+
+; LOAD_COEFFS(coeffs0, coeffs1, src)
+; Build the two bilinear weights from the fractional position in src.
+; coeffs0 and coeffs1 are register NUMBERS (the macro prepends m / xm):
+; m<coeffs1> = low word of src broadcast to every word
+; m<coeffs0> = 16 - m<coeffs1>
+%macro LOAD_COEFFS 3
+ movd xm%2, %3
+ vpbroadcastw m%2, xm%2
+ vpbroadcastd m%1, [pw_16]
+ psubw m%1, m%2
+%endmacro
+
+; LOAD_SHIFT(shift, src)
+; Broadcast the pmulhrsw multiplier stored at label src into register shift.
+; For bd == 12 the multiplier is doubled, which compensates for BILINEAR
+; halving its sum before the multiply at that bit depth.
+%macro LOAD_SHIFT 2
+ vpbroadcastd %1, [%2]
+%if bd == 12
+ psllw %1, 1 ; avoid signed mul for pmulhrsw
+%endif
+%endmacro
+
+; LOAD_STRIDES()
+; Takes no arguments. Sets up the stride registers used by the row macros:
+; dsq = MAX_PB_SIZE * 2 (bytes per destination row of 16-bit words)
+; ss3q = 3 * ssq
+; ds3q = 3 * dsq
+; dsq and ds3q are the 5th and 6th names in the cglobal lists, i.e. the
+; argument slots of mx and my, so those must be consumed before this macro.
+%macro LOAD_STRIDES 0
+ mov dsq, MAX_PB_SIZE * 2
+ lea ss3q, [ssq*3]
+ lea ds3q, [dsq*3]
+%endmacro
+
+; BILINEAR(dst/src0, src1, coeff0, coeff1, round, tmp)
+; dst = pmulhrsw(src0 * coeff0 + src1 * coeff1, round)
+; "round" is the pmulhrsw multiplier prepared by LOAD_SHIFT, so the result is
+; the weighted sum with a rounding right shift applied.
+; tmp is clobbered; it may be the same register as src1, since src1 is only
+; read by the instruction that writes tmp.
+; For bd == 12 the sum is first shifted right by one bit (logical shift) and
+; LOAD_SHIFT doubles the multiplier to match.
+%macro BILINEAR 6
+ pmullw %1, %3
+ pmullw %6, %2, %4
+ paddw %1, %6
+%if bd == 12
+ psrlw %1, 1 ; avoid signed mul for pmulhrsw
+%endif
+ pmulhrsw %1, %5
+%endmacro
+
+; DMVR_H_1xW16(dst, src0, src1, offset, tmp)
+; Horizontally filter one row: load the vector at src0 + offset and the one at
+; src1 + offset (src1 is src0 shifted by one pixel) and blend them with
+; BILINEAR. Expects the horizontal weights in m10 / m11 and the shift
+; multiplier in m12; tmp is clobbered.
+%macro DMVR_H_1xW16 5
+ LOAD_W16 %1, [%2 + %4]
+ LOAD_W16 %5, [%3 + %4]
+ BILINEAR %1, %5, m10, m11, m12, %5
+%endmacro
+
+; DMVR_H_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3, src1)
+; Horizontally filter four rows and store them.
+; Results are built in m0-m3 with m4-m7 as temporaries; m10-m12 must hold the
+; weights and shift multiplier expected by DMVR_H_1xW16.
+%macro DMVR_H_4xW16 7
+ DMVR_H_1xW16 m0, %4, %7, 0, m4
+ DMVR_H_1xW16 m1, %4, %7, %5, m5
+ DMVR_H_1xW16 m2, %4, %7, 2 * %5, m6
+ DMVR_H_1xW16 m3, %4, %7, %6, m7
+
+ SAVE_W16 [%1] , m0
+ SAVE_W16 [%1 + %2] , m1
+ SAVE_W16 [%1 + 2 * %2], m2
+ SAVE_W16 [%1 + %3] , m3
+%endmacro
+
+; void ff_vvc_dmvr_h_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+; int height, intptr_t mx, intptr_t my, int width);
+;
+; Horizontal-only bilinear variant: each output is a blend of a pixel and its
+; right neighbour, weighted by mx (my is not read).
+; Same two-pass layout as the unfiltered function: columns 0..15 first, then
+; 16 more columns when width >= 16; rows are handled four at a time.
+%macro DMVR_H_AVX2 1
+cglobal vvc_dmvr_h_%1, 4, 10, 13, dst, src, ss, h, ds, ds3, w, ss3, src1, i
+%define bd %1
+
+ ; dsm is the 5th argument (mx); read it before LOAD_STRIDES reuses dsq
+ LOAD_COEFFS 10, 11, dsm
+ LOAD_SHIFT m12, shift3_%1
+
+ LOAD_STRIDES
+ ; src1q = right-hand neighbour of srcq
+ lea src1q, [srcq + pstride]
+
+ ; only 4 arguments are auto-loaded by cglobal, fetch width by hand
+ mov wd, wm
+ ; id is the row counter of the first pass, hd keeps the full height
+ mov id, hd
+.w16:
+ sub id, 4
+ jl .w16_end
+ DMVR_H_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
+ NEXT_4_LINES 1
+ jmp .w16
+.w16_end:
+
+ ; nothing more to do when width < 16
+ sub wd, 16
+ jl .w4_end
+
+ ; second pass: back to row 0, 16 pixels to the right
+ OFFSET_TO_W4
+ lea src1q, [srcq + pstride]
+.w4:
+ sub hd, 4
+ jl .w4_end
+ DMVR_H_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
+ NEXT_4_LINES 1
+ jmp .w4
+.w4_end:
+
+ RET
+%endmacro
+
+; DMVR_V_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3)
+; Vertically filter four rows. On entry m0 must hold the row at src (loaded
+; by the caller or carried over from the previous call); rows src+1..src+4
+; are loaded here and each output row blends a row with the one below it.
+; Expects the vertical weights in m8 / m9 and the shift multiplier in m10;
+; m11-m14 are temporaries.
+%macro DMVR_V_4xW16 6
+ LOAD_W16 m1, [%4 + %5]
+ LOAD_W16 m2, [%4 + 2 * %5]
+ LOAD_W16 m3, [%4 + %6]
+ LOAD_W16 m4, [%4 + 4 * %5]
+
+ BILINEAR m0, m1, m8, m9, m10, m11
+ BILINEAR m1, m2, m8, m9, m10, m12
+ BILINEAR m2, m3, m8, m9, m10, m13
+ BILINEAR m3, m4, m8, m9, m10, m14
+
+ SAVE_W16 [%1] , m0
+ SAVE_W16 [%1 + %2] , m1
+ SAVE_W16 [%1 + 2 * %2], m2
+ SAVE_W16 [%1 + %3] , m3
+
+ ; Carry the last loaded row into the next call as its first row.
+ ; SWAP m0, m4 cannot be used: it only renames registers at assembly time,
+ ; while this body is re-executed by a run-time loop.
+ movaps m0, m4
+%endmacro
+
+; void ff_vvc_dmvr_v_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+; int height, intptr_t mx, intptr_t my, int width);
+;
+; Vertical-only bilinear variant: each output is a blend of a pixel and the
+; one below it, weighted by my (mx is not read).
+; Same two-pass layout as the other variants; the row above each group of
+; four is kept in m0 between iterations, so it is primed before each pass.
+%macro DMVR_V_AVX2 1
+cglobal vvc_dmvr_v_%1, 4, 9, 15, dst, src, ss, h, ds, ds3, w, ss3, i
+%define bd %1
+
+ ; ds3m is the 6th argument (my); read it before LOAD_STRIDES reuses ds3q
+ LOAD_COEFFS 8, 9, ds3m
+ LOAD_SHIFT m10, shift3_%1
+
+ LOAD_STRIDES
+
+ ; only 4 arguments are auto-loaded by cglobal, fetch width by hand
+ mov wd, wm
+ ; id is the row counter of the first pass, hd keeps the full height
+ mov id, hd
+ ; prime m0 with the first source row
+ LOAD_W16 m0, [srcq]
+.w16:
+ sub id, 4
+ jl .w16_end
+ DMVR_V_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q
+ NEXT_4_LINES 0
+ jmp .w16
+.w16_end:
+
+ ; nothing more to do when width < 16
+ sub wd, 16
+ jl .w4_end
+
+ ; second pass: back to row 0, 16 pixels to the right, and re-prime m0
+ OFFSET_TO_W4
+ LOAD_W16 m0, [srcq]
+.w4:
+ sub hd, 4
+ jl .w4_end
+ DMVR_V_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q
+ NEXT_4_LINES 0
+ jmp .w4
+.w4_end:
+
+ RET
+%endmacro
+
+; DMVR_HV_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3, src1)
+; Horizontal + vertical filter for four rows. On entry m0 must hold the
+; horizontally filtered row at src; rows src+1..src+4 are filtered
+; horizontally here (weights m10 / m11, shift m12 via DMVR_H_1xW16) and then
+; blended vertically pairwise (weights m13 / m14, shift m15).
+; m6-m9 are temporaries, reused by both filter stages.
+%macro DMVR_HV_4xW16 7
+ DMVR_H_1xW16 m1, %4, %7, %5, m6
+ DMVR_H_1xW16 m2, %4, %7, 2 * %5, m7
+ DMVR_H_1xW16 m3, %4, %7, %6, m8
+ DMVR_H_1xW16 m4, %4, %7, 4 * %5, m9
+
+ BILINEAR m0, m1, m13, m14, m15, m6
+ BILINEAR m1, m2, m13, m14, m15, m7
+ BILINEAR m2, m3, m13, m14, m15, m8
+ BILINEAR m3, m4, m13, m14, m15, m9
+
+ SAVE_W16 [%1] , m0
+ SAVE_W16 [%1 + %2] , m1
+ SAVE_W16 [%1 + 2 * %2], m2
+ SAVE_W16 [%1 + %3] , m3
+
+ ; Carry the last filtered row into the next call as its first row.
+ ; SWAP m0, m4 cannot be used: it only renames registers at assembly time,
+ ; while this body is re-executed by a run-time loop.
+ movaps m0, m4
+%endmacro
+
+; void ff_vvc_dmvr_hv_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+; int height, intptr_t mx, intptr_t my, int width);
+;
+; Two-dimensional bilinear variant: rows are filtered horizontally with mx and
+; the filtered rows are then blended vertically with my.
+; Unlike the other variants, cglobal loads all 7 arguments here, so wd is
+; valid without an explicit "mov wd, wm".
+; Same two-pass layout as the other variants; the filtered row above each
+; group of four is kept in m0, so it is primed before each pass.
+%macro DMVR_HV_AVX2 1
+cglobal vvc_dmvr_hv_%1, 7, 10, 16, dst, src, ss, h, ds, ds3, w, ss3, src1, i
+%define bd %1
+
+ ; horizontal stage: weights from mx (5th argument), shift for this depth
+ LOAD_COEFFS 10, 11, dsm
+ LOAD_SHIFT m12, shift3_%1
+
+ ; vertical stage: weights from my (6th argument); the shift is always the
+ ; 10-bit one.
+ ; NOTE(review): presumably because the horizontal stage already reduced the
+ ; rows to 10-bit precision - confirm against the C reference.
+ LOAD_COEFFS 13, 14, ds3m
+ LOAD_SHIFT m15, shift3_10
+
+ LOAD_STRIDES
+ ; src1q = right-hand neighbour of srcq
+ lea src1q, [srcq + pstride]
+
+ ; id is the row counter of the first pass, hd keeps the full height
+ mov id, hd
+ ; prime m0 with the horizontally filtered first row
+ DMVR_H_1xW16 m0, srcq, src1q, 0, m5
+.w16:
+ sub id, 4
+ jl .w16_end
+ DMVR_HV_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
+ NEXT_4_LINES 1
+ jmp .w16
+.w16_end:
+
+ ; nothing more to do when width < 16
+ sub wd, 16
+ jl .w4_end
+
+ ; second pass: back to row 0, 16 pixels to the right, and re-prime m0
+ OFFSET_TO_W4
+ lea src1q, [srcq + pstride]
+
+ DMVR_H_1xW16 m0, srcq, src1q, 0, m5
+.w4:
+ sub hd, 4
+ jl .w4_end
+ DMVR_HV_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q, src1q
+ NEXT_4_LINES 1
+ jmp .w4
+.w4_end:
+
+ RET
+%endmacro
+
+; VVC_DMVR_AVX2(bit_depth)
+; Instantiate the four DMVR functions (unfiltered, h, v, hv) for one bit depth.
+%macro VVC_DMVR_AVX2 1
+ DMVR_AVX2 %1
+ DMVR_H_AVX2 %1
+ DMVR_V_AVX2 %1
+ DMVR_HV_AVX2 %1
+%endmacro
+
+INIT_YMM avx2
+
+VVC_DMVR_AVX2 8
+VVC_DMVR_AVX2 10
+VVC_DMVR_AVX2 12
+
+%endif ; HAVE_AVX2_EXTERNAL
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 4b4a2aa937..d5b4f4f8a5 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -87,6 +87,21 @@ AVG_PROTOTYPES( 8, avx2)
AVG_PROTOTYPES(10, avx2)
AVG_PROTOTYPES(12, avx2)
+
+// Declare the four assembly DMVR entry points (unfiltered, h, v, hv) for one
+// bit depth and one CPU extension suffix. All four share the same signature.
+#define DMVR_PROTOTYPES(bd, opt) \
+void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
+ int height, intptr_t mx, intptr_t my, int width); \
+void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
+ int height, intptr_t mx, intptr_t my, int width); \
+void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
+ int height, intptr_t mx, intptr_t my, int width); \
+void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
+ int height, intptr_t mx, intptr_t my, int width); \
+
+DMVR_PROTOTYPES( 8, avx2)
+DMVR_PROTOTYPES(10, avx2)
+DMVR_PROTOTYPES(12, avx2)
+
#define ALF_BPC_PROTOTYPES(bpc, opt) \
void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
@@ -306,6 +321,13 @@ ALF_FUNCS(16, 12, avx2)
c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
} while (0)
+// Install the AVX2 DMVR functions for one bit depth.
+// The table is indexed as dmvr[vertical filter][horizontal filter]:
+// [0][0] unfiltered, [0][1] h only, [1][0] v only, [1][1] h and v.
+#define DMVR_INIT(bd) do { \
+ c->inter.dmvr[0][0] = ff_vvc_dmvr_##bd##_avx2; \
+ c->inter.dmvr[0][1] = ff_vvc_dmvr_h_##bd##_avx2; \
+ c->inter.dmvr[1][0] = ff_vvc_dmvr_v_##bd##_avx2; \
+ c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_avx2; \
+} while (0)
+
#define ALF_INIT(bd) do { \
c->alf.filter[LUMA] = ff_vvc_alf_filter_luma_##bd##_avx2; \
c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2; \
@@ -330,6 +352,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
ALF_INIT(8);
AVG_INIT(8, avx2);
MC_LINKS_AVX2(8);
+ DMVR_INIT(8);
SAD_INIT();
}
break;
@@ -342,6 +365,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
AVG_INIT(10, avx2);
MC_LINKS_AVX2(10);
MC_LINKS_16BPC_AVX2(10);
+ DMVR_INIT(10);
SAD_INIT();
}
break;
@@ -354,6 +378,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
AVG_INIT(12, avx2);
MC_LINKS_AVX2(12);
MC_LINKS_16BPC_AVX2(12);
+ DMVR_INIT(12);
SAD_INIT();
}
break;
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
[not found] <20240725133546.19125-1-nuomi2021@gmail.com>
2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 2/3] x86/vvcdec: add dmvr avx2 code Nuo Mi
@ 2024-07-25 13:35 ` Nuo Mi
2024-07-26 1:20 ` James Almer
1 sibling, 1 reply; 6+ messages in thread
From: Nuo Mi @ 2024-07-25 13:35 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Nuo Mi
dmvr_8_12x20_c: 186.2
dmvr_8_12x20_avx2: 25.7
dmvr_8_20x12_c: 181.7
dmvr_8_20x12_avx2: 25.2
dmvr_8_20x20_c: 283.2
dmvr_8_20x20_avx2: 32.0
dmvr_10_12x20_c: 90.0
dmvr_10_12x20_avx2: 15.7
dmvr_10_20x12_c: 41.0
dmvr_10_20x12_avx2: 14.7
dmvr_10_20x20_c: 81.5
dmvr_10_20x20_avx2: 26.7
dmvr_12_12x20_c: 190.7
dmvr_12_12x20_avx2: 20.2
dmvr_12_20x12_c: 187.2
dmvr_12_20x12_avx2: 20.2
dmvr_12_20x20_c: 292.7
dmvr_12_20x20_avx2: 27.2
dmvr_h_8_12x20_c: 317.0
dmvr_h_8_12x20_avx2: 37.0
dmvr_h_8_20x12_c: 340.0
dmvr_h_8_20x12_avx2: 41.0
dmvr_h_8_20x20_c: 540.7
dmvr_h_8_20x20_avx2: 64.0
dmvr_h_10_12x20_c: 322.7
dmvr_h_10_12x20_avx2: 30.7
dmvr_h_10_20x12_c: 344.2
dmvr_h_10_20x12_avx2: 34.0
dmvr_h_10_20x20_c: 529.0
dmvr_h_10_20x20_avx2: 51.5
dmvr_h_12_12x20_c: 326.7
dmvr_h_12_12x20_avx2: 33.5
dmvr_h_12_20x12_c: 331.7
dmvr_h_12_20x12_avx2: 51.2
dmvr_h_12_20x20_c: 534.0
dmvr_h_12_20x20_avx2: 62.7
dmvr_hv_8_12x20_c: 650.0
dmvr_hv_8_12x20_avx2: 57.2
dmvr_hv_8_20x12_c: 676.2
dmvr_hv_8_20x12_avx2: 70.0
dmvr_hv_8_20x20_c: 1068.5
dmvr_hv_8_20x20_avx2: 103.2
dmvr_hv_10_12x20_c: 649.0
dmvr_hv_10_12x20_avx2: 48.2
dmvr_hv_10_20x12_c: 677.7
dmvr_hv_10_20x12_avx2: 59.7
dmvr_hv_10_20x20_c: 1093.5
dmvr_hv_10_20x20_avx2: 91.7
dmvr_hv_12_12x20_c: 660.0
dmvr_hv_12_12x20_avx2: 58.7
dmvr_hv_12_20x12_c: 682.7
dmvr_hv_12_20x12_avx2: 72.0
dmvr_hv_12_20x20_c: 1094.0
dmvr_hv_12_20x20_avx2: 113.2
dmvr_v_8_12x20_c: 325.7
dmvr_v_8_12x20_avx2: 31.2
dmvr_v_8_20x12_c: 326.2
dmvr_v_8_20x12_avx2: 38.5
dmvr_v_8_20x20_c: 538.5
dmvr_v_8_20x20_avx2: 54.2
dmvr_v_10_12x20_c: 318.5
dmvr_v_10_12x20_avx2: 23.7
dmvr_v_10_20x12_c: 330.7
dmvr_v_10_20x12_avx2: 40.5
dmvr_v_10_20x20_c: 567.5
dmvr_v_10_20x20_avx2: 48.0
dmvr_v_12_12x20_c: 335.2
dmvr_v_12_12x20_avx2: 30.0
dmvr_v_12_20x12_c: 330.2
dmvr_v_12_20x12_avx2: 39.5
dmvr_v_12_20x20_c: 535.2
dmvr_v_12_20x20_avx2: 60.0
---
tests/checkasm/vvc_mc.c | 59 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 59 insertions(+)
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index bc6b580f42..62fa6aa7d0 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -324,6 +324,64 @@ static void check_avg(void)
report("avg");
}
+#define SR_RANGE 2
+/*
+ * Compare every optimized DMVR function against the C reference for all bit
+ * depths, all four filter variants and the block sizes that pass the
+ * w * h >= 128 filter below (8x16, 16x8, 16x16, each extended by SR_RANGE
+ * on every side), then benchmark the optimized version.
+ */
+static void check_dmvr(void)
+{
+    LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+    // Row stride of the destination in int16_t ELEMENTS. dst0/dst1 are
+    // uint16_t pointers, so a byte stride here would step two rows at a time
+    // and leave every odd output row unchecked.
+    const int dst_stride = MAX_PB_SIZE;
+
+    VVCDSPContext c;
+    declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
+                 intptr_t mx, intptr_t my, int width);
+
+    for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        ff_vvc_dsp_init(&c, bit_depth);
+        randomize_pixels(src0, src1, SRC_BUF_SIZE);
+        for (int i = 0; i < 2; i++) {
+            for (int j = 0; j < 2; j++) {
+                for (int h = 8; h <= 16; h *= 2) {
+                    for (int w = 8; w <= 16; w *= 2) {
+                        const int pred_w = w + 2 * SR_RANGE;
+                        const int pred_h = h + 2 * SR_RANGE;
+                        const int mx = rnd() % VVC_INTER_LUMA_DMVR_FACTS;
+                        const int my = rnd() % VVC_INTER_LUMA_DMVR_FACTS;
+                        const char *type;
+
+                        // skip blocks smaller than 128 samples (8x8)
+                        if (w * h < 128)
+                            continue;
+
+                        switch ((j << 1) | i) {
+                        case 0: type = "dmvr"; break; // 0 0
+                        case 1: type = "dmvr_h"; break; // 0 1
+                        case 2: type = "dmvr_v"; break; // 1 0
+                        case 3: type = "dmvr_hv"; break; // 1 1
+                        }
+
+                        if (check_func(c.inter.dmvr[j][i], "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
+                            // the buffers hold DST_BUF_SIZE elements, not bytes
+                            memset(dst0, 0, DST_BUF_SIZE * sizeof(*dst0));
+                            memset(dst1, 0, DST_BUF_SIZE * sizeof(*dst1));
+                            call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
+                            call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
+                            // compare only the pred_w valid samples of each row
+                            for (int k = 0; k < pred_h; k++) {
+                                if (memcmp(dst0 + k * dst_stride, dst1 + k * dst_stride, pred_w * sizeof(int16_t))) {
+                                    fail();
+                                    break;
+                                }
+                            }
+
+                            bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    report("dmvr");
+}
+
static void check_vvc_sad(void)
{
const int bit_depth = 10;
@@ -363,6 +421,7 @@ static void check_vvc_sad(void)
void checkasm_check_vvc_mc(void)
{
+ check_dmvr();
check_vvc_sad();
check_put_vvc_luma();
check_put_vvc_luma_uni();
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr Nuo Mi
@ 2024-07-26 1:20 ` James Almer
2024-07-26 7:42 ` Nuo Mi
0 siblings, 1 reply; 6+ messages in thread
From: James Almer @ 2024-07-26 1:20 UTC (permalink / raw)
To: ffmpeg-devel
On 7/25/2024 10:35 AM, Nuo Mi wrote:
> dmvr_8_12x20_c: 186.2
> dmvr_8_12x20_avx2: 25.7
> dmvr_8_20x12_c: 181.7
> dmvr_8_20x12_avx2: 25.2
> dmvr_8_20x20_c: 283.2
> dmvr_8_20x20_avx2: 32.0
> dmvr_10_12x20_c: 90.0
> dmvr_10_12x20_avx2: 15.7
> dmvr_10_20x12_c: 41.0
> dmvr_10_20x12_avx2: 14.7
> dmvr_10_20x20_c: 81.5
> dmvr_10_20x20_avx2: 26.7
> dmvr_12_12x20_c: 190.7
> dmvr_12_12x20_avx2: 20.2
> dmvr_12_20x12_c: 187.2
> dmvr_12_20x12_avx2: 20.2
> dmvr_12_20x20_c: 292.7
> dmvr_12_20x20_avx2: 27.2
> dmvr_h_8_12x20_c: 317.0
> dmvr_h_8_12x20_avx2: 37.0
> dmvr_h_8_20x12_c: 340.0
> dmvr_h_8_20x12_avx2: 41.0
> dmvr_h_8_20x20_c: 540.7
> dmvr_h_8_20x20_avx2: 64.0
> dmvr_h_10_12x20_c: 322.7
> dmvr_h_10_12x20_avx2: 30.7
> dmvr_h_10_20x12_c: 344.2
> dmvr_h_10_20x12_avx2: 34.0
> dmvr_h_10_20x20_c: 529.0
> dmvr_h_10_20x20_avx2: 51.5
> dmvr_h_12_12x20_c: 326.7
> dmvr_h_12_12x20_avx2: 33.5
> dmvr_h_12_20x12_c: 331.7
> dmvr_h_12_20x12_avx2: 51.2
> dmvr_h_12_20x20_c: 534.0
> dmvr_h_12_20x20_avx2: 62.7
> dmvr_hv_8_12x20_c: 650.0
> dmvr_hv_8_12x20_avx2: 57.2
> dmvr_hv_8_20x12_c: 676.2
> dmvr_hv_8_20x12_avx2: 70.0
> dmvr_hv_8_20x20_c: 1068.5
> dmvr_hv_8_20x20_avx2: 103.2
> dmvr_hv_10_12x20_c: 649.0
> dmvr_hv_10_12x20_avx2: 48.2
> dmvr_hv_10_20x12_c: 677.7
> dmvr_hv_10_20x12_avx2: 59.7
> dmvr_hv_10_20x20_c: 1093.5
> dmvr_hv_10_20x20_avx2: 91.7
> dmvr_hv_12_12x20_c: 660.0
> dmvr_hv_12_12x20_avx2: 58.7
> dmvr_hv_12_20x12_c: 682.7
> dmvr_hv_12_20x12_avx2: 72.0
> dmvr_hv_12_20x20_c: 1094.0
> dmvr_hv_12_20x20_avx2: 113.2
> dmvr_v_8_12x20_c: 325.7
> dmvr_v_8_12x20_avx2: 31.2
> dmvr_v_8_20x12_c: 326.2
> dmvr_v_8_20x12_avx2: 38.5
> dmvr_v_8_20x20_c: 538.5
> dmvr_v_8_20x20_avx2: 54.2
> dmvr_v_10_12x20_c: 318.5
> dmvr_v_10_12x20_avx2: 23.7
> dmvr_v_10_20x12_c: 330.7
> dmvr_v_10_20x12_avx2: 40.5
> dmvr_v_10_20x20_c: 567.5
> dmvr_v_10_20x20_avx2: 48.0
> dmvr_v_12_12x20_c: 335.2
> dmvr_v_12_12x20_avx2: 30.0
> dmvr_v_12_20x12_c: 330.2
> dmvr_v_12_20x12_avx2: 39.5
> dmvr_v_12_20x20_c: 535.2
> dmvr_v_12_20x20_avx2: 60.0
> ---
> tests/checkasm/vvc_mc.c | 59 +++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 59 insertions(+)
>
> diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
> index bc6b580f42..62fa6aa7d0 100644
> --- a/tests/checkasm/vvc_mc.c
> +++ b/tests/checkasm/vvc_mc.c
> @@ -324,6 +324,64 @@ static void check_avg(void)
> report("avg");
> }
>
> +#define SR_RANGE 2
> +static void check_dmvr(void)
> +{
> + LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
> + LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
> + LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
> + LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
> + const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
> +
> + VVCDSPContext c;
> + declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
> + intptr_t mx, intptr_t my, int width);
> +
> + for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
> + ff_vvc_dsp_init(&c, bit_depth);
> + randomize_pixels(src0, src1, SRC_BUF_SIZE);
> + for (int i = 0; i < 2; i++) {
> + for (int j = 0; j < 2; j++) {
> + for (int h = 8; h <= 16; h *= 2) {
> + for (int w = 8; w <= 16; w *= 2) {
> + const int pred_w = w + 2 * SR_RANGE;
> + const int pred_h = h + 2 * SR_RANGE;
> + const int mx = rnd() % VVC_INTER_LUMA_DMVR_FACTS;
> + const int my = rnd() % VVC_INTER_LUMA_DMVR_FACTS;
> + const char *type;
> +
> + if (w * h < 128)
> + continue;
So h == 8 && w == 8 is not tested?
> +
> + switch ((j << 1) | i) {
> + case 0: type = "dmvr"; break; // 0 0
> + case 1: type = "dmvr_h"; break; // 0 1
> + case 2: type = "dmvr_v"; break; // 1 0
> + case 3: type = "dmvr_hv"; break; // 1 1
> + }
> +
> + if (check_func(c.inter.dmvr[j][i], "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
> + memset(dst0, 0, DST_BUF_SIZE);
> + memset(dst1, 0, DST_BUF_SIZE);
> + call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> + call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> + for (int k = 0; k < pred_h; k++) {
> + if (memcmp(dst0 + k * dst_stride, dst1 + k * dst_stride, pred_w * sizeof(int16_t))) {
> + fail();
> + break;
> + }
> + }
> +
> + bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, pred_h, mx, my, pred_w);
> + }
> + }
> + }
> + }
> + }
> + }
> + report("dmvr");
> +}
> +
> static void check_vvc_sad(void)
> {
> const int bit_depth = 10;
> @@ -363,6 +421,7 @@ static void check_vvc_sad(void)
>
> void checkasm_check_vvc_mc(void)
> {
> + check_dmvr();
> check_vvc_sad();
> check_put_vvc_luma();
> check_put_vvc_luma_uni();
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
2024-07-26 1:20 ` James Almer
@ 2024-07-26 7:42 ` Nuo Mi
2024-08-11 14:00 ` Nuo Mi
0 siblings, 1 reply; 6+ messages in thread
From: Nuo Mi @ 2024-07-26 7:42 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, Jul 26, 2024 at 9:36 AM James Almer <jamrial@gmail.com> wrote:
> On 7/25/2024 10:35 AM, Nuo Mi wrote:
> > dmvr_8_12x20_c: 186.2
> > dmvr_8_12x20_avx2: 25.7
> > dmvr_8_20x12_c: 181.7
> > dmvr_8_20x12_avx2: 25.2
> > dmvr_8_20x20_c: 283.2
> > dmvr_8_20x20_avx2: 32.0
> > dmvr_10_12x20_c: 90.0
> > dmvr_10_12x20_avx2: 15.7
> > dmvr_10_20x12_c: 41.0
> > dmvr_10_20x12_avx2: 14.7
> > dmvr_10_20x20_c: 81.5
> > dmvr_10_20x20_avx2: 26.7
> > dmvr_12_12x20_c: 190.7
> > dmvr_12_12x20_avx2: 20.2
> > dmvr_12_20x12_c: 187.2
> > dmvr_12_20x12_avx2: 20.2
> > dmvr_12_20x20_c: 292.7
> > dmvr_12_20x20_avx2: 27.2
> > dmvr_h_8_12x20_c: 317.0
> > dmvr_h_8_12x20_avx2: 37.0
> > dmvr_h_8_20x12_c: 340.0
> > dmvr_h_8_20x12_avx2: 41.0
> > dmvr_h_8_20x20_c: 540.7
> > dmvr_h_8_20x20_avx2: 64.0
> > dmvr_h_10_12x20_c: 322.7
> > dmvr_h_10_12x20_avx2: 30.7
> > dmvr_h_10_20x12_c: 344.2
> > dmvr_h_10_20x12_avx2: 34.0
> > dmvr_h_10_20x20_c: 529.0
> > dmvr_h_10_20x20_avx2: 51.5
> > dmvr_h_12_12x20_c: 326.7
> > dmvr_h_12_12x20_avx2: 33.5
> > dmvr_h_12_20x12_c: 331.7
> > dmvr_h_12_20x12_avx2: 51.2
> > dmvr_h_12_20x20_c: 534.0
> > dmvr_h_12_20x20_avx2: 62.7
> > dmvr_hv_8_12x20_c: 650.0
> > dmvr_hv_8_12x20_avx2: 57.2
> > dmvr_hv_8_20x12_c: 676.2
> > dmvr_hv_8_20x12_avx2: 70.0
> > dmvr_hv_8_20x20_c: 1068.5
> > dmvr_hv_8_20x20_avx2: 103.2
> > dmvr_hv_10_12x20_c: 649.0
> > dmvr_hv_10_12x20_avx2: 48.2
> > dmvr_hv_10_20x12_c: 677.7
> > dmvr_hv_10_20x12_avx2: 59.7
> > dmvr_hv_10_20x20_c: 1093.5
> > dmvr_hv_10_20x20_avx2: 91.7
> > dmvr_hv_12_12x20_c: 660.0
> > dmvr_hv_12_12x20_avx2: 58.7
> > dmvr_hv_12_20x12_c: 682.7
> > dmvr_hv_12_20x12_avx2: 72.0
> > dmvr_hv_12_20x20_c: 1094.0
> > dmvr_hv_12_20x20_avx2: 113.2
> > dmvr_v_8_12x20_c: 325.7
> > dmvr_v_8_12x20_avx2: 31.2
> > dmvr_v_8_20x12_c: 326.2
> > dmvr_v_8_20x12_avx2: 38.5
> > dmvr_v_8_20x20_c: 538.5
> > dmvr_v_8_20x20_avx2: 54.2
> > dmvr_v_10_12x20_c: 318.5
> > dmvr_v_10_12x20_avx2: 23.7
> > dmvr_v_10_20x12_c: 330.7
> > dmvr_v_10_20x12_avx2: 40.5
> > dmvr_v_10_20x20_c: 567.5
> > dmvr_v_10_20x20_avx2: 48.0
> > dmvr_v_12_12x20_c: 335.2
> > dmvr_v_12_12x20_avx2: 30.0
> > dmvr_v_12_20x12_c: 330.2
> > dmvr_v_12_20x12_avx2: 39.5
> > dmvr_v_12_20x20_c: 535.2
> > dmvr_v_12_20x20_avx2: 60.0
> > ---
> > tests/checkasm/vvc_mc.c | 59 +++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 59 insertions(+)
> >
> > diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
> > index bc6b580f42..62fa6aa7d0 100644
> > --- a/tests/checkasm/vvc_mc.c
> > +++ b/tests/checkasm/vvc_mc.c
> > @@ -324,6 +324,64 @@ static void check_avg(void)
> > report("avg");
> > }
> >
> > +#define SR_RANGE 2
> > +static void check_dmvr(void)
> > +{
> > + LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
> > + LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
> > + LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
> > + LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
> > + const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
> > +
> > + VVCDSPContext c;
> > + declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t
> src_stride, int height,
> > + intptr_t mx, intptr_t my, int width);
> > +
> > + for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
> > + ff_vvc_dsp_init(&c, bit_depth);
> > + randomize_pixels(src0, src1, SRC_BUF_SIZE);
> > + for (int i = 0; i < 2; i++) {
> > + for (int j = 0; j < 2; j++) {
> > + for (int h = 8; h <= 16; h *= 2) {
> > + for (int w = 8; w <= 16; w *= 2) {
> > + const int pred_w = w + 2 * SR_RANGE;
> > + const int pred_h = h + 2 * SR_RANGE;
> > + const int mx = rnd() %
> VVC_INTER_LUMA_DMVR_FACTS;
> > + const int my = rnd() %
> VVC_INTER_LUMA_DMVR_FACTS;
> > + const char *type;
> > +
> > + if (w * h < 128)
> > + continue;
>
> So h == 8 && w == 8 is not tested?
>
Hi James,
thank you for the review.
Yes, DMVR operates on subblocks with a maximum size of 16x16, and it also
requires that the width multiplied by the height be at least 128.
Therefore, only block sizes of 8x16, 16x8, and 16x16 are valid.
see:
8.5.1 General decoding process for coding units coded in inter prediction
mode
and
https://vicuesoft.com/blog/titles/DMVR_in_VVC/
>
> > +
> > + switch ((j << 1) | i) {
> > + case 0: type = "dmvr"; break; // 0 0
> > + case 1: type = "dmvr_h"; break; // 0 1
> > + case 2: type = "dmvr_v"; break; // 1 0
> > + case 3: type = "dmvr_hv"; break; // 1 1
> > + }
> > +
> > + if (check_func(c.inter.dmvr[j][i],
> "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
> > + memset(dst0, 0, DST_BUF_SIZE);
> > + memset(dst1, 0, DST_BUF_SIZE);
> > + call_ref(dst0, src0 + SRC_OFFSET,
> PIXEL_STRIDE, pred_h, mx, my, pred_w);
> > + call_new(dst1, src1 + SRC_OFFSET,
> PIXEL_STRIDE, pred_h, mx, my, pred_w);
> > + for (int k = 0; k < pred_h; k++) {
> > + if (memcmp(dst0 + k * dst_stride, dst1
> + k * dst_stride, pred_w * sizeof(int16_t))) {
> > + fail();
> > + break;
> > + }
> > + }
> > +
> > + bench_new(dst1, src1 + SRC_OFFSET,
> PIXEL_STRIDE, pred_h, mx, my, pred_w);
> > + }
> > + }
> > + }
> > + }
> > + }
> > + }
> > + report("dmvr");
> > +}
> > +
> > static void check_vvc_sad(void)
> > {
> > const int bit_depth = 10;
> > @@ -363,6 +421,7 @@ static void check_vvc_sad(void)
> >
> > void checkasm_check_vvc_mc(void)
> > {
> > + check_dmvr();
> > check_vvc_sad();
> > check_put_vvc_luma();
> > check_put_vvc_luma_uni();
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
2024-07-26 7:42 ` Nuo Mi
@ 2024-08-11 14:00 ` Nuo Mi
2024-08-15 12:45 ` Nuo Mi
0 siblings, 1 reply; 6+ messages in thread
From: Nuo Mi @ 2024-08-11 14:00 UTC (permalink / raw)
To: FFmpeg development discussions and patches
will apply next week if there are no objections
On Fri, Jul 26, 2024 at 3:42 PM Nuo Mi <nuomi2021@gmail.com> wrote:
>
>
> On Fri, Jul 26, 2024 at 9:36 AM James Almer <jamrial@gmail.com> wrote:
>
>> On 7/25/2024 10:35 AM, Nuo Mi wrote:
>> > dmvr_8_12x20_c: 186.2
>> > dmvr_8_12x20_avx2: 25.7
>> > dmvr_8_20x12_c: 181.7
>> > dmvr_8_20x12_avx2: 25.2
>> > dmvr_8_20x20_c: 283.2
>> > dmvr_8_20x20_avx2: 32.0
>> > dmvr_10_12x20_c: 90.0
>> > dmvr_10_12x20_avx2: 15.7
>> > dmvr_10_20x12_c: 41.0
>> > dmvr_10_20x12_avx2: 14.7
>> > dmvr_10_20x20_c: 81.5
>> > dmvr_10_20x20_avx2: 26.7
>> > dmvr_12_12x20_c: 190.7
>> > dmvr_12_12x20_avx2: 20.2
>> > dmvr_12_20x12_c: 187.2
>> > dmvr_12_20x12_avx2: 20.2
>> > dmvr_12_20x20_c: 292.7
>> > dmvr_12_20x20_avx2: 27.2
>> > dmvr_h_8_12x20_c: 317.0
>> > dmvr_h_8_12x20_avx2: 37.0
>> > dmvr_h_8_20x12_c: 340.0
>> > dmvr_h_8_20x12_avx2: 41.0
>> > dmvr_h_8_20x20_c: 540.7
>> > dmvr_h_8_20x20_avx2: 64.0
>> > dmvr_h_10_12x20_c: 322.7
>> > dmvr_h_10_12x20_avx2: 30.7
>> > dmvr_h_10_20x12_c: 344.2
>> > dmvr_h_10_20x12_avx2: 34.0
>> > dmvr_h_10_20x20_c: 529.0
>> > dmvr_h_10_20x20_avx2: 51.5
>> > dmvr_h_12_12x20_c: 326.7
>> > dmvr_h_12_12x20_avx2: 33.5
>> > dmvr_h_12_20x12_c: 331.7
>> > dmvr_h_12_20x12_avx2: 51.2
>> > dmvr_h_12_20x20_c: 534.0
>> > dmvr_h_12_20x20_avx2: 62.7
>> > dmvr_hv_8_12x20_c: 650.0
>> > dmvr_hv_8_12x20_avx2: 57.2
>> > dmvr_hv_8_20x12_c: 676.2
>> > dmvr_hv_8_20x12_avx2: 70.0
>> > dmvr_hv_8_20x20_c: 1068.5
>> > dmvr_hv_8_20x20_avx2: 103.2
>> > dmvr_hv_10_12x20_c: 649.0
>> > dmvr_hv_10_12x20_avx2: 48.2
>> > dmvr_hv_10_20x12_c: 677.7
>> > dmvr_hv_10_20x12_avx2: 59.7
>> > dmvr_hv_10_20x20_c: 1093.5
>> > dmvr_hv_10_20x20_avx2: 91.7
>> > dmvr_hv_12_12x20_c: 660.0
>> > dmvr_hv_12_12x20_avx2: 58.7
>> > dmvr_hv_12_20x12_c: 682.7
>> > dmvr_hv_12_20x12_avx2: 72.0
>> > dmvr_hv_12_20x20_c: 1094.0
>> > dmvr_hv_12_20x20_avx2: 113.2
>> > dmvr_v_8_12x20_c: 325.7
>> > dmvr_v_8_12x20_avx2: 31.2
>> > dmvr_v_8_20x12_c: 326.2
>> > dmvr_v_8_20x12_avx2: 38.5
>> > dmvr_v_8_20x20_c: 538.5
>> > dmvr_v_8_20x20_avx2: 54.2
>> > dmvr_v_10_12x20_c: 318.5
>> > dmvr_v_10_12x20_avx2: 23.7
>> > dmvr_v_10_20x12_c: 330.7
>> > dmvr_v_10_20x12_avx2: 40.5
>> > dmvr_v_10_20x20_c: 567.5
>> > dmvr_v_10_20x20_avx2: 48.0
>> > dmvr_v_12_12x20_c: 335.2
>> > dmvr_v_12_12x20_avx2: 30.0
>> > dmvr_v_12_20x12_c: 330.2
>> > dmvr_v_12_20x12_avx2: 39.5
>> > dmvr_v_12_20x20_c: 535.2
>> > dmvr_v_12_20x20_avx2: 60.0
>> > ---
>> > tests/checkasm/vvc_mc.c | 59 +++++++++++++++++++++++++++++++++++++++++
>> > 1 file changed, 59 insertions(+)
>> >
>> > diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
>> > index bc6b580f42..62fa6aa7d0 100644
>> > --- a/tests/checkasm/vvc_mc.c
>> > +++ b/tests/checkasm/vvc_mc.c
>> > @@ -324,6 +324,64 @@ static void check_avg(void)
>> > report("avg");
>> > }
>> >
>> > +#define SR_RANGE 2
>> > +static void check_dmvr(void)
>> > +{
>> > + LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
>> > + LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
>> > + LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
>> > + LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
>> > + const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
>> > +
>> > + VVCDSPContext c;
>> > + declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t
>> src_stride, int height,
>> > + intptr_t mx, intptr_t my, int width);
>> > +
>> > + for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
>> > + ff_vvc_dsp_init(&c, bit_depth);
>> > + randomize_pixels(src0, src1, SRC_BUF_SIZE);
>> > + for (int i = 0; i < 2; i++) {
>> > + for (int j = 0; j < 2; j++) {
>> > + for (int h = 8; h <= 16; h *= 2) {
>> > + for (int w = 8; w <= 16; w *= 2) {
>> > + const int pred_w = w + 2 * SR_RANGE;
>> > + const int pred_h = h + 2 * SR_RANGE;
>> > + const int mx = rnd() %
>> VVC_INTER_LUMA_DMVR_FACTS;
>> > + const int my = rnd() %
>> VVC_INTER_LUMA_DMVR_FACTS;
>> > + const char *type;
>> > +
>> > + if (w * h < 128)
>> > + continue;
>>
>> So h == 8 && w == 8 is not tested?
>>
> Hi James,
> thank you for the review.
>
> Yes, DMVR operates on subblocks with a maximum size of 16x16, and it also
> requires that the width multiplied by the height be at least 128.
> Therefore, only block sizes of 8x16, 16x8, and 16x16 are valid.
>
> see:
> 8.5.1 General decoding process for coding units coded in inter prediction
> mode
> and
> https://vicuesoft.com/blog/titles/DMVR_in_VVC/
>
Will apply this next week.
Thank you
>
>
>
>>
>> > +
>> > + switch ((j << 1) | i) {
>> > + case 0: type = "dmvr"; break; // 0 0
>> > + case 1: type = "dmvr_h"; break; // 0 1
>> > + case 2: type = "dmvr_v"; break; // 1 0
>> > + case 3: type = "dmvr_hv"; break; // 1 1
>> > + }
>> > +
>> > + if (check_func(c.inter.dmvr[j][i],
>> "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
>> > + memset(dst0, 0, DST_BUF_SIZE);
>> > + memset(dst1, 0, DST_BUF_SIZE);
>> > + call_ref(dst0, src0 + SRC_OFFSET,
>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>> > + call_new(dst1, src1 + SRC_OFFSET,
>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>> > + for (int k = 0; k < pred_h; k++) {
>> > + if (memcmp(dst0 + k * dst_stride, dst1
>> + k * dst_stride, pred_w * sizeof(int16_t))) {
>> > + fail();
>> > + break;
>> > + }
>> > + }
>> > +
>> > + bench_new(dst1, src1 + SRC_OFFSET,
>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>> > + }
>> > + }
>> > + }
>> > + }
>> > + }
>> > + }
>> > + report("dmvr");
>> > +}
>> > +
>> > static void check_vvc_sad(void)
>> > {
>> > const int bit_depth = 10;
>> > @@ -363,6 +421,7 @@ static void check_vvc_sad(void)
>> >
>> > void checkasm_check_vvc_mc(void)
>> > {
>> > + check_dmvr();
>> > check_vvc_sad();
>> > check_put_vvc_luma();
>> > check_put_vvc_luma_uni();
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr
2024-08-11 14:00 ` Nuo Mi
@ 2024-08-15 12:45 ` Nuo Mi
0 siblings, 0 replies; 6+ messages in thread
From: Nuo Mi @ 2024-08-15 12:45 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Sun, Aug 11, 2024 at 10:00 PM Nuo Mi <nuomi2021@gmail.com> wrote:
> Will apply next week if there are no objections.
>
Done.
Thank you James for reviewing.
>
> On Fri, Jul 26, 2024 at 3:42 PM Nuo Mi <nuomi2021@gmail.com> wrote:
>
>>
>>
>> On Fri, Jul 26, 2024 at 9:36 AM James Almer <jamrial@gmail.com> wrote:
>>
>>> On 7/25/2024 10:35 AM, Nuo Mi wrote:
>>> > dmvr_8_12x20_c: 186.2
>>> > dmvr_8_12x20_avx2: 25.7
>>> > dmvr_8_20x12_c: 181.7
>>> > dmvr_8_20x12_avx2: 25.2
>>> > dmvr_8_20x20_c: 283.2
>>> > dmvr_8_20x20_avx2: 32.0
>>> > dmvr_10_12x20_c: 90.0
>>> > dmvr_10_12x20_avx2: 15.7
>>> > dmvr_10_20x12_c: 41.0
>>> > dmvr_10_20x12_avx2: 14.7
>>> > dmvr_10_20x20_c: 81.5
>>> > dmvr_10_20x20_avx2: 26.7
>>> > dmvr_12_12x20_c: 190.7
>>> > dmvr_12_12x20_avx2: 20.2
>>> > dmvr_12_20x12_c: 187.2
>>> > dmvr_12_20x12_avx2: 20.2
>>> > dmvr_12_20x20_c: 292.7
>>> > dmvr_12_20x20_avx2: 27.2
>>> > dmvr_h_8_12x20_c: 317.0
>>> > dmvr_h_8_12x20_avx2: 37.0
>>> > dmvr_h_8_20x12_c: 340.0
>>> > dmvr_h_8_20x12_avx2: 41.0
>>> > dmvr_h_8_20x20_c: 540.7
>>> > dmvr_h_8_20x20_avx2: 64.0
>>> > dmvr_h_10_12x20_c: 322.7
>>> > dmvr_h_10_12x20_avx2: 30.7
>>> > dmvr_h_10_20x12_c: 344.2
>>> > dmvr_h_10_20x12_avx2: 34.0
>>> > dmvr_h_10_20x20_c: 529.0
>>> > dmvr_h_10_20x20_avx2: 51.5
>>> > dmvr_h_12_12x20_c: 326.7
>>> > dmvr_h_12_12x20_avx2: 33.5
>>> > dmvr_h_12_20x12_c: 331.7
>>> > dmvr_h_12_20x12_avx2: 51.2
>>> > dmvr_h_12_20x20_c: 534.0
>>> > dmvr_h_12_20x20_avx2: 62.7
>>> > dmvr_hv_8_12x20_c: 650.0
>>> > dmvr_hv_8_12x20_avx2: 57.2
>>> > dmvr_hv_8_20x12_c: 676.2
>>> > dmvr_hv_8_20x12_avx2: 70.0
>>> > dmvr_hv_8_20x20_c: 1068.5
>>> > dmvr_hv_8_20x20_avx2: 103.2
>>> > dmvr_hv_10_12x20_c: 649.0
>>> > dmvr_hv_10_12x20_avx2: 48.2
>>> > dmvr_hv_10_20x12_c: 677.7
>>> > dmvr_hv_10_20x12_avx2: 59.7
>>> > dmvr_hv_10_20x20_c: 1093.5
>>> > dmvr_hv_10_20x20_avx2: 91.7
>>> > dmvr_hv_12_12x20_c: 660.0
>>> > dmvr_hv_12_12x20_avx2: 58.7
>>> > dmvr_hv_12_20x12_c: 682.7
>>> > dmvr_hv_12_20x12_avx2: 72.0
>>> > dmvr_hv_12_20x20_c: 1094.0
>>> > dmvr_hv_12_20x20_avx2: 113.2
>>> > dmvr_v_8_12x20_c: 325.7
>>> > dmvr_v_8_12x20_avx2: 31.2
>>> > dmvr_v_8_20x12_c: 326.2
>>> > dmvr_v_8_20x12_avx2: 38.5
>>> > dmvr_v_8_20x20_c: 538.5
>>> > dmvr_v_8_20x20_avx2: 54.2
>>> > dmvr_v_10_12x20_c: 318.5
>>> > dmvr_v_10_12x20_avx2: 23.7
>>> > dmvr_v_10_20x12_c: 330.7
>>> > dmvr_v_10_20x12_avx2: 40.5
>>> > dmvr_v_10_20x20_c: 567.5
>>> > dmvr_v_10_20x20_avx2: 48.0
>>> > dmvr_v_12_12x20_c: 335.2
>>> > dmvr_v_12_12x20_avx2: 30.0
>>> > dmvr_v_12_20x12_c: 330.2
>>> > dmvr_v_12_20x12_avx2: 39.5
>>> > dmvr_v_12_20x20_c: 535.2
>>> > dmvr_v_12_20x20_avx2: 60.0
>>> > ---
>>> > tests/checkasm/vvc_mc.c | 59
>>> +++++++++++++++++++++++++++++++++++++++++
>>> > 1 file changed, 59 insertions(+)
>>> >
>>> > diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
>>> > index bc6b580f42..62fa6aa7d0 100644
>>> > --- a/tests/checkasm/vvc_mc.c
>>> > +++ b/tests/checkasm/vvc_mc.c
>>> > @@ -324,6 +324,64 @@ static void check_avg(void)
>>> > report("avg");
>>> > }
>>> >
>>> > +#define SR_RANGE 2
>>> > +static void check_dmvr(void)
>>> > +{
>>> > + LOCAL_ALIGNED_32(uint16_t, dst0, [DST_BUF_SIZE]);
>>> > + LOCAL_ALIGNED_32(uint16_t, dst1, [DST_BUF_SIZE]);
>>> > + LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
>>> > + LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
>>> > + const int dst_stride = MAX_PB_SIZE * sizeof(int16_t);
>>> > +
>>> > + VVCDSPContext c;
>>> > + declare_func(void, int16_t *dst, const uint8_t *src, ptrdiff_t
>>> src_stride, int height,
>>> > + intptr_t mx, intptr_t my, int width);
>>> > +
>>> > + for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
>>> > + ff_vvc_dsp_init(&c, bit_depth);
>>> > + randomize_pixels(src0, src1, SRC_BUF_SIZE);
>>> > + for (int i = 0; i < 2; i++) {
>>> > + for (int j = 0; j < 2; j++) {
>>> > + for (int h = 8; h <= 16; h *= 2) {
>>> > + for (int w = 8; w <= 16; w *= 2) {
>>> > + const int pred_w = w + 2 * SR_RANGE;
>>> > + const int pred_h = h + 2 * SR_RANGE;
>>> > + const int mx = rnd() %
>>> VVC_INTER_LUMA_DMVR_FACTS;
>>> > + const int my = rnd() %
>>> VVC_INTER_LUMA_DMVR_FACTS;
>>> > + const char *type;
>>> > +
>>> > + if (w * h < 128)
>>> > + continue;
>>>
>>> So h == 8 && w == 8 is not tested?
>>>
>> Hi James,
>> thank you for the review.
>>
>> Yes, DMVR operates on subblocks with a maximum size of 16x16, and it also
>> requires that the width multiplied by the height be at least 128.
>> Therefore, only block sizes of 8x16, 16x8, and 16x16 are valid.
>>
>> See section 8.5.1, "General decoding process for coding units coded in
>> inter prediction mode", of the VVC (H.266) specification,
>> and
>> https://vicuesoft.com/blog/titles/DMVR_in_VVC/
>>
> Will apply this next week.
> Thank you
>
>>
>>
>>
>>>
>>> > +
>>> > + switch ((j << 1) | i) {
>>> > + case 0: type = "dmvr"; break; // 0 0
>>> > + case 1: type = "dmvr_h"; break; // 0 1
>>> > + case 2: type = "dmvr_v"; break; // 1 0
>>> > + case 3: type = "dmvr_hv"; break; // 1 1
>>> > + }
>>> > +
>>> > + if (check_func(c.inter.dmvr[j][i],
>>> "%s_%d_%dx%d", type, bit_depth, pred_w, pred_h)) {
>>> > + memset(dst0, 0, DST_BUF_SIZE);
>>> > + memset(dst1, 0, DST_BUF_SIZE);
>>> > + call_ref(dst0, src0 + SRC_OFFSET,
>>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>>> > + call_new(dst1, src1 + SRC_OFFSET,
>>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>>> > + for (int k = 0; k < pred_h; k++) {
>>> > + if (memcmp(dst0 + k * dst_stride,
>>> dst1 + k * dst_stride, pred_w * sizeof(int16_t))) {
>>> > + fail();
>>> > + break;
>>> > + }
>>> > + }
>>> > +
>>> > + bench_new(dst1, src1 + SRC_OFFSET,
>>> PIXEL_STRIDE, pred_h, mx, my, pred_w);
>>> > + }
>>> > + }
>>> > + }
>>> > + }
>>> > + }
>>> > + }
>>> > + report("dmvr");
>>> > +}
>>> > +
>>> > static void check_vvc_sad(void)
>>> > {
>>> > const int bit_depth = 10;
>>> > @@ -363,6 +421,7 @@ static void check_vvc_sad(void)
>>> >
>>> > void checkasm_check_vvc_mc(void)
>>> > {
>>> > + check_dmvr();
>>> > check_vvc_sad();
>>> > check_put_vvc_luma();
>>> > check_put_vvc_luma_uni();
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>>
>>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-08-15 12:45 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20240725133546.19125-1-nuomi2021@gmail.com>
2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 2/3] x86/vvcdec: add dmvr avx2 code Nuo Mi
2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr Nuo Mi
2024-07-26 1:20 ` James Almer
2024-07-26 7:42 ` Nuo Mi
2024-08-11 14:00 ` Nuo Mi
2024-08-15 12:45 ` Nuo Mi
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git