From: Nuo Mi <nuomi2021@gmail.com> To: ffmpeg-devel@ffmpeg.org Cc: Nuo Mi <nuomi2021@gmail.com> Subject: [FFmpeg-devel] [PATCH 2/3] x86/vvcdec: add dmvr avx2 code Date: Thu, 25 Jul 2024 21:35:45 +0800 Message-ID: <TYSPR06MB64333950CE2429A5119F3F36AAAB2@TYSPR06MB6433.apcprd06.prod.outlook.com> (raw) In-Reply-To: <20240725133546.19125-1-nuomi2021@gmail.com> Decoder-Side Motion Vector Refinement (DMVR) accounts for about 4~8% of CPU usage for some clips. Here are the results of a single test run: clips | before| after | delta ------------------------------------------|-------|-------|------ RitualDance_1920x1080_60_10_420_37_RA.266 | 338.7 | 354.3 |4.61% NovosobornayaSquare_1920x1080.bin | 320.3 | 329.3 |2.81% Tango2_3840x2160_60_10_420_27_LD.266 | 83.3 | 83.7 |0.48% RitualDance_1920x1080_60_10_420_32_LD.266 | 320.7 | 327.3 |2.06% Chimera_8bit_1080P_1000_frames.vvc | 360.7 | 381.0 |5.63% BQTerrace_1920x1080_60_10_420_22_RA.vvc | 161.7 | 163.0 |0.80% --- libavcodec/x86/vvc/Makefile | 1 + libavcodec/x86/vvc/vvc_dmvr.asm | 373 +++++++++++++++++++++++++++++++ libavcodec/x86/vvc/vvcdsp_init.c | 25 +++ 3 files changed, 399 insertions(+) create mode 100644 libavcodec/x86/vvc/vvc_dmvr.asm diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile index 832d802daf..04f16bc10c 100644 --- a/libavcodec/x86/vvc/Makefile +++ b/libavcodec/x86/vvc/Makefile @@ -4,6 +4,7 @@ clean:: OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o \ x86/h26x/h2656dsp.o X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_alf.o \ + x86/vvc/vvc_dmvr.o \ x86/vvc/vvc_mc.o \ x86/vvc/vvc_sad.o \ x86/h26x/h2656_inter.o diff --git a/libavcodec/x86/vvc/vvc_dmvr.asm b/libavcodec/x86/vvc/vvc_dmvr.asm new file mode 100644 index 0000000000..4c971f970b --- /dev/null +++ b/libavcodec/x86/vvc/vvc_dmvr.asm @@ -0,0 +1,373 @@ +; /* +; * Provide AVX2 luma dmvr functions for VVC decoding +; * Copyright (c) 2024 Nuo Mi +; * +; * This file is part of FFmpeg. 
+; * +; * FFmpeg is free software; you can redistribute it and/or +; * modify it under the terms of the GNU Lesser General Public +; * License as published by the Free Software Foundation; either +; * version 2.1 of the License, or (at your option) any later version. +; * +; * FFmpeg is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; * Lesser General Public License for more details. +; * +; * You should have received a copy of the GNU Lesser General Public +; * License along with FFmpeg; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +; */ +%include "libavutil/x86/x86util.asm" + +%define MAX_PB_SIZE 128 + +SECTION_RODATA 32 + +shift_12 times 2 dw 1 << (15 - (12 - 10)) +shift3_8 times 2 dw 1 << (15 - (8 - 6)) +shift3_10 times 2 dw 1 << (15 - (10 - 6)) +shift3_12 times 2 dw 1 << (15 - (12 - 6)) +pw_16 times 2 dw 16 + +%if ARCH_X86_64 + +%if HAVE_AVX2_EXTERNAL + +SECTION .text + +%define pstride (bd / 10 + 1) + +; LOAD(dst, src) +%macro LOAD_W16 2 +%if bd == 8 + pmovzxbw %1, %2 +%else + movu %1, %2 +%endif +%endmacro + +%macro SHIFT_W16 2 +%if bd == 8 + psllw %1, (10 - bd) +%elif bd == 10 + ; nothing +%else + pmulhrsw %1, %2 +%endif +%endmacro + +%macro SAVE_W16 2 + movu %1, %2 +%endmacro + +; NEXT_4_LINES(is_h) +%macro NEXT_4_LINES 1 + lea dstq, [dstq + dsq*4] + lea srcq, [srcq + ssq*4] +%if %1 + lea src1q, [srcq + pstride] +%endif +%endmacro + + +; DMVR_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3) +%macro DMVR_4xW16 6 + LOAD_W16 m0, [%4] + LOAD_W16 m1, [%4 + %5] + LOAD_W16 m2, [%4 + 2 * %5] + LOAD_W16 m3, [%4 + %6] + + SHIFT_W16 m0, m4 + SHIFT_W16 m1, m4 + SHIFT_W16 m2, m4 + SHIFT_W16 m3, m4 + + SAVE_W16 [%1] , m0 + SAVE_W16 [%1 + %2] , m1 + SAVE_W16 [%1 + 2 * %2], m2 + SAVE_W16 [%1 + %3] , m3 +%endmacro + +; buf += -stride * h + off +; 
OFFSET_TO_W4(buf, stride, off) +%macro OFFSET_TO_W4 3 + mov id, hd + imul iq, %2 + sub %1, iq + lea %1, [%1 + %3] +%endmacro + +%macro OFFSET_TO_W4 0 + OFFSET_TO_W4 srcq, ssq, 16 * (bd / 10 + 1) + OFFSET_TO_W4 dstq, dsq, 16 * 2 +%endmacro + +; void ff_vvc_dmvr_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, +; int height, intptr_t mx, intptr_t my, int width); +%macro DMVR_AVX2 1 +cglobal vvc_dmvr_%1, 4, 9, 5, dst, src, ss, h, ds, ds3, w, ss3, i +%define bd %1 + + LOAD_STRIDES + +%if %1 > 10 + vpbroadcastd m4, [shift_%1] +%endif + + mov wd, wm + mov id, hd +.w16: + sub id, 4 + jl .w16_end + DMVR_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q + NEXT_4_LINES 0 + jmp .w16 +.w16_end: + + sub wd, 16 + jl .w4_end + + OFFSET_TO_W4 +.w4: + sub hd, 4 + jl .w4_end + DMVR_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q + NEXT_4_LINES 0 + jmp .w4 +.w4_end: + + RET +%endmacro + +; LOAD_COEFFS(coeffs0, coeffs1, src) +%macro LOAD_COEFFS 3 + movd xm%2, %3 + vpbroadcastw m%2, xm%2 + vpbroadcastd m%1, [pw_16] + psubw m%1, m%2 +%endmacro + +; LOAD_SHIFT(shift, src) +%macro LOAD_SHIFT 2 + vpbroadcastd %1, [%2] +%if bd == 12 + psllw %1, 1 ; avoid signed mul for pmulhrsw +%endif +%endmacro + +; LOAD_STRIDES(shift, src) +%macro LOAD_STRIDES 0 + mov dsq, MAX_PB_SIZE * 2 + lea ss3q, [ssq*3] + lea ds3q, [dsq*3] +%endmacro + +; BILINEAR(dst/src0, src1, coeff0, coeff1, round, tmp) +%macro BILINEAR 6 + pmullw %1, %3 + pmullw %6, %2, %4 + paddw %1, %6 +%if bd == 12 + psrlw %1, 1 ; avoid signed mul for pmulhrsw +%endif + pmulhrsw %1, %5 +%endmacro + +; DMVR_H_1xW16(dst, src0, src1, offset, tmp) +%macro DMVR_H_1xW16 5 + LOAD_W16 %1, [%2 + %4] + LOAD_W16 %5, [%3 + %4] + BILINEAR %1, %5, m10, m11, m12, %5 +%endmacro + +; DMVR_H_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3, src1) +%macro DMVR_H_4xW16 7 + DMVR_H_1xW16 m0, %4, %7, 0, m4 + DMVR_H_1xW16 m1, %4, %7, %5, m5 + DMVR_H_1xW16 m2, %4, %7, 2 * %5, m6 + DMVR_H_1xW16 m3, %4, %7, %6, m7 + + SAVE_W16 [%1] , m0 + SAVE_W16 [%1 + %2] 
, m1 + SAVE_W16 [%1 + 2 * %2], m2 + SAVE_W16 [%1 + %3] , m3 +%endmacro + +; void ff_vvc_dmvr_h_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, +; int height, intptr_t mx, intptr_t my, int width); +%macro DMVR_H_AVX2 1 +cglobal vvc_dmvr_h_%1, 4, 10, 13, dst, src, ss, h, ds, ds3, w, ss3, src1, i +%define bd %1 + + LOAD_COEFFS 10, 11, dsm + LOAD_SHIFT m12, shift3_%1 + + LOAD_STRIDES + lea src1q, [srcq + pstride] + + mov wd, wm + mov id, hd +.w16: + sub id, 4 + jl .w16_end + DMVR_H_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q, src1q + NEXT_4_LINES 1 + jmp .w16 +.w16_end: + + sub wd, 16 + jl .w4_end + + OFFSET_TO_W4 + lea src1q, [srcq + pstride] +.w4: + sub hd, 4 + jl .w4_end + DMVR_H_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q, src1q + NEXT_4_LINES 1 + jmp .w4 +.w4_end: + + RET +%endmacro + +; DMVR_V_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3) +%macro DMVR_V_4xW16 6 + LOAD_W16 m1, [%4 + %5] + LOAD_W16 m2, [%4 + 2 * %5] + LOAD_W16 m3, [%4 + %6] + LOAD_W16 m4, [%4 + 4 * %5] + + BILINEAR m0, m1, m8, m9, m10, m11 + BILINEAR m1, m2, m8, m9, m10, m12 + BILINEAR m2, m3, m8, m9, m10, m13 + BILINEAR m3, m4, m8, m9, m10, m14 + + SAVE_W16 [%1] , m0 + SAVE_W16 [%1 + %2] , m1 + SAVE_W16 [%1 + 2 * %2], m2 + SAVE_W16 [%1 + %3] , m3 + + ; why can't we use SWAP m0, m4 here? 
+ movaps m0, m4 +%endmacro + +; void ff_vvc_dmvr_v_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, +; int height, intptr_t mx, intptr_t my, int width); +%macro DMVR_V_AVX2 1 +cglobal vvc_dmvr_v_%1, 4, 9, 15, dst, src, ss, h, ds, ds3, w, ss3, i +%define bd %1 + + LOAD_COEFFS 8, 9, ds3m + LOAD_SHIFT m10, shift3_%1 + + LOAD_STRIDES + + mov wd, wm + mov id, hd + LOAD_W16 m0, [srcq] +.w16: + sub id, 4 + jl .w16_end + DMVR_V_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q + NEXT_4_LINES 0 + jmp .w16 +.w16_end: + + sub wd, 16 + jl .w4_end + + OFFSET_TO_W4 + LOAD_W16 m0, [srcq] +.w4: + sub hd, 4 + jl .w4_end + DMVR_V_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q + NEXT_4_LINES 0 + jmp .w4 +.w4_end: + + RET +%endmacro + +; DMVR_HV_4xW16(dst, dst_stride, dst_stride3, src, src_stride, src_stride3, src1) +%macro DMVR_HV_4xW16 7 + DMVR_H_1xW16 m1, %4, %7, %5, m6 + DMVR_H_1xW16 m2, %4, %7, 2 * %5, m7 + DMVR_H_1xW16 m3, %4, %7, %6, m8 + DMVR_H_1xW16 m4, %4, %7, 4 * %5, m9 + + BILINEAR m0, m1, m13, m14, m15, m6 + BILINEAR m1, m2, m13, m14, m15, m7 + BILINEAR m2, m3, m13, m14, m15, m8 + BILINEAR m3, m4, m13, m14, m15, m9 + + SAVE_W16 [%1] , m0 + SAVE_W16 [%1 + %2] , m1 + SAVE_W16 [%1 + 2 * %2], m2 + SAVE_W16 [%1 + %3] , m3 + + ; why can't we use SWAP m0, m4 here? 
+ movaps m0, m4 +%endmacro + +; void ff_vvc_dmvr_hv_%1_avx2(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, +; int height, intptr_t mx, intptr_t my, int width); +%macro DMVR_HV_AVX2 1 +cglobal vvc_dmvr_hv_%1, 7, 10, 16, dst, src, ss, h, ds, ds3, w, ss3, src1, i +%define bd %1 + + LOAD_COEFFS 10, 11, dsm + LOAD_SHIFT m12, shift3_%1 + + LOAD_COEFFS 13, 14, ds3m + LOAD_SHIFT m15, shift3_10 + + LOAD_STRIDES + lea src1q, [srcq + pstride] + + mov id, hd + DMVR_H_1xW16 m0, srcq, src1q, 0, m5 +.w16: + sub id, 4 + jl .w16_end + DMVR_HV_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q, src1q + NEXT_4_LINES 1 + jmp .w16 +.w16_end: + + sub wd, 16 + jl .w4_end + + OFFSET_TO_W4 + lea src1q, [srcq + pstride] + + DMVR_H_1xW16 m0, srcq, src1q, 0, m5 +.w4: + sub hd, 4 + jl .w4_end + DMVR_HV_4xW16 dstq, dsq, ds3q, srcq, ssq, ss3q, src1q + NEXT_4_LINES 1 + jmp .w4 +.w4_end: + + RET +%endmacro + +%macro VVC_DMVR_AVX2 1 + DMVR_AVX2 %1 + DMVR_H_AVX2 %1 + DMVR_V_AVX2 %1 + DMVR_HV_AVX2 %1 +%endmacro + +INIT_YMM avx2 + +VVC_DMVR_AVX2 8 +VVC_DMVR_AVX2 10 +VVC_DMVR_AVX2 12 + +%endif ; HAVE_AVX2_EXTERNAL + +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c index 4b4a2aa937..d5b4f4f8a5 100644 --- a/libavcodec/x86/vvc/vvcdsp_init.c +++ b/libavcodec/x86/vvc/vvcdsp_init.c @@ -87,6 +87,21 @@ AVG_PROTOTYPES( 8, avx2) AVG_PROTOTYPES(10, avx2) AVG_PROTOTYPES(12, avx2) + +#define DMVR_PROTOTYPES(bd, opt) \ +void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ + int height, intptr_t mx, intptr_t my, int width); \ +void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ + int height, intptr_t mx, intptr_t my, int width); \ +void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ + int height, intptr_t mx, intptr_t my, int width); \ +void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \ + int height, intptr_t mx, 
intptr_t my, int width); \ + +DMVR_PROTOTYPES( 8, avx2) +DMVR_PROTOTYPES(10, avx2) +DMVR_PROTOTYPES(12, avx2) + #define ALF_BPC_PROTOTYPES(bpc, opt) \ void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ @@ -306,6 +321,13 @@ ALF_FUNCS(16, 12, avx2) c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \ } while (0) +#define DMVR_INIT(bd) do { \ + c->inter.dmvr[0][0] = ff_vvc_dmvr_##bd##_avx2; \ + c->inter.dmvr[0][1] = ff_vvc_dmvr_h_##bd##_avx2; \ + c->inter.dmvr[1][0] = ff_vvc_dmvr_v_##bd##_avx2; \ + c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_avx2; \ +} while (0) + #define ALF_INIT(bd) do { \ c->alf.filter[LUMA] = ff_vvc_alf_filter_luma_##bd##_avx2; \ c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2; \ @@ -330,6 +352,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) ALF_INIT(8); AVG_INIT(8, avx2); MC_LINKS_AVX2(8); + DMVR_INIT(8); SAD_INIT(); } break; @@ -342,6 +365,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) AVG_INIT(10, avx2); MC_LINKS_AVX2(10); MC_LINKS_16BPC_AVX2(10); + DMVR_INIT(10); SAD_INIT(); } break; @@ -354,6 +378,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) AVG_INIT(12, avx2); MC_LINKS_AVX2(12); MC_LINKS_16BPC_AVX2(12); + DMVR_INIT(12); SAD_INIT(); } break; -- 2.34.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next parent reply other threads:[~2024-07-25 13:52 UTC|newest] Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top [not found] <20240725133546.19125-1-nuomi2021@gmail.com> 2024-07-25 13:35 ` Nuo Mi [this message] 2024-07-25 13:35 ` [FFmpeg-devel] [PATCH 3/3] checkasm: add tests for vvc dmvr Nuo Mi 2024-07-26 1:20 ` James Almer 2024-07-26 7:42 ` Nuo Mi 2024-08-11 14:00 ` Nuo Mi 2024-08-15 12:45 ` Nuo Mi
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=TYSPR06MB64333950CE2429A5119F3F36AAAB2@TYSPR06MB6433.apcprd06.prod.outlook.com \ --to=nuomi2021@gmail.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git