From: "Rémi Denis-Courmont" <remi@remlab.net> To: ffmpeg-devel@ffmpeg.org Subject: Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg Date: Tue, 21 May 2024 19:03:48 +0300 Message-ID: <5155455.rPxRsNYblX@basile.remlab.net> (raw) In-Reply-To: <tencent_F675C9C260C8A998600E0CEB21EC9EE74105@qq.com> Le tiistaina 21. toukokuuta 2024, 10.37.51 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > --- > libavcodec/riscv/Makefile | 2 + > libavcodec/riscv/vvc_mc_rvv.S | 312 +++++++++++++++++++++++++++++++++ > libavcodec/riscv/vvcdsp_init.c | 76 ++++++++ > libavcodec/vvc/dsp.c | 4 +- > libavcodec/vvc/dsp.h | 1 + > 5 files changed, 394 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/riscv/vvc_mc_rvv.S > create mode 100644 libavcodec/riscv/vvcdsp_init.c > > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile > index 27b268ae39..6297664fc9 100644 > --- a/libavcodec/riscv/Makefile > +++ b/libavcodec/riscv/Makefile > @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \ > RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o > OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o > RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o > +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o > diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S > new file mode 100644 > index 0000000000..26a6afba1f > --- /dev/null > +++ b/libavcodec/riscv/vvc_mc_rvv.S > @@ -0,0 +1,312 @@ > +/* > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences > (ISCAS). + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w vlen is_w
> +    .if \w <= 2
> +        vsetivli zero, \w, e8, mf8, ta, ma
> +    .elseif \w <= 4 && \vlen == 128
> +        vsetivli zero, \w, e8, mf4, ta, ma
> +    .elseif \w <= 4 && \vlen >= 256
> +        vsetivli zero, \w, e8, mf8, ta, ma
> +    .elseif \w <= 8 && \vlen == 128
> +        vsetivli zero, \w, e8, mf2, ta, ma
> +    .elseif \w <= 8 && \vlen >= 256
> +        vsetivli zero, \w, e8, mf4, ta, ma
> +    .elseif \w <= 16 && \vlen == 128
> +        vsetivli zero, \w, e8, m1, ta, ma
> +    .elseif \w <= 16 && \vlen >= 256
> +        vsetivli zero, \w, e8, mf2, ta, ma
> +    .elseif \w <= 32 && \vlen >= 256
> +        li t0, \w
> +        vsetvli zero, t0, e8, m1, ta, ma
> +    .elseif \w <= (\vlen / 4) || \is_w
> +        li t0, 64
> +        vsetvli zero, t0, e8, m2, ta, ma
> +    .else
> +        li t0, \w
> +        vsetvli zero, t0, e8, m4, ta, ma
> +    .endif
> +.endm
> +
> +.macro vsetvlstatic16 w vlen is_w
> +    .if \w <= 2
> +        vsetivli zero, \w, e16, mf4, ta, ma
> +    .elseif \w <= 4 && \vlen == 128
> +        vsetivli zero, \w, e16, mf2, ta, ma
> +    .elseif \w <= 4 && \vlen >= 256
> +        vsetivli zero, \w, e16, mf4, ta, ma
> +    .elseif \w <= 8 && \vlen == 128
> +        vsetivli zero, \w, e16, m1, ta, ma
> +    .elseif \w <= 8 && \vlen >= 256
> +        vsetivli zero, \w, e16, mf2, ta, ma
> +    .elseif \w <= 16 && \vlen == 128
> +        vsetivli zero, \w, e16, m2, ta, ma
> +    .elseif \w <= 16 && \vlen >= 256
> +        vsetivli zero, \w, e16, m1, ta, ma
> +    .elseif \w <= 32 && \vlen >= 256
> +        li t0, \w
> +        vsetvli zero, t0, e16, m2, ta, ma
> +    .elseif \w <= (\vlen / 4) || \is_w
> +        li t0, 64
> +        vsetvli zero, t0, e16, m4, ta, ma
> +    .else
> +        li t0, \w
> +        vsetvli zero, t0, e16, m8, ta, ma
> +    .endif
> +.endm
> +
> +.macro vsetvlstatic32 w vlen
> +    .if \w <= 2
> +        vsetivli zero, \w, e32, mf2, ta, ma
> +    .elseif \w <= 4 && \vlen == 128
> +        vsetivli zero, \w, e32, m1, ta, ma
> +    .elseif \w <= 4 && \vlen >= 256
> +        vsetivli zero, \w, e32, mf2, ta, ma
> +    .elseif \w <= 8 && \vlen == 128
> +        vsetivli zero, \w, e32, m2, ta, ma
> +    .elseif \w <= 8 && \vlen >= 256
> +        vsetivli zero, \w, e32, m1, ta, ma
> +    .elseif \w <= 16 && \vlen == 128
> +        vsetivli zero, \w, e32, m4, ta, ma
> +    .elseif \w <= 16 && \vlen >= 256
> +        vsetivli zero, \w, e32, m2, ta, ma
> +    .elseif \w <= 32 && \vlen >= 256
> +        li t0, \w
> +        vsetvli zero, t0, e32, m4, ta, ma
> +    .else
> +        li t0, \w
> +        vsetvli zero, t0, e32, m8, ta, ma
> +    .endif
> +.endm
> +
> +.macro avg_nx1 w vlen
> +        vsetvlstatic16 \w, \vlen, 0
> +        vle16.v v0, (a2)
> +        vle16.v v8, (a3)
> +        vadd.vv v8, v8, v0
> +        vmax.vx v8, v8, zero
> +        vsetvlstatic8 \w, \vlen, 0
> +        vnclipu.wi v8, v8, 7
> +        vse8.v v8, (a0)
> +.endm
> +
> +.macro avg w h vlen
> +        csrw vxrm, zero
> +
> +.if \w <= (\vlen / 4) && \h >= 4
> +.rept (\h / 4)
> +        vsetvlstatic16 \w, \vlen, 0
> +        addi t0, a2, 128*2
> +        addi t1, a3, 128*2
> +        addi t3, a2, 128*2*2
> +        addi t4, a3, 128*2*2
> +        addi a7, a3, 128*2*3
> +        addi t6, a2, 128*2*3
> +        add t2, a0, a1
> +        sh1add t5, a1, a0
> +        add a6, t5, a1
> +        vle16.v v0, (a2)
> +        vle16.v v4, (a3)
> +        vle16.v v8, (t0)
> +        vle16.v v12, (t1)
> +        vle16.v v16, (t3)
> +        vle16.v v20, (t4)
> +        vle16.v v24, (t6)
> +        vle16.v v28, (a7)

I would expect that you can get better performance by interleaving scalar
and vector stuff, and possibly also vector loads and vector arithmetic.
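For instance, strictly as an untested sketch that reuses the register
assignments from the quoted code: the scalar address setup can be slotted
between the vector loads, and each vadd.vv can be issued as soon as its two
source groups are in flight, rather than doing all eight loads and then all
four additions:

        vle16.v v0, (a2)
        addi t0, a2, 128*2
        vle16.v v4, (a3)
        addi t1, a3, 128*2
        vle16.v v8, (t0)
        addi t3, a2, 128*2*2
        vle16.v v12, (t1)
        addi t4, a3, 128*2*2
        vadd.vv v4, v4, v0
        vle16.v v16, (t3)
        addi t6, a2, 128*2*3
        vle16.v v20, (t4)
        addi a7, a3, 128*2*3
        vadd.vv v12, v12, v8
        vle16.v v24, (t6)
        add t2, a0, a1
        vle16.v v28, (a7)
        sh1add t5, a1, a0
        vadd.vv v20, v20, v16
        add a6, t5, a1
        vadd.vv v28, v28, v24

with the vmax/vnclipu/vse8 sequence following as before. On an in-order core
with separate scalar and vector pipelines this keeps both units busy; whether
it actually wins here is something checkasm --bench would have to show.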
> +        vadd.vv v4, v4, v0
> +        vadd.vv v12, v12, v8
> +        vadd.vv v20, v20, v16
> +        vadd.vv v28, v28, v24
> +        vmax.vx v4, v4, zero
> +        vmax.vx v12, v12, zero
> +        vmax.vx v20, v20, zero
> +        vmax.vx v28, v28, zero
> +        vsetvlstatic8 \w, \vlen, 0
> +        vnclipu.wi v4, v4, 7
> +        vnclipu.wi v12, v12, 7
> +        vnclipu.wi v20, v20, 7
> +        vnclipu.wi v28, v28, 7
> +        vse8.v v4, (a0)
> +        vse8.v v12, (t2)
> +        vse8.v v20, (t5)
> +        vse8.v v28, (a6)
> +        addi a2, a2, 128*8
> +        addi a3, a3, 128*8
> +        sh2add a0, a1, a0
> +.endr
> +
> +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> +.rept (\h / 2)
> +        vsetvlstatic16 \w, \vlen, 0
> +        addi t0, a2, 128*2
> +        addi t1, a3, 128*2
> +        add t2, a0, a1
> +        vle16.v v0, (a2)
> +        vle16.v v8, (a3)
> +        vle16.v v16, (t0)
> +        vle16.v v24, (t1)
> +        vadd.vv v8, v8, v0
> +        vadd.vv v24, v24, v16
> +        vmax.vx v8, v8, zero
> +        vmax.vx v24, v24, zero
> +        vsetvlstatic8 \w, \vlen, 0
> +        vnclipu.wi v8, v8, 7
> +        vnclipu.wi v24, v24, 7
> +        vse8.v v8, (a0)
> +        vse8.v v24, (t2)
> +        addi a2, a2, 128*4
> +        addi a3, a3, 128*4
> +        sh1add a0, a1, a0
> +.endr
> +
> +.else
> +.rept \h
> +        avg_nx1 \w, \vlen
> +        .if \w == 128 && \vlen == 128
> +        addi a2, a2, 64*2
> +        addi a3, a3, 64*2
> +        addi a0, a0, 64
> +        avg_nx1 \w, \vlen
> +        addi a2, a2, -64*2
> +        addi a3, a3, -64*2
> +        addi a0, a0, -64
> +        .endif
> +        addi a2, a2, 128*2
> +        addi a3, a3, 128*2
> +        add a0, a0, a1
> +.endr
> +.endif
> +.endm
> +
> +.macro w_avg_nx1 w vlen
> +        vsetvlstatic16 \w, \vlen, 1
> +        vle16.v v0, (a2)
> +        vle16.v v8, (a3)
> +        vwmul.vx v16, v0, a7
> +        vwmacc.vx v16, t3, v8
> +        vsetvlstatic32 \w, \vlen
> +        vadd.vx v16, v16, t4
> +        vsetvlstatic16 \w, \vlen, 1
> +        vnsrl.wx v16, v16, t6
> +        vmax.vx v16, v16, zero
> +        vsetvlstatic8 \w, \vlen, 1
> +        vnclipu.wi v16, v16, 0
> +        vse8.v v16, (a0)
> +.endm
> +
> +#if (__riscv_xlen == 64)
> +.macro w_avg w h vlen
> +        csrw vxrm, zero
> +        addi t6, a6, 7
> +        ld t3, (sp)
> +        ld t4, 8(sp)
> +        ld t5, 16(sp)
> +        add t4, t4, t5
> +        addi t4, t4, 1   // o0 + o1 + 1
> +        addi t5, t6, -1  // shift - 1
> +        sll t4, t4, t5
> +
> +.if \w <= (\vlen / 8)
> +    .rept (\h / 2)
> +        vsetvlstatic16 \w, \vlen, 1
> +        addi t0, a2, 128*2
> +        addi t1, a3, 128*2
> +        add t2, a0, a1
> +        vle16.v v0, (a2)
> +        vle16.v v8, (a3)
> +        vle16.v v20, (t0)
> +        vle16.v v24, (t1)
> +        vwmul.vx v16, v0, a7
> +        vwmul.vx v28, v20, a7
> +        vwmacc.vx v16, t3, v8
> +        vwmacc.vx v28, t3, v24
> +        vsetvlstatic32 \w, \vlen
> +        vadd.vx v16, v16, t4
> +        vadd.vx v28, v28, t4
> +        vsetvlstatic16 \w, \vlen, 1
> +        vnsrl.wx v16, v16, t6
> +        vnsrl.wx v28, v28, t6
> +        vmax.vx v16, v16, zero
> +        vmax.vx v28, v28, zero
> +        vsetvlstatic8 \w, \vlen, 1
> +        vnclipu.wi v16, v16, 0
> +        vnclipu.wi v28, v28, 0
> +        vse8.v v16, (a0)
> +        vse8.v v28, (t2)
> +        addi a2, a2, 128*4
> +        addi a3, a3, 128*4
> +        sh1add a0, a1, a0
> +    .endr
> +.else
> +    .rept \h
> +        w_avg_nx1 \w, \vlen
> +        .if \w == (\vlen / 2)
> +        addi a2, a2, (\vlen / 2)
> +        addi a3, a3, (\vlen / 2)
> +        addi a0, a0, (\vlen / 4)
> +        w_avg_nx1 \w, \vlen
> +        addi a2, a2, -(\vlen / 2)
> +        addi a3, a3, -(\vlen / 2)
> +        addi a0, a0, -(\vlen / 4)
> +        .elseif \w == 128 && \vlen == 128
> +        .rept 3
> +        addi a2, a2, (\vlen / 2)
> +        addi a3, a3, (\vlen / 2)
> +        addi a0, a0, (\vlen / 4)
> +        w_avg_nx1 \w, \vlen
> +        .endr
> +        addi a2, a2, -(\vlen / 2) * 3
> +        addi a3, a3, -(\vlen / 2) * 3
> +        addi a0, a0, -(\vlen / 4) * 3
> +        .endif
> +
> +        addi a2, a2, 128*2
> +        addi a3, a3, 128*2
> +        add a0, a0, a1
> +    .endr
> +.endif
> +.endm
> +#endif
> +
> +.macro func_avg name vlen
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> +.irp w,2,4,8,16,32,64,128
> +        li t3, \w
> +        bne a4, t3, \name\vlen\()end\w
> +.irp h,2,4,8,16,32,64,128
> +        li t4, \h
> +        bne a5, t4, \name\vlen\()end\w\h
> +        \name \w \h \vlen
> +        ret
> +\name\vlen\()end\w\h:
> +.endr
> +\name\vlen\()end\w:

These labels lead to nowhere? If you actually mean to implicitly fall through
to the next function, you can use the function name directly rather than add
odd labels.
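For instance, something along these lines (an untested sketch; the per-width
entry points such as ff_vvc_avg_w2_8_rvv_128 are hypothetical names, and the
height dispatch is kept as in the patch):

func ff_vvc_avg_w2_8_rvv_128, zve32x
        li t3, 2
        // width mismatch: continue directly at the next width's function
        bne a4, t3, ff_vvc_avg_w4_8_rvv_128
.irp h,2,4,8,16,32,64,128
        li t4, \h
        bne a5, t4, 1f
        avg 2, \h, 128
        ret
1:
.endr
endfunc

func ff_vvc_avg_w4_8_rvv_128, zve32x
        li t3, 4
        bne a4, t3, ff_vvc_avg_w8_8_rvv_128
        // same pattern for each remaining width
endfunc

That way every branch targets a real symbol with code behind it, and nothing
relies on execution falling off the end of a function.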
> +.endr
> +endfunc
> +.endm
> +
> +func_avg avg 256
> +func_avg avg 128
> +#if (__riscv_xlen == 64)
> +func_avg w_avg 256
> +func_avg w_avg 128
> +#endif
> diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
> new file mode 100644
> index 0000000000..d26b4c1c4a
> --- /dev/null
> +++ b/libavcodec/riscv/vvcdsp_init.c
> @@ -0,0 +1,76 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd, opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt)                                              \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,             \
> +    const int16_t *src0, const int16_t *src1, int width, int height);        \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,           \
> +    const int16_t *src0, const int16_t *src1, int width, int height,         \
> +    int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +#define AVG_INIT(bd, opt) do {                                               \
> +    c->inter.avg = bf(ff_vvc_avg, bd, opt);                                  \
> +    c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt);                              \
> +} while (0)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> +    const int flags = av_get_cpu_flags();
> +
> +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +        ff_rv_vlen_least(256)) {
> +        switch (bd) {
> +        case 8:
> +            c->inter.avg = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> +            c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> +# endif
> +            break;
> +        default:
> +            break;
> +        }
> +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +               ff_rv_vlen_least(128)) {
> +        switch (bd) {
> +        case 8:
> +            c->inter.avg = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> +            c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> +# endif
> +            break;
> +        default:
> +            break;
> +        }
> +    }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 41e830a98a..c55a37d255 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
>          break;
>      }
> 
> -#if ARCH_X86
> +#if ARCH_RISCV
> +    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> +#elif ARCH_X86
>      ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
>  #endif
>  }
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 9810ac314c..dcb978549f 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
> 
>  void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
> 
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
>  void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
> 
>  #endif /* AVCODEC_VVC_DSP_H */

-- 
Rémi Denis-Courmont
http://www.remlab.net/

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".