* [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy [not found] <20240513165926.1467967-1-uk7b@foxmail.com> @ 2024-05-13 16:59 ` uk7b 2024-05-13 19:55 ` Rémi Denis-Courmont 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 3/9] lavc/vp9dsp: R-V V ipred hor uk7b ` (6 subsequent siblings) 7 siblings, 1 reply; 25+ messages in thread From: uk7b @ 2024-05-13 16:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_put4_8bpp_c: 0.7 vp9_put4_8bpp_rvi: 0.5 vp9_put8_8bpp_c: 2.5 vp9_put8_8bpp_rvi: 0.5 vp9_put16_8bpp_c: 16.7 vp9_put16_8bpp_rvi: 1.5 vp9_put32_8bpp_c: 37.2 vp9_put32_8bpp_rvi: 5.7 vp9_put64_8bpp_c: 107.5 vp9_put64_8bpp_rvi: 21.7 --- libavcodec/riscv/Makefile | 3 +- libavcodec/riscv/vp9_mc_rvi.S | 105 +++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 3 + libavcodec/riscv/vp9dsp_init.c | 28 +++++++++ 4 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/vp9_mc_rvi.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index ccd060c666..0cd900104f 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -62,7 +62,8 @@ OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o -RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o +RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \ + riscv/vp9_mc_rvi.o RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o diff --git a/libavcodec/riscv/vp9_mc_rvi.S b/libavcodec/riscv/vp9_mc_rvi.S new file mode 100644 index 0000000000..0db14e83c7 --- /dev/null +++ b/libavcodec/riscv/vp9_mc_rvi.S @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +#if __riscv_xlen >= 64 +func ff_copy64_rvi +1: + addi a4, a4, -1 + ld t0, (a2) + ld t1, 8(a2) + ld t2, 16(a2) + ld t3, 24(a2) + ld t4, 32(a2) + ld t5, 40(a2) + ld t6, 48(a2) + ld a7, 56(a2) + sd t0, (a0) + sd t1, 8(a0) + sd t2, 16(a0) + sd t3, 24(a0) + sd t4, 32(a0) + sd t5, 40(a0) + sd t6, 48(a0) + sd a7, 56(a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc + +func ff_copy32_rvi +1: + addi a4, a4, -1 + ld t0, (a2) + ld t1, 8(a2) + ld t2, 16(a2) + ld t3, 24(a2) + sd t0, (a0) + sd t1, 8(a0) + sd t2, 16(a0) + sd t3, 24(a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc + +func ff_copy16_rvi +1: + addi a4, a4, -1 + ld t0, (a2) + ld t1, 8(a2) + sd t0, (a0) + sd t1, 8(a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc + +func ff_copy8_rvi +1: + addi a4, a4, -1 + ld t0, (a2) + sd t0, (a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc +#endif + +func ff_copy4_rvi +1: + addi a4, a4, -1 + lw t0, (a2) + sw t0, (a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index f8bc6563a5..b8ff282f8a 100644 --- a/libavcodec/riscv/vp9dsp.h +++ 
b/libavcodec/riscv/vp9dsp.h @@ -167,6 +167,9 @@ void ff_copy##SIZE##_rvi(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); +VP9_COPY_RISCV_RVI_FUNC(64); +VP9_COPY_RISCV_RVI_FUNC(32); +VP9_COPY_RISCV_RVI_FUNC(16); VP9_COPY_RISCV_RVI_FUNC(8); VP9_COPY_RISCV_RVI_FUNC(4); diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index 0f64afc6d2..dace51cf06 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -24,6 +24,33 @@ #include "libavcodec/vp9dsp.h" #include "vp9dsp.h" +static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) +{ +#if HAVE_RV + int flags = av_get_cpu_flags(); + +# if __riscv_xlen >= 64 + if (bpp == 8 && (flags & AV_CPU_FLAG_RV_MISALIGNED)) { + +#define init_fpel(idx1, sz) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][0][0][0] = ff_copy##sz##_rvi; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][0][0][0] = ff_copy##sz##_rvi; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][0][0][0] = ff_copy##sz##_rvi; \ + dsp->mc[idx1][FILTER_BILINEAR ][0][0][0] = ff_copy##sz##_rvi + + init_fpel(0, 64); + init_fpel(1, 32); + init_fpel(2, 16); + init_fpel(3, 8); + init_fpel(4, 4); + +#undef init_fpel + } +# endif + +#endif +} + static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) { #if HAVE_RV @@ -67,4 +94,5 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) av_cold void ff_vp9dsp_init_riscv(VP9DSPContext *dsp, int bpp, int bitexact) { vp9dsp_intrapred_init_riscv(dsp, bpp); + vp9dsp_mc_init_riscv(dsp, bpp); } -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy uk7b @ 2024-05-13 19:55 ` Rémi Denis-Courmont 2024-05-14 4:44 ` flow gg 0 siblings, 1 reply; 25+ messages in thread From: Rémi Denis-Courmont @ 2024-05-13 19:55 UTC (permalink / raw) To: ffmpeg-devel Le maanantaina 13. toukokuuta 2024, 19.59.19 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908: > vp9_put4_8bpp_c: 0.7 > vp9_put4_8bpp_rvi: 0.5 > vp9_put8_8bpp_c: 2.5 > vp9_put8_8bpp_rvi: 0.5 > vp9_put16_8bpp_c: 16.7 > vp9_put16_8bpp_rvi: 1.5 > vp9_put32_8bpp_c: 37.2 > vp9_put32_8bpp_rvi: 5.7 > vp9_put64_8bpp_c: 107.5 > vp9_put64_8bpp_rvi: 21.7 This patch does not produce any (new) results here though? -- レミ・デニ-クールモン http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy 2024-05-13 19:55 ` Rémi Denis-Courmont @ 2024-05-14 4:44 ` flow gg 2024-05-14 17:02 ` Rémi Denis-Courmont 0 siblings, 1 reply; 25+ messages in thread From: flow gg @ 2024-05-14 4:44 UTC (permalink / raw) To: FFmpeg development discussions and patches Locally I am using `if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) {`, which performs better than C on k230/banana_f3. For the emailed patch, I followed "[FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: restrict RVI optimisations" and changed the check to `if (bpp == 8 && (flags & AV_CPU_FLAG_RV_MISALIGNED)) {`. That is why there is no output, but I think the same modification should be made here? Rémi Denis-Courmont <remi@remlab.net> 于2024年5月14日周二 03:55写道: > Le maanantaina 13. toukokuuta 2024, 19.59.19 EEST uk7b@foxmail.com a > écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908: > > vp9_put4_8bpp_c: 0.7 > > vp9_put4_8bpp_rvi: 0.5 > > vp9_put8_8bpp_c: 2.5 > > vp9_put8_8bpp_rvi: 0.5 > > vp9_put16_8bpp_c: 16.7 > > vp9_put16_8bpp_rvi: 1.5 > > vp9_put32_8bpp_c: 37.2 > > vp9_put32_8bpp_rvi: 5.7 > > vp9_put64_8bpp_c: 107.5 > > vp9_put64_8bpp_rvi: 21.7 > > This patch does not produce any (new) results here though? > > -- > レミ・デニ-クールモン > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy 2024-05-14 4:44 ` flow gg @ 2024-05-14 17:02 ` Rémi Denis-Courmont 2024-05-14 17:20 ` flow gg 0 siblings, 1 reply; 25+ messages in thread From: Rémi Denis-Courmont @ 2024-05-14 17:02 UTC (permalink / raw) To: FFmpeg development discussions and patches Le tiistaina 14. toukokuuta 2024, 7.44.55 EEST flow gg a écrit : > I am locally using: > if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) { > this performs better on k230/banana_f3 than C. > For email, refer to [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: restrict RVI > optimisations and change it to > if (bpp == 8 && (flags & AV_CPU_FLAG_RV_MISALIGNED)) { > So no output, but I think the same modification should be made here? I just can't get any benchmarks out of checkasm. Even if I comment out the MISALIGNED flag check, this is not reporting anything. I tested with only patch 1/9 and 2/9, not the following. I don't know why. -- 雷米‧德尼-库尔蒙 http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy 2024-05-14 17:02 ` Rémi Denis-Courmont @ 2024-05-14 17:20 ` flow gg 0 siblings, 0 replies; 25+ messages in thread From: flow gg @ 2024-05-14 17:20 UTC (permalink / raw) To: FFmpeg development discussions and patches Using this check will give output: `if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) {`. Did you comment out the MISALIGNED flag check without adding the RVI check, resulting in no output? Rémi Denis-Courmont <remi@remlab.net> 于2024年5月15日周三 01:02写道: > Le tiistaina 14. toukokuuta 2024, 7.44.55 EEST flow gg a écrit : > > I am locally using: > > if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) { > > this performs better on k230/banana_f3 than C. > > For email, refer to [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: restrict RVI > > optimisations and change it to > > if (bpp == 8 && (flags & AV_CPU_FLAG_RV_MISALIGNED)) { > > So no output, but I think the same modification should be made here? > > I just can't get any benchmarks out of checkasm. Even if I comment out the > MISALIGNED flag check, this is not reporting anything. I tested with only > patch > 1/9 and 2/9, not the following. I don't know why. > > -- > 雷米‧德尼-库尔蒙 > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH v3 3/9] lavc/vp9dsp: R-V V ipred hor [not found] <20240513165926.1467967-1-uk7b@foxmail.com> 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy uk7b @ 2024-05-13 16:59 ` uk7b 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm uk7b ` (5 subsequent siblings) 7 siblings, 0 replies; 25+ messages in thread From: uk7b @ 2024-05-13 16:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_hor_8x8_8bpp_c: 74.7 vp9_hor_8x8_8bpp_rvv_i32: 35.7 vp9_hor_16x16_8bpp_c: 175.5 vp9_hor_16x16_8bpp_rvv_i32: 80.2 vp9_hor_32x32_8bpp_c: 510.2 vp9_hor_32x32_8bpp_rvv_i32: 264.0 --- libavcodec/riscv/vp9_intra_rvv.S | 56 ++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 6 ++++ libavcodec/riscv/vp9dsp_init.c | 3 ++ 3 files changed, 65 insertions(+) diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S index 40e38ba83e..ca156d65cd 100644 --- a/libavcodec/riscv/vp9_intra_rvv.S +++ b/libavcodec/riscv/vp9_intra_rvv.S @@ -117,3 +117,59 @@ func_dc dc_left 8 left 3 0 zve64x func_dc dc_top 32 top 5 1 zve32x func_dc dc_top 16 top 4 1 zve32x func_dc dc_top 8 top 3 0 zve64x + +func ff_h_32x32_rvv, zve32x + li t0, 32 + addi a2, a2, 31 + vsetvli zero, t0, e8, m2, ta, ma + + .rept 2 + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + lbu t1, (a2) + addi a2, a2, -1 + vmv.v.x v\n, t1 + .endr + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + .endr + + ret +endfunc + +func ff_h_16x16_rvv, zve32x + addi a2, a2, 15 + vsetivli zero, 16, e8, m1, ta, ma + + .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 + lbu t1, (a2) + addi a2, a2, -1 + vmv.v.x v\n, t1 + .endr + .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vse8.v v23, (a0) + + ret +endfunc + +func ff_h_8x8_rvv, zve32x + addi a2, a2, 7 + 
vsetivli zero, 8, e8, mf2, ta, ma + + .irp n 8, 9, 10, 11, 12, 13, 14, 15 + lbu t1, (a2) + addi a2, a2, -1 + vmv.v.x v\n, t1 + .endr + .irp n 8, 9, 10, 11, 12, 13, 14 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vse8.v v15, (a0) + + ret +endfunc diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index b8ff282f8a..0ad961c7e0 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); +void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index dace51cf06..eab3e9cb0a 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -86,6 +86,9 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv; dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv; dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv; + dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; + dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; + dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; } #endif #endif -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm [not found] <20240513165926.1467967-1-uk7b@foxmail.com> 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy uk7b 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 3/9] lavc/vp9dsp: R-V V ipred hor uk7b @ 2024-05-13 16:59 ` uk7b 2024-05-14 17:45 ` Rémi Denis-Courmont 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg uk7b ` (4 subsequent siblings) 7 siblings, 1 reply; 25+ messages in thread From: uk7b @ 2024-05-13 16:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_tm_4x4_8bpp_c: 116.5 vp9_tm_4x4_8bpp_rvv_i32: 43.5 vp9_tm_8x8_8bpp_c: 416.2 vp9_tm_8x8_8bpp_rvv_i32: 86.0 vp9_tm_16x16_8bpp_c: 1665.5 vp9_tm_16x16_8bpp_rvv_i32: 187.2 vp9_tm_32x32_8bpp_c: 6974.2 vp9_tm_32x32_8bpp_rvv_i32: 625.7 --- libavcodec/riscv/vp9_intra_rvv.S | 141 +++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 8 ++ libavcodec/riscv/vp9dsp_init.c | 4 + 3 files changed, 153 insertions(+) diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..7e1046bc13 100644 --- a/libavcodec/riscv/vp9_intra_rvv.S +++ b/libavcodec/riscv/vp9_intra_rvv.S @@ -173,3 +173,144 @@ func ff_h_8x8_rvv, zve32x ret endfunc + +.macro tm_sum dst, top, offset + lbu t3, \offset(a2) + sub t3, t3, a4 + vadd.vx \dst, \top, t3 +.endm + +func ff_tm_32x32_rvv, zve32x + lbu a4, -1(a3) + li t5, 32 + + .macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8 + vsetvli zero, t5, e16, m4, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + + tm_sum v0, v28, \n1 + tm_sum v4, v28, \n2 + tm_sum v8, v28, \n3 + tm_sum v12, v28, \n4 + tm_sum v16, v28, \n5 + tm_sum v20, v28, \n6 + tm_sum v24, v28, \n7 + tm_sum v28, v28, \n8 + + .irp n 0, 4, 8, 12, 16, 20, 24, 28 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, m2, ta, ma + .irp n 0, 4, 8, 12, 16, 20, 24, 28 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + .endm + + tm_sum32 31, 
30, 29, 28, 27, 26, 25, 24 + tm_sum32 23, 22, 21, 20, 19, 18, 17, 16 + tm_sum32 15, 14, 13, 12, 11, 10, 9, 8 + tm_sum32 7, 6, 5, 4, 3, 2, 1, 0 + + ret +endfunc + +func ff_tm_16x16_rvv, zve32x + vsetivli zero, 16, e16, m2, ta, ma + vle8.v v8, (a3) + vzext.vf2 v30, v8 + lbu a4, -1(a3) + + tm_sum v0, v30, 15 + tm_sum v2, v30, 14 + tm_sum v4, v30, 13 + tm_sum v6, v30, 12 + tm_sum v8, v30, 11 + tm_sum v10, v30, 10 + tm_sum v12, v30, 9 + tm_sum v14, v30, 8 + tm_sum v16, v30, 7 + tm_sum v18, v30, 6 + tm_sum v20, v30, 5 + tm_sum v22, v30, 4 + tm_sum v24, v30, 3 + tm_sum v26, v30, 2 + tm_sum v28, v30, 1 + tm_sum v30, v30, 0 + + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, m1, ta, ma + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v30, v30, 0 + vse8.v v30, (a0) + + ret +endfunc + +func ff_tm_8x8_rvv, zve32x + vsetivli zero, 8, e16, m1, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + lbu a4, -1(a3) + + tm_sum v16, v28, 7 + tm_sum v17, v28, 6 + tm_sum v18, v28, 5 + tm_sum v19, v28, 4 + tm_sum v20, v28, 3 + tm_sum v21, v28, 2 + tm_sum v22, v28, 1 + tm_sum v23, v28, 0 + + .irp n 16, 17, 18, 19, 20, 21, 22, 23 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, mf2, ta, ma + .irp n 16, 17, 18, 19, 20, 21, 22 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v24, v23, 0 + vse8.v v24, (a0) + + ret +endfunc + +func ff_tm_4x4_rvv, zve32x + vsetivli zero, 4, e16, mf2, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + lbu a4, -1(a3) + + tm_sum v16, v28, 3 + tm_sum v17, v28, 2 + tm_sum v18, v28, 1 + tm_sum v19, v28, 0 + + .irp n 16, 17, 18, 19 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, mf4, ta, ma + .irp n 16, 17, 18 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v24, v19, 0 + vse8.v v24, (a0) + + ret +endfunc diff --git 
a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index 0ad961c7e0..79330b4968 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index eab3e9cb0a..184fadbaf7 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -89,6 +89,10 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; } #endif #endif -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm uk7b @ 2024-05-14 17:45 ` Rémi Denis-Courmont 2024-05-14 17:57 ` flow gg 0 siblings, 1 reply; 25+ messages in thread From: Rémi Denis-Courmont @ 2024-05-14 17:45 UTC (permalink / raw) To: ffmpeg-devel Le maanantaina 13. toukokuuta 2024, 19.59.21 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908: > vp9_tm_4x4_8bpp_c: 116.5 > vp9_tm_4x4_8bpp_rvv_i32: 43.5 > vp9_tm_8x8_8bpp_c: 416.2 > vp9_tm_8x8_8bpp_rvv_i32: 86.0 > vp9_tm_16x16_8bpp_c: 1665.5 > vp9_tm_16x16_8bpp_rvv_i32: 187.2 > vp9_tm_32x32_8bpp_c: 6974.2 > vp9_tm_32x32_8bpp_rvv_i32: 625.7 > --- > libavcodec/riscv/vp9_intra_rvv.S | 141 +++++++++++++++++++++++++++++++ > libavcodec/riscv/vp9dsp.h | 8 ++ > libavcodec/riscv/vp9dsp_init.c | 4 + > 3 files changed, 153 insertions(+) > > diff --git a/libavcodec/riscv/vp9_intra_rvv.S > b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..7e1046bc13 100644 > --- a/libavcodec/riscv/vp9_intra_rvv.S > +++ b/libavcodec/riscv/vp9_intra_rvv.S > @@ -173,3 +173,144 @@ func ff_h_8x8_rvv, zve32x > > ret > endfunc > + > +.macro tm_sum dst, top, offset > + lbu t3, \offset(a2) > + sub t3, t3, a4 > + vadd.vx \dst, \top, t3 The macro saves some copycat code, but it seems to prevent good scheduling. Consuming t3 right after loading it is not ideal. > +.endm > + > +func ff_tm_32x32_rvv, zve32x > + lbu a4, -1(a3) > + li t5, 32 > + > + .macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8 > + vsetvli zero, t5, e16, m4, ta, ma AFAICT, you do not need to reset the vector configuration every time. 
> + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + > + tm_sum v0, v28, \n1 > + tm_sum v4, v28, \n2 > + tm_sum v8, v28, \n3 > + tm_sum v12, v28, \n4 > + tm_sum v16, v28, \n5 > + tm_sum v20, v28, \n6 > + tm_sum v24, v28, \n7 > + tm_sum v28, v28, \n8 > + > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, m2, ta, ma > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + .endm > + > + tm_sum32 31, 30, 29, 28, 27, 26, 25, 24 > + tm_sum32 23, 22, 21, 20, 19, 18, 17, 16 > + tm_sum32 15, 14, 13, 12, 11, 10, 9, 8 > + tm_sum32 7, 6, 5, 4, 3, 2, 1, 0 > + > + ret > +endfunc > + > +func ff_tm_16x16_rvv, zve32x > + vsetivli zero, 16, e16, m2, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v30, v8 > + lbu a4, -1(a3) > + > + tm_sum v0, v30, 15 > + tm_sum v2, v30, 14 > + tm_sum v4, v30, 13 > + tm_sum v6, v30, 12 > + tm_sum v8, v30, 11 > + tm_sum v10, v30, 10 > + tm_sum v12, v30, 9 > + tm_sum v14, v30, 8 > + tm_sum v16, v30, 7 > + tm_sum v18, v30, 6 > + tm_sum v20, v30, 5 > + tm_sum v22, v30, 4 > + tm_sum v24, v30, 3 > + tm_sum v26, v30, 2 > + tm_sum v28, v30, 1 > + tm_sum v30, v30, 0 > + > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, m1, ta, ma > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v30, v30, 0 > + vse8.v v30, (a0) > + > + ret > +endfunc > + > +func ff_tm_8x8_rvv, zve32x > + vsetivli zero, 8, e16, m1, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + lbu a4, -1(a3) > + > + tm_sum v16, v28, 7 > + tm_sum v17, v28, 6 > + tm_sum v18, v28, 5 > + tm_sum v19, v28, 4 > + tm_sum v20, v28, 3 > + tm_sum v21, v28, 2 > + tm_sum v22, v28, 1 > + tm_sum v23, v28, 0 > + > + .irp n 16, 17, 18, 19, 20, 21, 22, 23 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, 
mf2, ta, ma > + .irp n 16, 17, 18, 19, 20, 21, 22 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v24, v23, 0 > + vse8.v v24, (a0) > + > + ret > +endfunc > + > +func ff_tm_4x4_rvv, zve32x > + vsetivli zero, 4, e16, mf2, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + lbu a4, -1(a3) > + > + tm_sum v16, v28, 3 > + tm_sum v17, v28, 2 > + tm_sum v18, v28, 1 > + tm_sum v19, v28, 0 > + > + .irp n 16, 17, 18, 19 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, mf4, ta, ma > + .irp n 16, 17, 18 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v24, v19, 0 > + vse8.v v24, (a0) > + > + ret > +endfunc > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h > index 0ad961c7e0..79330b4968 100644 > --- a/libavcodec/riscv/vp9dsp.h > +++ b/libavcodec/riscv/vp9dsp.h > @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const > uint8_t *l, const uint8_t *a); > void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > const uint8_t *a); > +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > > #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) > \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t > dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c > b/libavcodec/riscv/vp9dsp_init.c index eab3e9cb0a..184fadbaf7 100644 > --- a/libavcodec/riscv/vp9dsp_init.c > +++ b/libavcodec/riscv/vp9dsp_init.c > @@ -89,6 +89,10 @@ static av_cold void > vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) > dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; > dsp->intra_pred[TX_16X16][HOR_PRED] 
= ff_h_16x16_rvv; > dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; > + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; > + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; > + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; > + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; > } > #endif > #endif -- レミ・デニ-クールモン http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm 2024-05-14 17:45 ` Rémi Denis-Courmont @ 2024-05-14 17:57 ` flow gg 2024-05-14 18:08 ` Rémi Denis-Courmont 0 siblings, 1 reply; 25+ messages in thread From: flow gg @ 2024-05-14 17:57 UTC (permalink / raw) To: FFmpeg development discussions and patches Why is it unnecessary to reset the vector configuration every time? I think it is necessary to reset e16/e8 each time. Rémi Denis-Courmont <remi@remlab.net> 于2024年5月15日周三 01:46写道: > Le maanantaina 13. toukokuuta 2024, 19.59.21 EEST uk7b@foxmail.com a > écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908: > > vp9_tm_4x4_8bpp_c: 116.5 > > vp9_tm_4x4_8bpp_rvv_i32: 43.5 > > vp9_tm_8x8_8bpp_c: 416.2 > > vp9_tm_8x8_8bpp_rvv_i32: 86.0 > > vp9_tm_16x16_8bpp_c: 1665.5 > > vp9_tm_16x16_8bpp_rvv_i32: 187.2 > > vp9_tm_32x32_8bpp_c: 6974.2 > > vp9_tm_32x32_8bpp_rvv_i32: 625.7 > > --- > > libavcodec/riscv/vp9_intra_rvv.S | 141 +++++++++++++++++++++++++++++++ > > libavcodec/riscv/vp9dsp.h | 8 ++ > > libavcodec/riscv/vp9dsp_init.c | 4 + > > 3 files changed, 153 insertions(+) > > > > diff --git a/libavcodec/riscv/vp9_intra_rvv.S > > b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..7e1046bc13 100644 > > --- a/libavcodec/riscv/vp9_intra_rvv.S > > +++ b/libavcodec/riscv/vp9_intra_rvv.S > > @@ -173,3 +173,144 @@ func ff_h_8x8_rvv, zve32x > > > > ret > > endfunc > > + > > +.macro tm_sum dst, top, offset > > + lbu t3, \offset(a2) > > + sub t3, t3, a4 > > + vadd.vx \dst, \top, t3 > > The macro saves some copycat code, but it seems to prevent good > scheduling. > Consuming t3 right after loading it is not ideal. > > > +.endm > > + > > +func ff_tm_32x32_rvv, zve32x > > + lbu a4, -1(a3) > > + li t5, 32 > > + > > + .macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8 > > + vsetvli zero, t5, e16, m4, ta, ma > > AFAICT, you do not need to reset the vector configuration every time. 
> > > + vle8.v v8, (a3) > > + vzext.vf2 v28, v8 > > + > > + tm_sum v0, v28, \n1 > > + tm_sum v4, v28, \n2 > > + tm_sum v8, v28, \n3 > > + tm_sum v12, v28, \n4 > > + tm_sum v16, v28, \n5 > > + tm_sum v20, v28, \n6 > > + tm_sum v24, v28, \n7 > > + tm_sum v28, v28, \n8 > > + > > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > > + vmax.vx v\n, v\n, zero > > + .endr > > + > > + vsetvli zero, zero, e8, m2, ta, ma > > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > > + vnclipu.wi v\n, v\n, 0 > > + vse8.v v\n, (a0) > > + add a0, a0, a1 > > + .endr > > + .endm > > + > > + tm_sum32 31, 30, 29, 28, 27, 26, 25, 24 > > + tm_sum32 23, 22, 21, 20, 19, 18, 17, 16 > > + tm_sum32 15, 14, 13, 12, 11, 10, 9, 8 > > + tm_sum32 7, 6, 5, 4, 3, 2, 1, 0 > > + > > + ret > > +endfunc > > + > > +func ff_tm_16x16_rvv, zve32x > > + vsetivli zero, 16, e16, m2, ta, ma > > + vle8.v v8, (a3) > > + vzext.vf2 v30, v8 > > + lbu a4, -1(a3) > > + > > + tm_sum v0, v30, 15 > > + tm_sum v2, v30, 14 > > + tm_sum v4, v30, 13 > > + tm_sum v6, v30, 12 > > + tm_sum v8, v30, 11 > > + tm_sum v10, v30, 10 > > + tm_sum v12, v30, 9 > > + tm_sum v14, v30, 8 > > + tm_sum v16, v30, 7 > > + tm_sum v18, v30, 6 > > + tm_sum v20, v30, 5 > > + tm_sum v22, v30, 4 > > + tm_sum v24, v30, 3 > > + tm_sum v26, v30, 2 > > + tm_sum v28, v30, 1 > > + tm_sum v30, v30, 0 > > + > > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 > > + vmax.vx v\n, v\n, zero > > + .endr > > + > > + vsetvli zero, zero, e8, m1, ta, ma > > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 > > + vnclipu.wi v\n, v\n, 0 > > + vse8.v v\n, (a0) > > + add a0, a0, a1 > > + .endr > > + vnclipu.wi v30, v30, 0 > > + vse8.v v30, (a0) > > + > > + ret > > +endfunc > > + > > +func ff_tm_8x8_rvv, zve32x > > + vsetivli zero, 8, e16, m1, ta, ma > > + vle8.v v8, (a3) > > + vzext.vf2 v28, v8 > > + lbu a4, -1(a3) > > + > > + tm_sum v16, v28, 7 > > + tm_sum v17, v28, 6 > > + tm_sum v18, v28, 5 > > + tm_sum v19, v28, 4 > > + tm_sum v20, v28, 3 > > + tm_sum v21, 
v28, 2 > > + tm_sum v22, v28, 1 > > + tm_sum v23, v28, 0 > > + > > + .irp n 16, 17, 18, 19, 20, 21, 22, 23 > > + vmax.vx v\n, v\n, zero > > + .endr > > + > > + vsetvli zero, zero, e8, mf2, ta, ma > > + .irp n 16, 17, 18, 19, 20, 21, 22 > > + vnclipu.wi v\n, v\n, 0 > > + vse8.v v\n, (a0) > > + add a0, a0, a1 > > + .endr > > + vnclipu.wi v24, v23, 0 > > + vse8.v v24, (a0) > > + > > + ret > > +endfunc > > + > > +func ff_tm_4x4_rvv, zve32x > > + vsetivli zero, 4, e16, mf2, ta, ma > > + vle8.v v8, (a3) > > + vzext.vf2 v28, v8 > > + lbu a4, -1(a3) > > + > > + tm_sum v16, v28, 3 > > + tm_sum v17, v28, 2 > > + tm_sum v18, v28, 1 > > + tm_sum v19, v28, 0 > > + > > + .irp n 16, 17, 18, 19 > > + vmax.vx v\n, v\n, zero > > + .endr > > + > > + vsetvli zero, zero, e8, mf4, ta, ma > > + .irp n 16, 17, 18 > > + vnclipu.wi v\n, v\n, 0 > > + vse8.v v\n, (a0) > > + add a0, a0, a1 > > + .endr > > + vnclipu.wi v24, v19, 0 > > + vse8.v v24, (a0) > > + > > + ret > > +endfunc > > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h > > index 0ad961c7e0..79330b4968 100644 > > --- a/libavcodec/riscv/vp9dsp.h > > +++ b/libavcodec/riscv/vp9dsp.h > > @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, > const > > uint8_t *l, const uint8_t *a); > > void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > const uint8_t *a); > > +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > + const uint8_t *a); > > +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > + const uint8_t *a); > > +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > + const uint8_t *a); > > +void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > + const uint8_t *a); > > > > #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) > > > \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t > > dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c > > b/libavcodec/riscv/vp9dsp_init.c 
index eab3e9cb0a..184fadbaf7 100644 > > --- a/libavcodec/riscv/vp9dsp_init.c > > +++ b/libavcodec/riscv/vp9dsp_init.c > > @@ -89,6 +89,10 @@ static av_cold void > > vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) > > dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; > > dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; > > dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; > > + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; > > + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; > > + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; > > + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; > > } > > #endif > > #endif > > > -- > レミ・デニ-クールモン > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm 2024-05-14 17:57 ` flow gg @ 2024-05-14 18:08 ` Rémi Denis-Courmont 2024-05-14 18:33 ` uk7b 2024-05-14 18:34 ` flow gg 0 siblings, 2 replies; 25+ messages in thread From: Rémi Denis-Courmont @ 2024-05-14 18:08 UTC (permalink / raw) To: FFmpeg development discussions and patches Le tiistaina 14. toukokuuta 2024, 20.57.17 EEST flow gg a écrit : > Why is it unnecessary to reset the vector configuration every time? I think > it is necessary to reset e16/e8 each time. I misread the placement of .endm. OTOH, it seems that you could just write the tm_sum32 with a single parameter, as the other ones are just relative by constant +/-1. -- 雷米‧德尼-库尔蒙 http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm 2024-05-14 18:08 ` Rémi Denis-Courmont @ 2024-05-14 18:33 ` uk7b 2024-05-14 18:34 ` flow gg 1 sibling, 0 replies; 25+ messages in thread From: uk7b @ 2024-05-14 18:33 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_tm_4x4_8bpp_c: 116.5 vp9_tm_4x4_8bpp_rvv_i32: 43.5 vp9_tm_8x8_8bpp_c: 416.2 vp9_tm_8x8_8bpp_rvv_i32: 86.0 vp9_tm_16x16_8bpp_c: 1665.5 vp9_tm_16x16_8bpp_rvv_i32: 187.2 vp9_tm_32x32_8bpp_c: 6974.2 vp9_tm_32x32_8bpp_rvv_i32: 625.7 --- libavcodec/riscv/vp9_intra_rvv.S | 123 +++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 8 ++ libavcodec/riscv/vp9dsp_init.c | 4 + 3 files changed, 135 insertions(+) diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..5fb546c12d 100644 --- a/libavcodec/riscv/vp9_intra_rvv.S +++ b/libavcodec/riscv/vp9_intra_rvv.S @@ -173,3 +173,126 @@ func ff_h_8x8_rvv, zve32x ret endfunc + +.macro tm_sum4 dst1, dst2, dst3, dst4, top, n1 + lbu t1, \n1(a2) + lbu t2, (\n1-1)(a2) + lbu t3, (\n1-2)(a2) + lbu t4, (\n1-3)(a2) + sub t1, t1, a4 + sub t2, t2, a4 + sub t3, t3, a4 + sub t4, t4, a4 + vadd.vx \dst1, \top, t1 + vadd.vx \dst2, \top, t2 + vadd.vx \dst3, \top, t3 + vadd.vx \dst4, \top, t4 +.endm + +func ff_tm_32x32_rvv, zve32x + lbu a4, -1(a3) + li t5, 32 + + .macro tm_sum32 offset + vsetvli zero, t5, e16, m4, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + + tm_sum4 v0, v4, v8, v12, v28, \offset + tm_sum4 v16, v20, v24, v28, v28, (\offset-4) + + .irp n 0, 4, 8, 12, 16, 20, 24, 28 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, m2, ta, ma + .irp n 0, 4, 8, 12, 16, 20, 24, 28 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + .endm + + tm_sum32 31 + tm_sum32 23 + tm_sum32 15 + tm_sum32 7 + + ret +endfunc + +func ff_tm_16x16_rvv, zve32x + vsetivli zero, 16, e16, m2, ta, ma + vle8.v v8, (a3) + vzext.vf2 v30, v8 + lbu a4, -1(a3) + + tm_sum4 v0, 
v2, v4, v6, v30, 15 + tm_sum4 v8, v10, v12, v14, v30, 11 + tm_sum4 v16, v18, v20, v22, v30, 7 + tm_sum4 v24, v26, v28, v30, v30, 3 + + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, m1, ta, ma + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v30, v30, 0 + vse8.v v30, (a0) + + ret +endfunc + +func ff_tm_8x8_rvv, zve32x + vsetivli zero, 8, e16, m1, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + lbu a4, -1(a3) + + tm_sum4 v16, v17, v18, v19, v28, 7 + tm_sum4 v20, v21, v22, v23, v28, 3 + + .irp n 16, 17, 18, 19, 20, 21, 22, 23 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, mf2, ta, ma + .irp n 16, 17, 18, 19, 20, 21, 22 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v24, v23, 0 + vse8.v v24, (a0) + + ret +endfunc + +func ff_tm_4x4_rvv, zve32x + vsetivli zero, 4, e16, mf2, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + lbu a4, -1(a3) + + tm_sum4 v16, v17, v18, v19, v28, 3 + + .irp n 16, 17, 18, 19 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, mf4, ta, ma + .irp n 16, 17, 18 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v24, v19, 0 + vse8.v v24, (a0) + + ret +endfunc diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index 0ad961c7e0..79330b4968 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void 
ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index eab3e9cb0a..184fadbaf7 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -89,6 +89,10 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; } #endif #endif -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm 2024-05-14 18:08 ` Rémi Denis-Courmont 2024-05-14 18:33 ` uk7b @ 2024-05-14 18:34 ` flow gg 2024-05-15 3:55 ` [FFmpeg-devel] [PATCH " uk7b 1 sibling, 1 reply; 25+ messages in thread From: flow gg @ 2024-05-14 18:34 UTC (permalink / raw) To: FFmpeg development discussions and patches > The macro saves some copycat code, but it seems to prevent good scheduling. > Consuming t3 right after loading it is not ideal. > OTOH, it seems that you could just write the tm_sum32 with a single parameter, > as the other ones are just relative by constant +/-1. Okay, updated it in the reply Rémi Denis-Courmont <remi@remlab.net> 于2024年5月15日周三 02:08写道: > Le tiistaina 14. toukokuuta 2024, 20.57.17 EEST flow gg a écrit : > > Why is it unnecessary to reset the vector configuration every time? I > think > > it is necessary to reset e16/e8 each time. > > I misread the placement of .endm > > OTOH, it seems that you could just write the tm_sum32 with a single > parameter, > as the other ones are just relative by constant +/-1. > > -- > 雷米‧德尼-库尔蒙 > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH 4/9] lavc/vp9dsp: R-V V ipred tm 2024-05-14 18:34 ` flow gg @ 2024-05-15 3:55 ` uk7b 2024-05-15 3:56 ` flow gg 0 siblings, 1 reply; 25+ messages in thread From: uk7b @ 2024-05-15 3:55 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_tm_4x4_8bpp_c: 116.5 vp9_tm_4x4_8bpp_rvv_i32: 43.5 vp9_tm_8x8_8bpp_c: 416.2 vp9_tm_8x8_8bpp_rvv_i32: 86.0 vp9_tm_16x16_8bpp_c: 1665.5 vp9_tm_16x16_8bpp_rvv_i32: 187.2 vp9_tm_32x32_8bpp_c: 6974.2 vp9_tm_32x32_8bpp_rvv_i32: 625.7 --- libavcodec/riscv/vp9_intra_rvv.S | 118 +++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 8 +++ libavcodec/riscv/vp9dsp_init.c | 4 ++ 3 files changed, 130 insertions(+) diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..280c497687 100644 --- a/libavcodec/riscv/vp9_intra_rvv.S +++ b/libavcodec/riscv/vp9_intra_rvv.S @@ -173,3 +173,121 @@ func ff_h_8x8_rvv, zve32x ret endfunc + +.macro tm_sum4 dst1, dst2, dst3, dst4, top, n1 + lbu t1, \n1(a2) + lbu t2, (\n1-1)(a2) + lbu t3, (\n1-2)(a2) + lbu t4, (\n1-3)(a2) + sub t1, t1, a4 + sub t2, t2, a4 + sub t3, t3, a4 + sub t4, t4, a4 + vadd.vx \dst1, \top, t1 + vadd.vx \dst2, \top, t2 + vadd.vx \dst3, \top, t3 + vadd.vx \dst4, \top, t4 +.endm + +func ff_tm_32x32_rvv, zve32x + lbu a4, -1(a3) + li t5, 32 + + .irp offset 31, 23, 15, 7 + vsetvli zero, t5, e16, m4, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + + tm_sum4 v0, v4, v8, v12, v28, \offset + tm_sum4 v16, v20, v24, v28, v28, (\offset-4) + + .irp n 0, 4, 8, 12, 16, 20, 24, 28 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, m2, ta, ma + .irp n 0, 4, 8, 12, 16, 20, 24, 28 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + .endr + + ret +endfunc + +func ff_tm_16x16_rvv, zve32x + vsetivli zero, 16, e16, m2, ta, ma + vle8.v v8, (a3) + vzext.vf2 v30, v8 + lbu a4, -1(a3) + + tm_sum4 v0, v2, v4, v6, v30, 15 + tm_sum4 v8, v10, v12, v14, v30, 11 + tm_sum4 v16, 
v18, v20, v22, v30, 7 + tm_sum4 v24, v26, v28, v30, v30, 3 + + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, m1, ta, ma + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v30, v30, 0 + vse8.v v30, (a0) + + ret +endfunc + +func ff_tm_8x8_rvv, zve32x + vsetivli zero, 8, e16, m1, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + lbu a4, -1(a3) + + tm_sum4 v16, v17, v18, v19, v28, 7 + tm_sum4 v20, v21, v22, v23, v28, 3 + + .irp n 16, 17, 18, 19, 20, 21, 22, 23 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, mf2, ta, ma + .irp n 16, 17, 18, 19, 20, 21, 22 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v24, v23, 0 + vse8.v v24, (a0) + + ret +endfunc + +func ff_tm_4x4_rvv, zve32x + vsetivli zero, 4, e16, mf2, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + lbu a4, -1(a3) + + tm_sum4 v16, v17, v18, v19, v28, 3 + + .irp n 16, 17, 18, 19 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, mf4, ta, ma + .irp n 16, 17, 18 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v24, v19, 0 + vse8.v v24, (a0) + + ret +endfunc diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index 0ad961c7e0..79330b4968 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const 
uint8_t *a); #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index eab3e9cb0a..184fadbaf7 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -89,6 +89,10 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; } #endif #endif -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/9] lavc/vp9dsp: R-V V ipred tm 2024-05-15 3:55 ` [FFmpeg-devel] [PATCH " uk7b @ 2024-05-15 3:56 ` flow gg 0 siblings, 0 replies; 25+ messages in thread From: flow gg @ 2024-05-15 3:56 UTC (permalink / raw) To: FFmpeg development discussions and patches updated for clean code <uk7b@foxmail.com> 于2024年5月15日周三 11:56写道: > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908: > vp9_tm_4x4_8bpp_c: 116.5 > vp9_tm_4x4_8bpp_rvv_i32: 43.5 > vp9_tm_8x8_8bpp_c: 416.2 > vp9_tm_8x8_8bpp_rvv_i32: 86.0 > vp9_tm_16x16_8bpp_c: 1665.5 > vp9_tm_16x16_8bpp_rvv_i32: 187.2 > vp9_tm_32x32_8bpp_c: 6974.2 > vp9_tm_32x32_8bpp_rvv_i32: 625.7 > --- > libavcodec/riscv/vp9_intra_rvv.S | 118 +++++++++++++++++++++++++++++++ > libavcodec/riscv/vp9dsp.h | 8 +++ > libavcodec/riscv/vp9dsp_init.c | 4 ++ > 3 files changed, 130 insertions(+) > > diff --git a/libavcodec/riscv/vp9_intra_rvv.S > b/libavcodec/riscv/vp9_intra_rvv.S > index ca156d65cd..280c497687 100644 > --- a/libavcodec/riscv/vp9_intra_rvv.S > +++ b/libavcodec/riscv/vp9_intra_rvv.S > @@ -173,3 +173,121 @@ func ff_h_8x8_rvv, zve32x > > ret > endfunc > + > +.macro tm_sum4 dst1, dst2, dst3, dst4, top, n1 > + lbu t1, \n1(a2) > + lbu t2, (\n1-1)(a2) > + lbu t3, (\n1-2)(a2) > + lbu t4, (\n1-3)(a2) > + sub t1, t1, a4 > + sub t2, t2, a4 > + sub t3, t3, a4 > + sub t4, t4, a4 > + vadd.vx \dst1, \top, t1 > + vadd.vx \dst2, \top, t2 > + vadd.vx \dst3, \top, t3 > + vadd.vx \dst4, \top, t4 > +.endm > + > +func ff_tm_32x32_rvv, zve32x > + lbu a4, -1(a3) > + li t5, 32 > + > + .irp offset 31, 23, 15, 7 > + vsetvli zero, t5, e16, m4, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + > + tm_sum4 v0, v4, v8, v12, v28, \offset > + tm_sum4 v16, v20, v24, v28, v28, (\offset-4) > + > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, m2, ta, ma > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + .endr > + > + ret > 
+endfunc > + > +func ff_tm_16x16_rvv, zve32x > + vsetivli zero, 16, e16, m2, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v30, v8 > + lbu a4, -1(a3) > + > + tm_sum4 v0, v2, v4, v6, v30, 15 > + tm_sum4 v8, v10, v12, v14, v30, 11 > + tm_sum4 v16, v18, v20, v22, v30, 7 > + tm_sum4 v24, v26, v28, v30, v30, 3 > + > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, m1, ta, ma > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v30, v30, 0 > + vse8.v v30, (a0) > + > + ret > +endfunc > + > +func ff_tm_8x8_rvv, zve32x > + vsetivli zero, 8, e16, m1, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + lbu a4, -1(a3) > + > + tm_sum4 v16, v17, v18, v19, v28, 7 > + tm_sum4 v20, v21, v22, v23, v28, 3 > + > + .irp n 16, 17, 18, 19, 20, 21, 22, 23 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, mf2, ta, ma > + .irp n 16, 17, 18, 19, 20, 21, 22 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v24, v23, 0 > + vse8.v v24, (a0) > + > + ret > +endfunc > + > +func ff_tm_4x4_rvv, zve32x > + vsetivli zero, 4, e16, mf2, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + lbu a4, -1(a3) > + > + tm_sum4 v16, v17, v18, v19, v28, 3 > + > + .irp n 16, 17, 18, 19 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, mf4, ta, ma > + .irp n 16, 17, 18 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v24, v19, 0 > + vse8.v v24, (a0) > + > + ret > +endfunc > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h > index 0ad961c7e0..79330b4968 100644 > --- a/libavcodec/riscv/vp9dsp.h > +++ b/libavcodec/riscv/vp9dsp.h > @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, > const uint8_t *l, > const uint8_t *a); > void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, 
const uint8_t *l, > const uint8_t *a); > +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > > #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) > \ > void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t > dststride, \ > diff --git a/libavcodec/riscv/vp9dsp_init.c > b/libavcodec/riscv/vp9dsp_init.c > index eab3e9cb0a..184fadbaf7 100644 > --- a/libavcodec/riscv/vp9dsp_init.c > +++ b/libavcodec/riscv/vp9dsp_init.c > @@ -89,6 +89,10 @@ static av_cold void > vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) > dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; > dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; > dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; > + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; > + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; > + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; > + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; > } > #endif > #endif > -- > 2.45.1 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg [not found] <20240513165926.1467967-1-uk7b@foxmail.com> ` (2 preceding siblings ...) 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm uk7b @ 2024-05-13 16:59 ` uk7b 2024-05-17 15:11 ` Rémi Denis-Courmont 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b ` (3 subsequent siblings) 7 siblings, 1 reply; 25+ messages in thread From: uk7b @ 2024-05-13 16:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_avg4_8bpp_c: 1.2 vp9_avg4_8bpp_rvv_i64: 1.0 vp9_avg8_8bpp_c: 3.7 vp9_avg8_8bpp_rvv_i64: 1.5 vp9_avg16_8bpp_c: 14.7 vp9_avg16_8bpp_rvv_i64: 3.5 vp9_avg32_8bpp_c: 57.7 vp9_avg32_8bpp_rvv_i64: 10.0 vp9_avg64_8bpp_c: 229.0 vp9_avg64_8bpp_rvv_i64: 31.7 --- libavcodec/riscv/Makefile | 3 +- libavcodec/riscv/vp9_mc_rvv.S | 58 ++++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp_init.c | 18 +++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/vp9_mc_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 0cd900104f..1183357b37 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -64,6 +64,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \ riscv/vp9_mc_rvi.o -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \ + riscv/vp9_mc_rvv.o OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S new file mode 100644 index 0000000000..5d917e7b98 --- /dev/null +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS). 
+ * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +.macro vsetvlstatic8 len an maxlen mn=m4 +.if \len == 4 + vsetivli zero, \len, e8, mf4, ta, ma +.elseif \len == 8 + vsetivli zero, \len, e8, mf2, ta, ma +.elseif \len == 16 + vsetivli zero, \len, e8, m1, ta, ma +.elseif \len == 32 + li \an, \len + vsetvli zero, \an, e8, m2, ta, ma +.elseif \len == 64 + li \an, \maxlen + vsetvli zero, \an, e8, \mn, ta, ma +.endif +.endm + +.macro copy_avg len +func ff_avg\len\()_rvv, zve32x + csrwi vxrm, 0 + vsetvlstatic8 \len t0 64 +1: + addi a4, a4, -1 + vle8.v v8, (a2) + vle8.v v16, (a0) + vaaddu.vv v8, v8, v16 + vse8.v v8, (a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + ret +endfunc +.endm + +.irp len 64, 32, 16, 8, 4 + copy_avg \len +.endr diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index 184fadbaf7..1922484a1d 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) } # endif +#if HAVE_RVV + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) { + +#define init_fpel(idx1, sz) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv; \ + 
dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][1][0][0] = ff_avg##sz##_rvv; \ + dsp->mc[idx1][FILTER_BILINEAR ][1][0][0] = ff_avg##sz##_rvv + + init_fpel(0, 64); + init_fpel(1, 32); + init_fpel(2, 16); + init_fpel(3, 8); + init_fpel(4, 4); + +#undef init_fpel + } +#endif #endif } -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg uk7b @ 2024-05-17 15:11 ` Rémi Denis-Courmont 2024-05-17 16:05 ` [FFmpeg-devel] [PATCH 1/5] " uk7b 2024-05-17 16:06 ` [FFmpeg-devel] [PATCH v3 5/9] " flow gg 0 siblings, 2 replies; 25+ messages in thread From: Rémi Denis-Courmont @ 2024-05-17 15:11 UTC (permalink / raw) To: ffmpeg-devel Le maanantaina 13. toukokuuta 2024, 19.59.22 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908: > vp9_avg4_8bpp_c: 1.2 > vp9_avg4_8bpp_rvv_i64: 1.0 > vp9_avg8_8bpp_c: 3.7 > vp9_avg8_8bpp_rvv_i64: 1.5 > vp9_avg16_8bpp_c: 14.7 > vp9_avg16_8bpp_rvv_i64: 3.5 > vp9_avg32_8bpp_c: 57.7 > vp9_avg32_8bpp_rvv_i64: 10.0 > vp9_avg64_8bpp_c: 229.0 > vp9_avg64_8bpp_rvv_i64: 31.7 > --- > libavcodec/riscv/Makefile | 3 +- > libavcodec/riscv/vp9_mc_rvv.S | 58 ++++++++++++++++++++++++++++++++++ > libavcodec/riscv/vp9dsp_init.c | 18 +++++++++++ > 3 files changed, 78 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/riscv/vp9_mc_rvv.S > > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile > index 0cd900104f..1183357b37 100644 > --- a/libavcodec/riscv/Makefile > +++ b/libavcodec/riscv/Makefile > @@ -64,6 +64,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o > OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o > RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \ > riscv/vp9_mc_rvi.o > -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o > +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \ > + riscv/vp9_mc_rvv.o > OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o > RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o > diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S > new file mode 100644 > index 0000000000..5d917e7b98 > --- /dev/null > +++ b/libavcodec/riscv/vp9_mc_rvv.S > @@ -0,0 +1,58 @@ > +/* > + * Copyright (c) 2024 Institue of Software Chinese 
Academy of Sciences > (ISCAS). + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA + */ > + > +#include "libavutil/riscv/asm.S" > + > +.macro vsetvlstatic8 len an maxlen mn=m4 > +.if \len == 4 > + vsetivli zero, \len, e8, mf4, ta, ma > +.elseif \len == 8 > + vsetivli zero, \len, e8, mf2, ta, ma > +.elseif \len == 16 > + vsetivli zero, \len, e8, m1, ta, ma > +.elseif \len == 32 > + li \an, \len > + vsetvli zero, \an, e8, m2, ta, ma > +.elseif \len == 64 > + li \an, \maxlen > + vsetvli zero, \an, e8, \mn, ta, ma > +.endif > +.endm > + > +.macro copy_avg len > +func ff_avg\len\()_rvv, zve32x > + csrwi vxrm, 0 > + vsetvlstatic8 \len t0 64 > +1: > + addi a4, a4, -1 > + vle8.v v8, (a2) > + vle8.v v16, (a0) > + vaaddu.vv v8, v8, v16 > + vse8.v v8, (a0) > + add a2, a2, a3 > + add a0, a0, a1 > + bnez a4, 1b > + ret Doesn't this get slightly faster by interleaving scalar and vector instructions? 
> +endfunc > +.endm > + > +.irp len 64, 32, 16, 8, 4 > + copy_avg \len > +.endr > diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c > index 184fadbaf7..1922484a1d 100644 > --- a/libavcodec/riscv/vp9dsp_init.c > +++ b/libavcodec/riscv/vp9dsp_init.c > @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext > *dsp, int bpp) } > # endif > > +#if HAVE_RVV > + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) > { + > +#define init_fpel(idx1, sz) \ > + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv; \ > + dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv; \ > + dsp->mc[idx1][FILTER_8TAP_SHARP ][1][0][0] = ff_avg##sz##_rvv; \ > + dsp->mc[idx1][FILTER_BILINEAR ][1][0][0] = ff_avg##sz##_rvv > + > + init_fpel(0, 64); > + init_fpel(1, 32); > + init_fpel(2, 16); > + init_fpel(3, 8); > + init_fpel(4, 4); > + > +#undef init_fpel > + } > +#endif > #endif > } -- 雷米‧德尼-库尔蒙 http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH 1/5] lavc/vp9dsp: R-V V mc avg 2024-05-17 15:11 ` Rémi Denis-Courmont @ 2024-05-17 16:05 ` uk7b 2024-05-17 16:06 ` [FFmpeg-devel] [PATCH v3 5/9] " flow gg 1 sibling, 0 replies; 25+ messages in thread From: uk7b @ 2024-05-17 16:05 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_avg4_8bpp_c: 1.2 vp9_avg4_8bpp_rvv_i64: 1.0 vp9_avg8_8bpp_c: 3.7 vp9_avg8_8bpp_rvv_i64: 1.5 vp9_avg16_8bpp_c: 14.7 vp9_avg16_8bpp_rvv_i64: 3.5 vp9_avg32_8bpp_c: 57.7 vp9_avg32_8bpp_rvv_i64: 10.0 vp9_avg64_8bpp_c: 229.0 vp9_avg64_8bpp_rvv_i64: 31.7 --- libavcodec/riscv/Makefile | 3 +- libavcodec/riscv/vp9_mc_rvv.S | 58 ++++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp_init.c | 18 +++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/vp9_mc_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 27b268ae39..4739d83522 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -65,6 +65,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \ riscv/vp9_mc_rvi.o -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \ + riscv/vp9_mc_rvv.o OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S new file mode 100644 index 0000000000..9ee7f04dd1 --- /dev/null +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +.macro vsetvlstatic8 len an maxlen mn=m4 +.if \len == 4 + vsetivli zero, \len, e8, mf4, ta, ma +.elseif \len == 8 + vsetivli zero, \len, e8, mf2, ta, ma +.elseif \len == 16 + vsetivli zero, \len, e8, m1, ta, ma +.elseif \len == 32 + li \an, \len + vsetvli zero, \an, e8, m2, ta, ma +.elseif \len == 64 + li \an, \maxlen + vsetvli zero, \an, e8, \mn, ta, ma +.endif +.endm + +.macro copy_avg len +func ff_avg\len\()_rvv, zve32x + csrwi vxrm, 0 + vsetvlstatic8 \len t0 64 +1: + vle8.v v8, (a2) + vle8.v v16, (a0) + vaaddu.vv v8, v8, v16 + addi a4, a4, -1 + vse8.v v8, (a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + ret +endfunc +.endm + +.irp len 64, 32, 16, 8, 4 + copy_avg \len +.endr diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index ab99294d44..6bfe23563a 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) } # endif +#if HAVE_RVV + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) { + +#define init_fpel(idx1, sz) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = 
ff_avg##sz##_rvv; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][1][0][0] = ff_avg##sz##_rvv; \ + dsp->mc[idx1][FILTER_BILINEAR ][1][0][0] = ff_avg##sz##_rvv + + init_fpel(0, 64); + init_fpel(1, 32); + init_fpel(2, 16); + init_fpel(3, 8); + init_fpel(4, 4); + +#undef init_fpel + } +#endif #endif } -- 2.45.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg 2024-05-17 15:11 ` Rémi Denis-Courmont 2024-05-17 16:05 ` [FFmpeg-devel] [PATCH 1/5] " uk7b @ 2024-05-17 16:06 ` flow gg 1 sibling, 0 replies; 25+ messages in thread From: flow gg @ 2024-05-17 16:06 UTC (permalink / raw) To: FFmpeg development discussions and patches yeah, updated it in the reply Rémi Denis-Courmont <remi@remlab.net> 于2024年5月17日周五 23:11写道: > Le maanantaina 13. toukokuuta 2024, 19.59.22 EEST uk7b@foxmail.com a > écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908: > > vp9_avg4_8bpp_c: 1.2 > > vp9_avg4_8bpp_rvv_i64: 1.0 > > vp9_avg8_8bpp_c: 3.7 > > vp9_avg8_8bpp_rvv_i64: 1.5 > > vp9_avg16_8bpp_c: 14.7 > > vp9_avg16_8bpp_rvv_i64: 3.5 > > vp9_avg32_8bpp_c: 57.7 > > vp9_avg32_8bpp_rvv_i64: 10.0 > > vp9_avg64_8bpp_c: 229.0 > > vp9_avg64_8bpp_rvv_i64: 31.7 > > --- > > libavcodec/riscv/Makefile | 3 +- > > libavcodec/riscv/vp9_mc_rvv.S | 58 ++++++++++++++++++++++++++++++++++ > > libavcodec/riscv/vp9dsp_init.c | 18 +++++++++++ > > 3 files changed, 78 insertions(+), 1 deletion(-) > > create mode 100644 libavcodec/riscv/vp9_mc_rvv.S > > > > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile > > index 0cd900104f..1183357b37 100644 > > --- a/libavcodec/riscv/Makefile > > +++ b/libavcodec/riscv/Makefile > > @@ -64,6 +64,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o > > OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o > > RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \ > > riscv/vp9_mc_rvi.o > > -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o > > +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \ > > + riscv/vp9_mc_rvv.o > > OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o > > RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o > > diff --git a/libavcodec/riscv/vp9_mc_rvv.S > b/libavcodec/riscv/vp9_mc_rvv.S > > new file mode 100644 > > index 0000000000..5d917e7b98 > > --- /dev/null > > +++ b/libavcodec/riscv/vp9_mc_rvv.S > > @@ 
-0,0 +1,58 @@ > > +/* > > + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS). > > + * > > + * This file is part of FFmpeg. > > + * > > + * FFmpeg is free software; you can redistribute it and/or > > + * modify it under the terms of the GNU Lesser General Public > > + * License as published by the Free Software Foundation; either > > + * version 2.1 of the License, or (at your option) any later version. > > + * > > + * FFmpeg is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + * Lesser General Public License for more details. > > + * > > + * You should have received a copy of the GNU Lesser General Public > > + * License along with FFmpeg; if not, write to the Free Software > > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > > + */ > > + > > +#include "libavutil/riscv/asm.S" > > + > > +.macro vsetvlstatic8 len an maxlen mn=m4 > > +.if \len == 4 > > + vsetivli zero, \len, e8, mf4, ta, ma > > +.elseif \len == 8 > > + vsetivli zero, \len, e8, mf2, ta, ma > > +.elseif \len == 16 > > + vsetivli zero, \len, e8, m1, ta, ma > > +.elseif \len == 32 > > + li \an, \len > > + vsetvli zero, \an, e8, m2, ta, ma > > +.elseif \len == 64 > > + li \an, \maxlen > > + vsetvli zero, \an, e8, \mn, ta, ma > > +.endif > > +.endm > > + > > +.macro copy_avg len > > +func ff_avg\len\()_rvv, zve32x > > + csrwi vxrm, 0 > > + vsetvlstatic8 \len t0 64 > > +1: > > + addi a4, a4, -1 > > + vle8.v v8, (a2) > > + vle8.v v16, (a0) > > + vaaddu.vv v8, v8, v16 > > + vse8.v v8, (a0) > > + add a2, a2, a3 > > + add a0, a0, a1 > > + bnez a4, 1b > > + ret > > Doesn't this get slightly faster by interleaving scalar and vector > instructions? 
> > > +endfunc > > +.endm > > + > > +.irp len 64, 32, 16, 8, 4 > > + copy_avg \len > > +.endr > > diff --git a/libavcodec/riscv/vp9dsp_init.c > b/libavcodec/riscv/vp9dsp_init.c > > index 184fadbaf7..1922484a1d 100644 > > --- a/libavcodec/riscv/vp9dsp_init.c > > +++ b/libavcodec/riscv/vp9dsp_init.c > > @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext > > *dsp, int bpp) } > > # endif > > > > +#if HAVE_RVV > > + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && > ff_rv_vlen_least(128)) > > { + > > +#define init_fpel(idx1, sz) \ > > + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv; \ > > + dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv; \ > > + dsp->mc[idx1][FILTER_8TAP_SHARP ][1][0][0] = ff_avg##sz##_rvv; \ > > + dsp->mc[idx1][FILTER_BILINEAR ][1][0][0] = ff_avg##sz##_rvv > > + > > + init_fpel(0, 64); > > + init_fpel(1, 32); > > + init_fpel(2, 16); > > + init_fpel(3, 8); > > + init_fpel(4, 4); > > + > > +#undef init_fpel > > + } > > +#endif > > #endif > > } > > > -- > 雷米‧德尼-库尔蒙 > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v [not found] <20240513165926.1467967-1-uk7b@foxmail.com> ` (3 preceding siblings ...) 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg uk7b @ 2024-05-13 16:59 ` uk7b 2024-05-18 15:56 ` Rémi Denis-Courmont 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 7/9] lavc/vp9dsp: R-V V mc tap " uk7b ` (2 subsequent siblings) 7 siblings, 1 reply; 25+ messages in thread From: uk7b @ 2024-05-13 16:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_avg_bilin_4h_8bpp_c: 5.2 vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2 vp9_avg_bilin_4v_8bpp_c: 5.5 vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2 vp9_avg_bilin_8h_8bpp_c: 20.0 vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5 vp9_avg_bilin_8v_8bpp_c: 21.0 vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2 vp9_avg_bilin_16h_8bpp_c: 78.2 vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0 vp9_avg_bilin_16v_8bpp_c: 82.0 vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0 vp9_avg_bilin_32h_8bpp_c: 325.5 vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2 vp9_avg_bilin_32v_8bpp_c: 326.2 vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2 vp9_avg_bilin_64h_8bpp_c: 1265.7 vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5 vp9_avg_bilin_64v_8bpp_c: 1317.0 vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2 vp9_put_bilin_4h_8bpp_c: 4.5 vp9_put_bilin_4h_8bpp_rvv_i64: 1.7 vp9_put_bilin_4v_8bpp_c: 4.7 vp9_put_bilin_4v_8bpp_rvv_i64: 1.7 vp9_put_bilin_8h_8bpp_c: 17.0 vp9_put_bilin_8h_8bpp_rvv_i64: 3.5 vp9_put_bilin_8v_8bpp_c: 18.0 vp9_put_bilin_8v_8bpp_rvv_i64: 3.5 vp9_put_bilin_16h_8bpp_c: 65.2 vp9_put_bilin_16h_8bpp_rvv_i64: 7.5 vp9_put_bilin_16v_8bpp_c: 85.7 vp9_put_bilin_16v_8bpp_rvv_i64: 7.5 vp9_put_bilin_32h_8bpp_c: 257.5 vp9_put_bilin_32h_8bpp_rvv_i64: 23.5 vp9_put_bilin_32v_8bpp_c: 274.5 vp9_put_bilin_32v_8bpp_rvv_i64: 23.5 vp9_put_bilin_64h_8bpp_c: 1040.5 vp9_put_bilin_64h_8bpp_rvv_i64: 82.5 vp9_put_bilin_64v_8bpp_c: 1108.7 vp9_put_bilin_64v_8bpp_rvv_i64: 82.2 --- libavcodec/riscv/vp9_mc_rvv.S | 43 ++++++++++++++++++++++++++++++++++ 
libavcodec/riscv/vp9dsp_init.c | 21 +++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index 5d917e7b98..986cc3760d 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -53,6 +53,49 @@ func ff_avg\len\()_rvv, zve32x endfunc .endm +.macro bilin_load dst len op type mn +.ifc \type,v + add t5, a2, a3 +.elseif \type == h + addi t5, a2, 1 +.endif + vle8.v v8, (a2) + vle8.v v0, (t5) + vwmulu.vx v16, v0, \mn + vwmaccsu.vx v16, t1, v8 + vwadd.wx v16, v16, t4 + vnsra.wi v16, v16, 4 + vadd.vv \dst, v16, v8 +.ifc \op,avg + vle8.v v16, (a0) + vaaddu.vv \dst, \dst, v16 +.endif +.endm + +.macro bilin_h_v len op type mn +func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x +.ifc \op,avg + csrwi vxrm, 0 +.endif + vsetvlstatic8 \len t0 64 + li t4, 8 + neg t1, \mn +1: + addi a4, a4, -1 + bilin_load v0, \len, \op, \type, \mn + vse8.v v0, (a0) + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc +.endm + .irp len 64, 32, 16, 8, 4 copy_avg \len + .irp op put avg + bilin_h_v \len \op h a5 + bilin_h_v \len \op v a6 + .endr .endr diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index 1922484a1d..ec6db51774 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -63,6 +63,27 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) init_fpel(3, 8); init_fpel(4, 4); + dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_64v_rvv; + dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_64h_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_64v_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_64h_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_32v_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_32h_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_32v_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_32h_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][0][1] = 
ff_put_bilin_16v_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_16h_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_16v_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_16h_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_8v_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_8h_rvv; + dsp->mc[3][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_8v_rvv; + dsp->mc[3][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_8h_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_4v_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_4h_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_4v_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_4h_rvv; + #undef init_fpel } #endif -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b @ 2024-05-18 15:56 ` Rémi Denis-Courmont 2024-05-18 18:18 ` flow gg 0 siblings, 1 reply; 25+ messages in thread From: Rémi Denis-Courmont @ 2024-05-18 15:56 UTC (permalink / raw) To: ffmpeg-devel Le maanantaina 13. toukokuuta 2024, 19.59.23 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908: > vp9_avg_bilin_4h_8bpp_c: 5.2 > vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2 > vp9_avg_bilin_4v_8bpp_c: 5.5 > vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2 > vp9_avg_bilin_8h_8bpp_c: 20.0 > vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5 > vp9_avg_bilin_8v_8bpp_c: 21.0 > vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2 > vp9_avg_bilin_16h_8bpp_c: 78.2 > vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0 > vp9_avg_bilin_16v_8bpp_c: 82.0 > vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0 > vp9_avg_bilin_32h_8bpp_c: 325.5 > vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2 > vp9_avg_bilin_32v_8bpp_c: 326.2 > vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2 > vp9_avg_bilin_64h_8bpp_c: 1265.7 > vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5 > vp9_avg_bilin_64v_8bpp_c: 1317.0 > vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2 > vp9_put_bilin_4h_8bpp_c: 4.5 > vp9_put_bilin_4h_8bpp_rvv_i64: 1.7 > vp9_put_bilin_4v_8bpp_c: 4.7 > vp9_put_bilin_4v_8bpp_rvv_i64: 1.7 > vp9_put_bilin_8h_8bpp_c: 17.0 > vp9_put_bilin_8h_8bpp_rvv_i64: 3.5 > vp9_put_bilin_8v_8bpp_c: 18.0 > vp9_put_bilin_8v_8bpp_rvv_i64: 3.5 > vp9_put_bilin_16h_8bpp_c: 65.2 > vp9_put_bilin_16h_8bpp_rvv_i64: 7.5 > vp9_put_bilin_16v_8bpp_c: 85.7 > vp9_put_bilin_16v_8bpp_rvv_i64: 7.5 > vp9_put_bilin_32h_8bpp_c: 257.5 > vp9_put_bilin_32h_8bpp_rvv_i64: 23.5 > vp9_put_bilin_32v_8bpp_c: 274.5 > vp9_put_bilin_32v_8bpp_rvv_i64: 23.5 > vp9_put_bilin_64h_8bpp_c: 1040.5 > vp9_put_bilin_64h_8bpp_rvv_i64: 82.5 > vp9_put_bilin_64v_8bpp_c: 1108.7 > vp9_put_bilin_64v_8bpp_rvv_i64: 82.2 > --- > libavcodec/riscv/vp9_mc_rvv.S | 43 ++++++++++++++++++++++++++++++++++ > 
libavcodec/riscv/vp9dsp_init.c | 21 +++++++++++++++++ > 2 files changed, 64 insertions(+) > > diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S > index 5d917e7b98..986cc3760d 100644 > --- a/libavcodec/riscv/vp9_mc_rvv.S > +++ b/libavcodec/riscv/vp9_mc_rvv.S > @@ -53,6 +53,49 @@ func ff_avg\len\()_rvv, zve32x > endfunc > .endm > > +.macro bilin_load dst len op type mn > +.ifc \type,v > + add t5, a2, a3 > +.elseif \type == h > + addi t5, a2, 1 > +.endif > + vle8.v v8, (a2) > + vle8.v v0, (t5) > + vwmulu.vx v16, v0, \mn > + vwmaccsu.vx v16, t1, v8 > + vwadd.wx v16, v16, t4 > + vnsra.wi v16, v16, 4 > + vadd.vv \dst, v16, v8 > +.ifc \op,avg > + vle8.v v16, (a0) > + vaaddu.vv \dst, \dst, v16 > +.endif > +.endm > + > +.macro bilin_h_v len op type mn > +func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x > +.ifc \op,avg > + csrwi vxrm, 0 > +.endif > + vsetvlstatic8 \len t0 64 > + li t4, 8 > + neg t1, \mn > +1: > + addi a4, a4, -1 > + bilin_load v0, \len, \op, \type, \mn > + vse8.v v0, (a0) > + add a2, a2, a3 > + add a0, a0, a1 > + bnez a4, 1b > + > + ret > +endfunc > +.endm > + > .irp len 64, 32, 16, 8, 4 Missing comma after len > copy_avg \len > + .irp op put avg > + bilin_h_v \len \op h a5 > + bilin_h_v \len \op v a6 > + .endr > .endr > diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c > index 1922484a1d..ec6db51774 100644 > --- a/libavcodec/riscv/vp9dsp_init.c > +++ b/libavcodec/riscv/vp9dsp_init.c > @@ -63,6 +63,27 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext > *dsp, int bpp) init_fpel(3, 8); > init_fpel(4, 4); > > + dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_64v_rvv; > + dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_64h_rvv; > + dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_64v_rvv; > + dsp->mc[0][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_64h_rvv; > + dsp->mc[1][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_32v_rvv; > + dsp->mc[1][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_32h_rvv; > + 
dsp->mc[1][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_32v_rvv; > + dsp->mc[1][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_32h_rvv; > + dsp->mc[2][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_16v_rvv; > + dsp->mc[2][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_16h_rvv; > + dsp->mc[2][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_16v_rvv; > + dsp->mc[2][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_16h_rvv; > + dsp->mc[3][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_8v_rvv; > + dsp->mc[3][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_8h_rvv; > + dsp->mc[3][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_8v_rvv; > + dsp->mc[3][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_8h_rvv; > + dsp->mc[4][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_4v_rvv; > + dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_4h_rvv; > + dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_4v_rvv; > + dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_4h_rvv; > + > #undef init_fpel > } > #endif -- レミ・デニ-クールモン http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v 2024-05-18 15:56 ` Rémi Denis-Courmont @ 2024-05-18 18:18 ` flow gg 0 siblings, 0 replies; 25+ messages in thread From: flow gg @ 2024-05-18 18:18 UTC (permalink / raw) To: FFmpeg development discussions and patches fixed in v4 Rémi Denis-Courmont <remi@remlab.net> 于2024年5月18日周六 23:56写道: > Le maanantaina 13. toukokuuta 2024, 19.59.23 EEST uk7b@foxmail.com a > écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908: > > vp9_avg_bilin_4h_8bpp_c: 5.2 > > vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2 > > vp9_avg_bilin_4v_8bpp_c: 5.5 > > vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2 > > vp9_avg_bilin_8h_8bpp_c: 20.0 > > vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5 > > vp9_avg_bilin_8v_8bpp_c: 21.0 > > vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2 > > vp9_avg_bilin_16h_8bpp_c: 78.2 > > vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0 > > vp9_avg_bilin_16v_8bpp_c: 82.0 > > vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0 > > vp9_avg_bilin_32h_8bpp_c: 325.5 > > vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2 > > vp9_avg_bilin_32v_8bpp_c: 326.2 > > vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2 > > vp9_avg_bilin_64h_8bpp_c: 1265.7 > > vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5 > > vp9_avg_bilin_64v_8bpp_c: 1317.0 > > vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2 > > vp9_put_bilin_4h_8bpp_c: 4.5 > > vp9_put_bilin_4h_8bpp_rvv_i64: 1.7 > > vp9_put_bilin_4v_8bpp_c: 4.7 > > vp9_put_bilin_4v_8bpp_rvv_i64: 1.7 > > vp9_put_bilin_8h_8bpp_c: 17.0 > > vp9_put_bilin_8h_8bpp_rvv_i64: 3.5 > > vp9_put_bilin_8v_8bpp_c: 18.0 > > vp9_put_bilin_8v_8bpp_rvv_i64: 3.5 > > vp9_put_bilin_16h_8bpp_c: 65.2 > > vp9_put_bilin_16h_8bpp_rvv_i64: 7.5 > > vp9_put_bilin_16v_8bpp_c: 85.7 > > vp9_put_bilin_16v_8bpp_rvv_i64: 7.5 > > vp9_put_bilin_32h_8bpp_c: 257.5 > > vp9_put_bilin_32h_8bpp_rvv_i64: 23.5 > > vp9_put_bilin_32v_8bpp_c: 274.5 > > vp9_put_bilin_32v_8bpp_rvv_i64: 23.5 > > vp9_put_bilin_64h_8bpp_c: 1040.5 > > vp9_put_bilin_64h_8bpp_rvv_i64: 82.5 > > vp9_put_bilin_64v_8bpp_c: 1108.7 > > vp9_put_bilin_64v_8bpp_rvv_i64: 82.2 > > 
--- > > libavcodec/riscv/vp9_mc_rvv.S | 43 ++++++++++++++++++++++++++++++++++ > > libavcodec/riscv/vp9dsp_init.c | 21 +++++++++++++++++ > > 2 files changed, 64 insertions(+) > > > > diff --git a/libavcodec/riscv/vp9_mc_rvv.S > b/libavcodec/riscv/vp9_mc_rvv.S > > index 5d917e7b98..986cc3760d 100644 > > --- a/libavcodec/riscv/vp9_mc_rvv.S > > +++ b/libavcodec/riscv/vp9_mc_rvv.S > > @@ -53,6 +53,49 @@ func ff_avg\len\()_rvv, zve32x > > endfunc > > .endm > > > > +.macro bilin_load dst len op type mn > > +.ifc \type,v > > + add t5, a2, a3 > > +.elseif \type == h > > + addi t5, a2, 1 > > +.endif > > + vle8.v v8, (a2) > > + vle8.v v0, (t5) > > + vwmulu.vx v16, v0, \mn > > + vwmaccsu.vx v16, t1, v8 > > + vwadd.wx v16, v16, t4 > > + vnsra.wi v16, v16, 4 > > + vadd.vv \dst, v16, v8 > > +.ifc \op,avg > > + vle8.v v16, (a0) > > + vaaddu.vv \dst, \dst, v16 > > +.endif > > +.endm > > + > > +.macro bilin_h_v len op type mn > > +func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x > > +.ifc \op,avg > > + csrwi vxrm, 0 > > +.endif > > + vsetvlstatic8 \len t0 64 > > + li t4, 8 > > + neg t1, \mn > > +1: > > + addi a4, a4, -1 > > + bilin_load v0, \len, \op, \type, \mn > > + vse8.v v0, (a0) > > + add a2, a2, a3 > > + add a0, a0, a1 > > + bnez a4, 1b > > + > > + ret > > +endfunc > > +.endm > > + > > .irp len 64, 32, 16, 8, 4 > > Missing comma after len > > > copy_avg \len > > + .irp op put avg > > + bilin_h_v \len \op h a5 > > + bilin_h_v \len \op v a6 > > + .endr > > .endr > > diff --git a/libavcodec/riscv/vp9dsp_init.c > b/libavcodec/riscv/vp9dsp_init.c > > index 1922484a1d..ec6db51774 100644 > > --- a/libavcodec/riscv/vp9dsp_init.c > > +++ b/libavcodec/riscv/vp9dsp_init.c > > @@ -63,6 +63,27 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext > > *dsp, int bpp) init_fpel(3, 8); > > init_fpel(4, 4); > > > > + dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_64v_rvv; > > + dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_64h_rvv; > > + dsp->mc[0][FILTER_BILINEAR ][1][0][1] = 
ff_avg_bilin_64v_rvv; > > + dsp->mc[0][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_64h_rvv; > > + dsp->mc[1][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_32v_rvv; > > + dsp->mc[1][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_32h_rvv; > > + dsp->mc[1][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_32v_rvv; > > + dsp->mc[1][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_32h_rvv; > > + dsp->mc[2][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_16v_rvv; > > + dsp->mc[2][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_16h_rvv; > > + dsp->mc[2][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_16v_rvv; > > + dsp->mc[2][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_16h_rvv; > > + dsp->mc[3][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_8v_rvv; > > + dsp->mc[3][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_8h_rvv; > > + dsp->mc[3][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_8v_rvv; > > + dsp->mc[3][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_8h_rvv; > > + dsp->mc[4][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_4v_rvv; > > + dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_4h_rvv; > > + dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_4v_rvv; > > + dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_4h_rvv; > > + > > #undef init_fpel > > } > > #endif > > > -- > レミ・デニ-クールモン > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH v3 7/9] lavc/vp9dsp: R-V V mc tap h v [not found] <20240513165926.1467967-1-uk7b@foxmail.com> ` (4 preceding siblings ...) 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b @ 2024-05-13 16:59 ` uk7b 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b 7 siblings, 0 replies; 25+ messages in thread From: uk7b @ 2024-05-13 16:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2 vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2 vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5 vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2 vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2 vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5 vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0 vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5 vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5 vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7 vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7 vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0 vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7 vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0 vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2 vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0 vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2 vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2 vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0 vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0 vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7 vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0 vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7 vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0 vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5 vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7 vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7 vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7 vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2 vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5 
vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7 vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5 vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7 vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2 vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2 vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2 vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0 vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2 vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7 vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0 --- libavcodec/riscv/vp9_mc_rvv.S | 243 +++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 72 ++++++---- libavcodec/riscv/vp9dsp_init.c | 40 +++++- 3 files changed, 329 insertions(+), 26 deletions(-) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index 986cc3760d..c633809675 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -36,6 +36,18 @@ .endif .endm +.macro vsetvlstatic16 len +.ifc \len,4 + vsetvli zero, zero, e16, mf2, ta, ma +.elseif \len == 8 + vsetvli zero, zero, e16, m1, ta, ma +.elseif \len == 16 + vsetvli zero, zero, e16, m2, ta, ma +.else + vsetvli zero, zero, e16, m4, ta, ma +.endif +.endm + .macro copy_avg len func ff_avg\len\()_rvv, zve32x csrwi vxrm, 0 @@ -92,10 +104,241 @@ func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x endfunc .endm +const subpel_filters_regular + .byte 0, 0, 0, 128, 0, 0, 0, 0 + .byte 0, 1, -5, 126, 8, -3, 1, 0 + .byte -1, 3, -10, 122, 18, -6, 2, 0 + .byte -1, 4, -13, 118, 27, -9, 3, -1 + .byte -1, 4, -16, 112, 37, -11, 4, -1 + .byte -1, 5, -18, 105, 48, -14, 4, -1 + .byte -1, 5, -19, 97, 58, -16, 5, -1 + .byte -1, 6, -19, 88, 68, -18, 5, -1 + .byte -1, 6, -19, 78, 78, -19, 6, -1 + .byte -1, 5, -18, 68, 88, -19, 6, -1 + .byte -1, 5, -16, 58, 97, -19, 5, -1 + .byte -1, 4, -14, 48, 105, -18, 5, -1 + .byte -1, 4, -11, 37, 112, -16, 4, -1 + .byte -1, 3, -9, 27, 118, -13, 4, -1 + .byte 0, 2, -6, 18, 122, -10, 3, -1 + .byte 0, 1, -3, 8, 126, -5, 1, 0 +subpel_filters_sharp: + .byte 0, 0, 0, 128, 0, 
0, 0, 0 + .byte -1, 3, -7, 127, 8, -3, 1, 0 + .byte -2, 5, -13, 125, 17, -6, 3, -1 + .byte -3, 7, -17, 121, 27, -10, 5, -2 + .byte -4, 9, -20, 115, 37, -13, 6, -2 + .byte -4, 10, -23, 108, 48, -16, 8, -3 + .byte -4, 10, -24, 100, 59, -19, 9, -3 + .byte -4, 11, -24, 90, 70, -21, 10, -4 + .byte -4, 11, -23, 80, 80, -23, 11, -4 + .byte -4, 10, -21, 70, 90, -24, 11, -4 + .byte -3, 9, -19, 59, 100, -24, 10, -4 + .byte -3, 8, -16, 48, 108, -23, 10, -4 + .byte -2, 6, -13, 37, 115, -20, 9, -4 + .byte -2, 5, -10, 27, 121, -17, 7, -3 + .byte -1, 3, -6, 17, 125, -13, 5, -2 + .byte 0, 1, -3, 8, 127, -7, 3, -1 +subpel_filters_smooth: + .byte 0, 0, 0, 128, 0, 0, 0, 0 + .byte -3, -1, 32, 64, 38, 1, -3, 0 + .byte -2, -2, 29, 63, 41, 2, -3, 0 + .byte -2, -2, 26, 63, 43, 4, -4, 0 + .byte -2, -3, 24, 62, 46, 5, -4, 0 + .byte -2, -3, 21, 60, 49, 7, -4, 0 + .byte -1, -4, 18, 59, 51, 9, -4, 0 + .byte -1, -4, 16, 57, 53, 12, -4, -1 + .byte -1, -4, 14, 55, 55, 14, -4, -1 + .byte -1, -4, 12, 53, 57, 16, -4, -1 + .byte 0, -4, 9, 51, 59, 18, -4, -1 + .byte 0, -4, 7, 49, 60, 21, -3, -2 + .byte 0, -4, 5, 46, 62, 24, -3, -2 + .byte 0, -4, 4, 43, 63, 26, -2, -2 + .byte 0, -3, 2, 41, 63, 29, -2, -2 + .byte 0, -3, 1, 38, 64, 32, -1, -3 +endconst + +.macro epel_filter name type regtype + lla \regtype\()2, subpel_filters_\name + li \regtype\()1, 8 +.ifc \type,v + mul \regtype\()0, a6, \regtype\()1 +.elseif \type == h + mul \regtype\()0, a5, \regtype\()1 +.endif + add \regtype\()0, \regtype\()0, \regtype\()2 + .irp n 1,2,3,4,5,6 + lb \regtype\n, \n(\regtype\()0) + .endr +.ifc \regtype,t + lb a7, 7(\regtype\()0) +.elseif \regtype == s + lb s7, 7(\regtype\()0) +.endif + lb \regtype\()0, 0(\regtype\()0) +.endm + +.macro epel_load dst len op name type from_mem regtype + li a5, 64 +.ifc \from_mem, 1 + vle8.v v22, (a2) +.ifc \type,v + sub a2, a2, a3 + vle8.v v20, (a2) + sh1add a2, a3, a2 + vle8.v v24, (a2) + add a2, a2, a3 + vle8.v v26, (a2) + add a2, a2, a3 + vle8.v v28, (a2) + add a2, a2, a3 + vle8.v 
v30, (a2) +.elseif \type == h + addi a2, a2, -1 + vle8.v v20, (a2) + addi a2, a2, 2 + vle8.v v24, (a2) + addi a2, a2, 1 + vle8.v v26, (a2) + addi a2, a2, 1 + vle8.v v28, (a2) + addi a2, a2, 1 + vle8.v v30, (a2) +.endif + +.ifc \name,smooth + vwmulu.vx v16, v24, \regtype\()4 + vwmaccu.vx v16, \regtype\()2, v20 + vwmaccu.vx v16, \regtype\()5, v26 + vwmaccsu.vx v16, \regtype\()6, v28 +.else + vwmulu.vx v16, v28, \regtype\()6 + vwmaccsu.vx v16, \regtype\()2, v20 + vwmaccsu.vx v16, \regtype\()5, v26 +.endif + +.ifc \regtype,t + vwmaccsu.vx v16, a7, v30 +.elseif \regtype == s + vwmaccsu.vx v16, s7, v30 +.endif + +.ifc \type,v + .rept 6 + sub a2, a2, a3 + .endr + vle8.v v28, (a2) + sub a2, a2, a3 + vle8.v v26, (a2) + sh1add a2, a3, a2 + add a2, a2, a3 +.elseif \type == h + addi a2, a2, -6 + vle8.v v28, (a2) + addi a2, a2, -1 + vle8.v v26, (a2) + addi a2, a2, 3 +.endif + +.ifc \name,smooth + vwmaccsu.vx v16, \regtype\()1, v28 +.else + vwmaccu.vx v16, \regtype\()1, v28 + vwmulu.vx v28, v24, \regtype\()4 +.endif + vwmaccsu.vx v16, \regtype\()0, v26 + vwmulu.vx v20, v22, \regtype\()3 +.else +.ifc \name,smooth + vwmulu.vx v16, v8, \regtype\()4 + vwmaccu.vx v16, \regtype\()2, v4 + vwmaccu.vx v16, \regtype\()5, v10 + vwmaccsu.vx v16, \regtype\()6, v12 + vwmaccsu.vx v16, \regtype\()1, v2 +.else + vwmulu.vx v16, v2, \regtype\()1 + vwmaccu.vx v16, \regtype\()6, v12 + vwmaccsu.vx v16, \regtype\()5, v10 + vwmaccsu.vx v16, \regtype\()2, v4 + vwmulu.vx v28, v8, \regtype\()4 +.endif + vwmaccsu.vx v16, \regtype\()0, v0 + vwmulu.vx v20, v6, \regtype\()3 + +.ifc \regtype,t + vwmaccsu.vx v16, a7, v14 +.elseif \regtype == s + vwmaccsu.vx v16, s7, v14 +.endif + +.endif + vwadd.wx v16, v16, a5 + vsetvlstatic16 \len + +.ifc \name,smooth + vwadd.vv v24, v16, v20 +.else + vwadd.vv v24, v16, v28 + vwadd.wv v24, v24, v20 +.endif + vnsra.wi v24, v24, 7 + vmax.vx v24, v24, zero + vsetvlstatic8 \len, zero, 32, m2 + + vnclipu.wi \dst, v24, 0 +.ifc \op,avg + vle8.v v24, (a0) + vaaddu.vv \dst, \dst, v24 
+.endif + +.endm + +.macro epel_load_inc dst len op name type from_mem regtype + epel_load \dst \len \op \name \type \from_mem \regtype + add a2, a2, a3 +.endm + +.macro epel len op name type vlen +func ff_\op\()_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x + epel_filter \name \type t +.if \vlen < 256 + vsetvlstatic8 \len a5 32 m2 +.else + vsetvlstatic8 \len a5 64 m2 +.endif +.ifc \op,avg + csrwi vxrm, 0 +.endif + +1: + addi a4, a4, -1 + epel_load v30 \len \op \name \type 1 t + vse8.v v30, (a0) +.if \len == 64 && \vlen < 256 + addi a0, a0, 32 + addi a2, a2, 32 + epel_load v30 \len \op \name \type 1 t + vse8.v v30, (a0) + addi a0, a0, -32 + addi a2, a2, -32 +.endif + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc +.endm + .irp len 64, 32, 16, 8, 4 copy_avg \len .irp op put avg bilin_h_v \len \op h a5 bilin_h_v \len \op v a6 + .irp name regular sharp smooth + .irp type h v + epel \len \op \name \type 128 + epel \len \op \name \type 256 + .endr + .endr .endr .endr diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index 79330b4968..1638daaae3 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen) \ +void ff_put_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_put_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void 
ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_put_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); @@ -146,23 +152,41 @@ void ff_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR); - -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP); - -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH); 
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128); + +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128); + +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128); + +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256); + +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256); + +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256); VP9_BILINEAR_RISCV_RVV_FUNC(64); VP9_BILINEAR_RISCV_RVV_FUNC(32); diff --git a/libavcodec/riscv/vp9dsp_init.c 
b/libavcodec/riscv/vp9dsp_init.c index ec6db51774..c78d22a7f3 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) # endif #if HAVE_RVV - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) { + if (ff_rv_vlen_least(128)) { #define init_fpel(idx1, sz) \ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv; \ @@ -63,6 +64,26 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) init_fpel(3, 8); init_fpel(4, 4); +#undef init_fpel + +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ + ff_##type##_8tap_smooth_##sz##dir##_rvv##vlen; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ + ff_##type##_8tap_regular_##sz##dir##_rvv##vlen; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ + ff_##type##_8tap_sharp_##sz##dir##_rvv##vlen; + +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \ + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \ + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \ + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \ + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \ + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen) + + init_subpel2(0, 1, 0, h, put, 128); + init_subpel2(1, 1, 0, h, avg, 128); + dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_64v_rvv; dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_64h_rvv; dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_64v_rvv; @@ -84,8 +105,23 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_4h_rvv; -#undef init_fpel + if (flags & AV_CPU_FLAG_RVB_ADDR) { + init_subpel2(0, 0, 1, v, put, 128); + init_subpel2(1, 0, 1, v, avg, 
128); } + + } + if (ff_rv_vlen_least(256)) { + init_subpel2(0, 1, 0, h, put, 256); + init_subpel2(1, 1, 0, h, avg, 256); + + if (flags & AV_CPU_FLAG_RVB_ADDR) { + init_subpel2(0, 0, 1, v, put, 256); + init_subpel2(1, 0, 1, v, avg, 256); + } + } + } + #endif #endif } -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH v3 8/9] lavc/vp9dsp: R-V V mc bilin hv [not found] <20240513165926.1467967-1-uk7b@foxmail.com> ` (5 preceding siblings ...) 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 7/9] lavc/vp9dsp: R-V V mc tap " uk7b @ 2024-05-13 16:59 ` uk7b 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b 7 siblings, 0 replies; 25+ messages in thread From: uk7b @ 2024-05-13 16:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_avg_bilin_4hv_8bpp_c: 11.0 vp9_avg_bilin_4hv_8bpp_rvv_i64: 3.7 vp9_avg_bilin_8hv_8bpp_c: 38.7 vp9_avg_bilin_8hv_8bpp_rvv_i64: 7.2 vp9_avg_bilin_16hv_8bpp_c: 147.0 vp9_avg_bilin_16hv_8bpp_rvv_i64: 14.2 vp9_avg_bilin_32hv_8bpp_c: 574.5 vp9_avg_bilin_32hv_8bpp_rvv_i64: 42.7 vp9_avg_bilin_64hv_8bpp_c: 2311.5 vp9_avg_bilin_64hv_8bpp_rvv_i64: 201.7 vp9_put_bilin_4hv_8bpp_c: 10.0 vp9_put_bilin_4hv_8bpp_rvv_i64: 3.2 vp9_put_bilin_8hv_8bpp_c: 35.2 vp9_put_bilin_8hv_8bpp_rvv_i64: 6.5 vp9_put_bilin_16hv_8bpp_c: 133.7 vp9_put_bilin_16hv_8bpp_rvv_i64: 13.0 vp9_put_bilin_32hv_8bpp_c: 538.2 vp9_put_bilin_32hv_8bpp_rvv_i64: 39.7 vp9_put_bilin_64hv_8bpp_c: 2114.0 vp9_put_bilin_64hv_8bpp_rvv_i64: 153.7 --- libavcodec/riscv/vp9_mc_rvv.S | 34 ++++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp_init.c | 10 ++++++++++ 2 files changed, 44 insertions(+) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index c633809675..22ae194367 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -104,6 +104,39 @@ func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x endfunc .endm +.macro bilin_hv len op +func ff_\op\()_bilin_\len\()hv_rvv, zve32x +.ifc \op,avg + csrwi vxrm, 0 +.endif + vsetvlstatic8 \len t0 64 + neg t1, a5 + neg t2, a6 + li t4, 8 + bilin_load v24, \len, put, h, a5 + add a2, a2, a3 +1: + addi a4, a4, -1 + bilin_load v4, \len, put, h, a5 + vwmulu.vx v16, v4, a6 + vwmaccsu.vx v16, t2, v24 + vwadd.wx v16, v16, t4 + vnsra.wi 
v16, v16, 4 + vadd.vv v0, v16, v24 +.ifc \op,avg + vle8.v v16, (a0) + vaaddu.vv v0, v0, v16 +.endif + vse8.v v0, (a0) + vmv.v.v v24, v4 + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc +.endm + const subpel_filters_regular .byte 0, 0, 0, 128, 0, 0, 0, 0 .byte 0, 1, -5, 126, 8, -3, 1, 0 @@ -334,6 +367,7 @@ endfunc .irp op put avg bilin_h_v \len \op h a5 bilin_h_v \len \op v a6 + bilin_hv \len \op .irp name regular sharp smooth .irp type h v epel \len \op \name \type 128 diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index c78d22a7f3..f3e9302a73 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -104,6 +104,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_4h_rvv; dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_4h_rvv; + dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_bilin_64hv_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_bilin_64hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_bilin_32hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_bilin_32hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_bilin_16hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_bilin_16hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_bilin_8hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_bilin_8hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_bilin_4hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_bilin_4hv_rvv; if (flags & AV_CPU_FLAG_RVB_ADDR) { init_subpel2(0, 0, 1, v, put, 128); -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
* [FFmpeg-devel] [PATCH v3 9/9] lavc/vp9dsp: R-V V mc tap hv [not found] <20240513165926.1467967-1-uk7b@foxmail.com> ` (6 preceding siblings ...) 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b @ 2024-05-13 16:59 ` uk7b 7 siblings, 0 replies; 25+ messages in thread From: uk7b @ 2024-05-13 16:59 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 vp9_avg_8tap_smooth_4hv_8bpp_c : 32.0 28.2 vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32 : 15.0 13.2 vp9_avg_8tap_smooth_8hv_8bpp_c : 98.0 86.2 vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32 : 23.7 21.0 vp9_avg_8tap_smooth_16hv_8bpp_c : 355.5 297.0 vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32 : 62.7 41.2 vp9_avg_8tap_smooth_32hv_8bpp_c : 1273.0 1099.7 vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32 : 133.7 119.2 vp9_avg_8tap_smooth_64hv_8bpp_c : 4933.0 4240.5 vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32 : 506.7 227.0 vp9_put_8tap_smooth_4hv_8bpp_c : 30.2 27.0 vp9_put_8tap_smooth_4hv_8bpp_rvv_i32 : 14.5 12.7 vp9_put_8tap_smooth_8hv_8bpp_c : 91.2 81.2 vp9_put_8tap_smooth_8hv_8bpp_rvv_i32 : 22.7 20.2 vp9_put_8tap_smooth_16hv_8bpp_c : 329.2 277.7 vp9_put_8tap_smooth_16hv_8bpp_rvv_i32 : 44.7 40.0 vp9_put_8tap_smooth_32hv_8bpp_c : 1183.7 1022.7 vp9_put_8tap_smooth_32hv_8bpp_rvv_i32 : 130.7 116.5 vp9_put_8tap_smooth_64hv_8bpp_c : 4502.7 3954.5 vp9_put_8tap_smooth_64hv_8bpp_rvv_i32 : 496.0 224.7 --- libavcodec/riscv/vp9_mc_rvv.S | 75 ++++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp_init.c | 8 ++++ 2 files changed, 83 insertions(+) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index 22ae194367..958460d165 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -362,6 +362,77 @@ func ff_\op\()_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x endfunc .endm +#if __riscv_xlen == 64 +.macro epel_hv_once len name op + sub a2, a2, a3 + sub a2, a2, a3 + sub a2, a2, a3 + .irp n 0 2 4 6 8 10 12 14 + epel_load_inc v\n \len put \name 
h 1 t + .endr + addi a4, a4, -1 +1: + addi a4, a4, -1 + epel_load v30 \len \op \name v 0 s + vse8.v v30, (a0) + vmv.v.v v0, v2 + vmv.v.v v2, v4 + vmv.v.v v4, v6 + vmv.v.v v6, v8 + vmv.v.v v8, v10 + vmv.v.v v10, v12 + vmv.v.v v12, v14 + epel_load v14 \len put \name h 1 t + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + epel_load v30 \len \op \name v 0 s + vse8.v v30, (a0) +.endm + +.macro epel_hv op name len vlen +func ff_\op\()_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x + addi sp, sp, -64 + .irp n 0,1,2,3,4,5,6,7 + sd s\n, \n\()<<3(sp) + .endr +.if \len == 64 && \vlen < 256 + addi sp, sp, -48 + .irp n 0,1,2,3,4,5 + sd a\n, \n\()<<3(sp) + .endr +.endif +.ifc \op,avg + csrwi vxrm, 0 +.endif + epel_filter \name h t + epel_filter \name v s +.if \vlen < 256 + vsetvlstatic8 \len a6 32 m2 +.else + vsetvlstatic8 \len a6 64 m2 +.endif + epel_hv_once \len \name \op +.if \len == 64 && \vlen < 256 + .irp n 0,1,2,3,4,5 + ld a\n, \n\()<<3(sp) + .endr + addi sp, sp, 48 + addi a0, a0, 32 + addi a2, a2, 32 + epel_filter \name h t + epel_hv_once \len \name \op +.endif + .irp n 0,1,2,3,4,5,6,7 + ld s\n, \n\()<<3(sp) + .endr + addi sp, sp, 64 + + ret +endfunc +.endm +#endif + .irp len 64, 32, 16, 8, 4 copy_avg \len .irp op put avg @@ -373,6 +444,10 @@ endfunc epel \len \op \name \type 128 epel \len \op \name \type 256 .endr + #if __riscv_xlen == 64 + epel_hv \op \name \len 128 + epel_hv \op \name \len 256 + #endif .endr .endr .endr diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index f3e9302a73..cc5878f414 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -118,6 +118,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) if (flags & AV_CPU_FLAG_RVB_ADDR) { init_subpel2(0, 0, 1, v, put, 128); init_subpel2(1, 0, 1, v, avg, 128); +# if __riscv_xlen == 64 + init_subpel2(0, 1, 1, hv, put, 128); + init_subpel2(1, 1, 1, hv, avg, 128); +# endif } } @@ -128,6 +132,10 @@ static av_cold void 
vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) if (flags & AV_CPU_FLAG_RVB_ADDR) { init_subpel2(0, 0, 1, v, put, 256); init_subpel2(1, 0, 1, v, avg, 256); +# if __riscv_xlen == 64 + init_subpel2(0, 1, 1, hv, put, 256); + init_subpel2(1, 1, 1, hv, avg, 256); +# endif } } } -- 2.45.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
[parent not found: <20240512100331.995415-1-uk7b@foxmail.com>]
* [FFmpeg-devel] [PATCH v3 7/9] lavc/vp9dsp: R-V V mc tap h v [not found] <20240512100331.995415-1-uk7b@foxmail.com> @ 2024-05-12 10:03 ` uk7b 0 siblings, 0 replies; 25+ messages in thread From: uk7b @ 2024-05-12 10:03 UTC (permalink / raw) To: ffmpeg-devel; +Cc: sunyuechi From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2 vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2 vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5 vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2 vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2 vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5 vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0 vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5 vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5 vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7 vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7 vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0 vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7 vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0 vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2 vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0 vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2 vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2 vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0 vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0 vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7 vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0 vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7 vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0 vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5 vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7 vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7 vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7 vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2 vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5 vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7 vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5 vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7 vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2 vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2 vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2 
vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0 vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2 vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7 vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0 --- libavcodec/riscv/vp9_mc_rvv.S | 243 +++++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 72 ++++++---- libavcodec/riscv/vp9dsp_init.c | 40 +++++- 3 files changed, 329 insertions(+), 26 deletions(-) diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S index 986cc3760d..c633809675 100644 --- a/libavcodec/riscv/vp9_mc_rvv.S +++ b/libavcodec/riscv/vp9_mc_rvv.S @@ -36,6 +36,18 @@ .endif .endm +.macro vsetvlstatic16 len +.ifc \len,4 + vsetvli zero, zero, e16, mf2, ta, ma +.elseif \len == 8 + vsetvli zero, zero, e16, m1, ta, ma +.elseif \len == 16 + vsetvli zero, zero, e16, m2, ta, ma +.else + vsetvli zero, zero, e16, m4, ta, ma +.endif +.endm + .macro copy_avg len func ff_avg\len\()_rvv, zve32x csrwi vxrm, 0 @@ -92,10 +104,241 @@ func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x endfunc .endm +const subpel_filters_regular + .byte 0, 0, 0, 128, 0, 0, 0, 0 + .byte 0, 1, -5, 126, 8, -3, 1, 0 + .byte -1, 3, -10, 122, 18, -6, 2, 0 + .byte -1, 4, -13, 118, 27, -9, 3, -1 + .byte -1, 4, -16, 112, 37, -11, 4, -1 + .byte -1, 5, -18, 105, 48, -14, 4, -1 + .byte -1, 5, -19, 97, 58, -16, 5, -1 + .byte -1, 6, -19, 88, 68, -18, 5, -1 + .byte -1, 6, -19, 78, 78, -19, 6, -1 + .byte -1, 5, -18, 68, 88, -19, 6, -1 + .byte -1, 5, -16, 58, 97, -19, 5, -1 + .byte -1, 4, -14, 48, 105, -18, 5, -1 + .byte -1, 4, -11, 37, 112, -16, 4, -1 + .byte -1, 3, -9, 27, 118, -13, 4, -1 + .byte 0, 2, -6, 18, 122, -10, 3, -1 + .byte 0, 1, -3, 8, 126, -5, 1, 0 +subpel_filters_sharp: + .byte 0, 0, 0, 128, 0, 0, 0, 0 + .byte -1, 3, -7, 127, 8, -3, 1, 0 + .byte -2, 5, -13, 125, 17, -6, 3, -1 + .byte -3, 7, -17, 121, 27, -10, 5, -2 + .byte -4, 9, -20, 115, 37, -13, 6, -2 + .byte -4, 10, -23, 108, 48, -16, 8, -3 + .byte -4, 10, -24, 100, 59, -19, 9, -3 + .byte -4, 11, -24, 90, 70, -21, 
10, -4 + .byte -4, 11, -23, 80, 80, -23, 11, -4 + .byte -4, 10, -21, 70, 90, -24, 11, -4 + .byte -3, 9, -19, 59, 100, -24, 10, -4 + .byte -3, 8, -16, 48, 108, -23, 10, -4 + .byte -2, 6, -13, 37, 115, -20, 9, -4 + .byte -2, 5, -10, 27, 121, -17, 7, -3 + .byte -1, 3, -6, 17, 125, -13, 5, -2 + .byte 0, 1, -3, 8, 127, -7, 3, -1 +subpel_filters_smooth: + .byte 0, 0, 0, 128, 0, 0, 0, 0 + .byte -3, -1, 32, 64, 38, 1, -3, 0 + .byte -2, -2, 29, 63, 41, 2, -3, 0 + .byte -2, -2, 26, 63, 43, 4, -4, 0 + .byte -2, -3, 24, 62, 46, 5, -4, 0 + .byte -2, -3, 21, 60, 49, 7, -4, 0 + .byte -1, -4, 18, 59, 51, 9, -4, 0 + .byte -1, -4, 16, 57, 53, 12, -4, -1 + .byte -1, -4, 14, 55, 55, 14, -4, -1 + .byte -1, -4, 12, 53, 57, 16, -4, -1 + .byte 0, -4, 9, 51, 59, 18, -4, -1 + .byte 0, -4, 7, 49, 60, 21, -3, -2 + .byte 0, -4, 5, 46, 62, 24, -3, -2 + .byte 0, -4, 4, 43, 63, 26, -2, -2 + .byte 0, -3, 2, 41, 63, 29, -2, -2 + .byte 0, -3, 1, 38, 64, 32, -1, -3 +endconst + +.macro epel_filter name type regtype + lla \regtype\()2, subpel_filters_\name + li \regtype\()1, 8 +.ifc \type,v + mul \regtype\()0, a6, \regtype\()1 +.elseif \type == h + mul \regtype\()0, a5, \regtype\()1 +.endif + add \regtype\()0, \regtype\()0, \regtype\()2 + .irp n 1,2,3,4,5,6 + lb \regtype\n, \n(\regtype\()0) + .endr +.ifc \regtype,t + lb a7, 7(\regtype\()0) +.elseif \regtype == s + lb s7, 7(\regtype\()0) +.endif + lb \regtype\()0, 0(\regtype\()0) +.endm + +.macro epel_load dst len op name type from_mem regtype + li a5, 64 +.ifc \from_mem, 1 + vle8.v v22, (a2) +.ifc \type,v + sub a2, a2, a3 + vle8.v v20, (a2) + sh1add a2, a3, a2 + vle8.v v24, (a2) + add a2, a2, a3 + vle8.v v26, (a2) + add a2, a2, a3 + vle8.v v28, (a2) + add a2, a2, a3 + vle8.v v30, (a2) +.elseif \type == h + addi a2, a2, -1 + vle8.v v20, (a2) + addi a2, a2, 2 + vle8.v v24, (a2) + addi a2, a2, 1 + vle8.v v26, (a2) + addi a2, a2, 1 + vle8.v v28, (a2) + addi a2, a2, 1 + vle8.v v30, (a2) +.endif + +.ifc \name,smooth + vwmulu.vx v16, v24, \regtype\()4 + 
vwmaccu.vx v16, \regtype\()2, v20 + vwmaccu.vx v16, \regtype\()5, v26 + vwmaccsu.vx v16, \regtype\()6, v28 +.else + vwmulu.vx v16, v28, \regtype\()6 + vwmaccsu.vx v16, \regtype\()2, v20 + vwmaccsu.vx v16, \regtype\()5, v26 +.endif + +.ifc \regtype,t + vwmaccsu.vx v16, a7, v30 +.elseif \regtype == s + vwmaccsu.vx v16, s7, v30 +.endif + +.ifc \type,v + .rept 6 + sub a2, a2, a3 + .endr + vle8.v v28, (a2) + sub a2, a2, a3 + vle8.v v26, (a2) + sh1add a2, a3, a2 + add a2, a2, a3 +.elseif \type == h + addi a2, a2, -6 + vle8.v v28, (a2) + addi a2, a2, -1 + vle8.v v26, (a2) + addi a2, a2, 3 +.endif + +.ifc \name,smooth + vwmaccsu.vx v16, \regtype\()1, v28 +.else + vwmaccu.vx v16, \regtype\()1, v28 + vwmulu.vx v28, v24, \regtype\()4 +.endif + vwmaccsu.vx v16, \regtype\()0, v26 + vwmulu.vx v20, v22, \regtype\()3 +.else +.ifc \name,smooth + vwmulu.vx v16, v8, \regtype\()4 + vwmaccu.vx v16, \regtype\()2, v4 + vwmaccu.vx v16, \regtype\()5, v10 + vwmaccsu.vx v16, \regtype\()6, v12 + vwmaccsu.vx v16, \regtype\()1, v2 +.else + vwmulu.vx v16, v2, \regtype\()1 + vwmaccu.vx v16, \regtype\()6, v12 + vwmaccsu.vx v16, \regtype\()5, v10 + vwmaccsu.vx v16, \regtype\()2, v4 + vwmulu.vx v28, v8, \regtype\()4 +.endif + vwmaccsu.vx v16, \regtype\()0, v0 + vwmulu.vx v20, v6, \regtype\()3 + +.ifc \regtype,t + vwmaccsu.vx v16, a7, v14 +.elseif \regtype == s + vwmaccsu.vx v16, s7, v14 +.endif + +.endif + vwadd.wx v16, v16, a5 + vsetvlstatic16 \len + +.ifc \name,smooth + vwadd.vv v24, v16, v20 +.else + vwadd.vv v24, v16, v28 + vwadd.wv v24, v24, v20 +.endif + vnsra.wi v24, v24, 7 + vmax.vx v24, v24, zero + vsetvlstatic8 \len, zero, 32, m2 + + vnclipu.wi \dst, v24, 0 +.ifc \op,avg + vle8.v v24, (a0) + vaaddu.vv \dst, \dst, v24 +.endif + +.endm + +.macro epel_load_inc dst len op name type from_mem regtype + epel_load \dst \len \op \name \type \from_mem \regtype + add a2, a2, a3 +.endm + +.macro epel len op name type vlen +func ff_\op\()_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x + epel_filter 
\name \type t +.if \vlen < 256 + vsetvlstatic8 \len a5 32 m2 +.else + vsetvlstatic8 \len a5 64 m2 +.endif +.ifc \op,avg + csrwi vxrm, 0 +.endif + +1: + addi a4, a4, -1 + epel_load v30 \len \op \name \type 1 t + vse8.v v30, (a0) +.if \len == 64 && \vlen < 256 + addi a0, a0, 32 + addi a2, a2, 32 + epel_load v30 \len \op \name \type 1 t + vse8.v v30, (a0) + addi a0, a0, -32 + addi a2, a2, -32 +.endif + add a2, a2, a3 + add a0, a0, a1 + bnez a4, 1b + + ret +endfunc +.endm + .irp len 64, 32, 16, 8, 4 copy_avg \len .irp op put avg bilin_h_v \len \op h a5 bilin_h_v \len \op v a6 + .irp name regular sharp smooth + .irp type h v + epel \len \op \name \type 128 + epel \len \op \name \type 256 + .endr + .endr .endr .endr diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index 79330b4968..1638daaae3 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen) \ +void ff_put_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_put_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_put_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ 
+void ff_avg_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); \ \ -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \ +void ff_avg_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \ + ptrdiff_t dststride, \ const uint8_t *src, \ ptrdiff_t srcstride, \ int h, int mx, int my); @@ -146,23 +152,41 @@ void ff_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int mx, int my); -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR); -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR); - -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP); -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP); - -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH); -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH); +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128); + 
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128); + +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128); +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128); + +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256); + +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256); + +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256); +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256); VP9_BILINEAR_RISCV_RVV_FUNC(64); VP9_BILINEAR_RISCV_RVV_FUNC(32); diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index 31120a7893..0ae14879ea 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) # endif #if HAVE_RVV - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 
16) { + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) { + if (ff_get_rv_vlenb() >= 16) { #define init_fpel(idx1, sz) \ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv; \ @@ -63,6 +64,26 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) init_fpel(3, 8); init_fpel(4, 4); +#undef init_fpel + +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ + ff_##type##_8tap_smooth_##sz##dir##_rvv##vlen; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ + ff_##type##_8tap_regular_##sz##dir##_rvv##vlen; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ + ff_##type##_8tap_sharp_##sz##dir##_rvv##vlen; + +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \ + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \ + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \ + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \ + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \ + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen) + + init_subpel2(0, 1, 0, h, put, 128); + init_subpel2(1, 1, 0, h, avg, 128); + dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_64v_rvv; dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_64h_rvv; dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_64v_rvv; @@ -84,8 +105,23 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_4h_rvv; -#undef init_fpel + if (flags & AV_CPU_FLAG_RVB_ADDR) { + init_subpel2(0, 0, 1, v, put, 128); + init_subpel2(1, 0, 1, v, avg, 128); } + + } + if (ff_get_rv_vlenb() >= 32) { + init_subpel2(0, 1, 0, h, put, 256); + init_subpel2(1, 1, 0, h, avg, 256); + + if (flags & AV_CPU_FLAG_RVB_ADDR) { + init_subpel2(0, 0, 1, v, put, 256); + init_subpel2(1, 0, 1, v, avg, 256); + } + } + } + #endif #endif } -- 2.45.0 
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 25+ messages in thread
end of thread, other threads:[~2024-05-18 18:18 UTC | newest] Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <20240513165926.1467967-1-uk7b@foxmail.com> 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy uk7b 2024-05-13 19:55 ` Rémi Denis-Courmont 2024-05-14 4:44 ` flow gg 2024-05-14 17:02 ` Rémi Denis-Courmont 2024-05-14 17:20 ` flow gg 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 3/9] lavc/vp9dsp: R-V V ipred hor uk7b 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm uk7b 2024-05-14 17:45 ` Rémi Denis-Courmont 2024-05-14 17:57 ` flow gg 2024-05-14 18:08 ` Rémi Denis-Courmont 2024-05-14 18:33 ` uk7b 2024-05-14 18:34 ` flow gg 2024-05-15 3:55 ` [FFmpeg-devel] [PATCH " uk7b 2024-05-15 3:56 ` flow gg 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg uk7b 2024-05-17 15:11 ` Rémi Denis-Courmont 2024-05-17 16:05 ` [FFmpeg-devel] [PATCH 1/5] " uk7b 2024-05-17 16:06 ` [FFmpeg-devel] [PATCH v3 5/9] " flow gg 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b 2024-05-18 15:56 ` Rémi Denis-Courmont 2024-05-18 18:18 ` flow gg 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 7/9] lavc/vp9dsp: R-V V mc tap " uk7b 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b 2024-05-13 16:59 ` [FFmpeg-devel] [PATCH v3 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b [not found] <20240512100331.995415-1-uk7b@foxmail.com> 2024-05-12 10:03 ` [FFmpeg-devel] [PATCH v3 7/9] lavc/vp9dsp: R-V V mc tap h v uk7b
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git