* [FFmpeg-devel] [PATCH v2 2/9] lavc/vp9dsp: R-V mc copy
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
@ 2024-05-07 7:36 ` uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor uk7b
` (6 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07 7:36 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_put4_8bpp_c: 0.7
vp9_put4_8bpp_rvi: 0.5
vp9_put8_8bpp_c: 2.5
vp9_put8_8bpp_rvi: 0.5
vp9_put16_8bpp_c: 16.7
vp9_put16_8bpp_rvi: 1.5
vp9_put32_8bpp_c: 37.2
vp9_put32_8bpp_rvi: 5.7
vp9_put64_8bpp_c: 107.5
vp9_put64_8bpp_rvi: 21.7
---
libavcodec/riscv/Makefile | 3 +-
libavcodec/riscv/vp9_mc_rvi.S | 105 +++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp.h | 3 +
libavcodec/riscv/vp9dsp_init.c | 25 ++++++++
4 files changed, 135 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/vp9_mc_rvi.S
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 65dd0d656a..5846861bac 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -63,7 +63,8 @@ RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
-RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o
+RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
+ riscv/vp9_mc_rvi.o
RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vp9_mc_rvi.S b/libavcodec/riscv/vp9_mc_rvi.S
new file mode 100644
index 0000000000..0db14e83c7
--- /dev/null
+++ b/libavcodec/riscv/vp9_mc_rvi.S
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+#if __riscv_xlen >= 64
+func ff_copy64_rvi
+1:
+ addi a4, a4, -1
+ ld t0, (a2)
+ ld t1, 8(a2)
+ ld t2, 16(a2)
+ ld t3, 24(a2)
+ ld t4, 32(a2)
+ ld t5, 40(a2)
+ ld t6, 48(a2)
+ ld a7, 56(a2)
+ sd t0, (a0)
+ sd t1, 8(a0)
+ sd t2, 16(a0)
+ sd t3, 24(a0)
+ sd t4, 32(a0)
+ sd t5, 40(a0)
+ sd t6, 48(a0)
+ sd a7, 56(a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+
+func ff_copy32_rvi
+1:
+ addi a4, a4, -1
+ ld t0, (a2)
+ ld t1, 8(a2)
+ ld t2, 16(a2)
+ ld t3, 24(a2)
+ sd t0, (a0)
+ sd t1, 8(a0)
+ sd t2, 16(a0)
+ sd t3, 24(a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+
+func ff_copy16_rvi
+1:
+ addi a4, a4, -1
+ ld t0, (a2)
+ ld t1, 8(a2)
+ sd t0, (a0)
+ sd t1, 8(a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+
+func ff_copy8_rvi
+1:
+ addi a4, a4, -1
+ ld t0, (a2)
+ sd t0, (a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+#endif
+
+func ff_copy4_rvi
+1:
+ addi a4, a4, -1
+ lw t0, (a2)
+ sw t0, (a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index f8bc6563a5..b8ff282f8a 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -167,6 +167,9 @@ void ff_copy##SIZE##_rvi(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my);
+VP9_COPY_RISCV_RVI_FUNC(64);
+VP9_COPY_RISCV_RVI_FUNC(32);
+VP9_COPY_RISCV_RVI_FUNC(16);
VP9_COPY_RISCV_RVI_FUNC(8);
VP9_COPY_RISCV_RVI_FUNC(4);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index d249dd71b2..c10f8bbe41 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -64,7 +64,32 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
#endif
}
+/**
+ * Install the RISC-V motion-compensation routines into the VP9 DSP context.
+ *
+ * Only 8-bit content is handled here. For each block size the same scalar
+ * copy routine is registered in the [0][0][0] slot of all four filter types
+ * (smooth, regular, sharp, bilinear).
+ *
+ * @param dsp  DSP context whose mc[] table is filled in
+ * @param bpp  bit depth of the stream; nothing is installed unless it is 8
+ */
+static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
+{
+#if HAVE_RV
+    int flags = av_get_cpu_flags();
+
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) {
+
+#define init_fpel(idx1, sz) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][0][0][0] = ff_copy##sz##_rvi; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][0][0][0] = ff_copy##sz##_rvi; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP ][0][0][0] = ff_copy##sz##_rvi; \
+    dsp->mc[idx1][FILTER_BILINEAR ][0][0][0] = ff_copy##sz##_rvi
+
+        /*
+         * ff_copy{64,32,16,8}_rvi are built from 64-bit ld/sd and are only
+         * assembled when __riscv_xlen >= 64 (see vp9_mc_rvi.S). Referencing
+         * them unconditionally would leave undefined symbols on RV32, so
+         * apply the same guard here. ff_copy4_rvi uses lw/sw and is always
+         * available.
+         *
+         * NOTE(review): the copy routines load/store whole words or
+         * doublewords without any alignment check - confirm that src/dst
+         * are suitably aligned for every caller, or gate this on a CPU flag
+         * that advertises fast misaligned accesses.
+         */
+#if __riscv_xlen >= 64
+        init_fpel(0, 64);
+        init_fpel(1, 32);
+        init_fpel(2, 16);
+        init_fpel(3, 8);
+#endif
+        init_fpel(4, 4);
+
+#undef init_fpel
+    }
+#endif
+}
+
av_cold void ff_vp9dsp_init_riscv(VP9DSPContext *dsp, int bpp, int bitexact)
{
vp9dsp_intrapred_init_riscv(dsp, bpp);
+ vp9dsp_mc_init_riscv(dsp, bpp);
}
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 2/9] lavc/vp9dsp: R-V mc copy uk7b
@ 2024-05-07 7:36 ` uk7b
2024-05-07 16:08 ` Rémi Denis-Courmont
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 4/9] lavc/vp9dsp: R-V V ipred tm uk7b
` (5 subsequent siblings)
7 siblings, 1 reply; 10+ messages in thread
From: uk7b @ 2024-05-07 7:36 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_hor_8x8_8bpp_c: 74.7
vp9_hor_8x8_8bpp_rvv_i32: 35.7
vp9_hor_16x16_8bpp_c: 175.5
vp9_hor_16x16_8bpp_rvv_i32: 80.2
vp9_hor_32x32_8bpp_c: 510.2
vp9_hor_32x32_8bpp_rvv_i32: 264.0
---
libavcodec/riscv/vp9_intra_rvv.S | 56 ++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp.h | 6 ++++
libavcodec/riscv/vp9dsp_init.c | 3 ++
3 files changed, 65 insertions(+)
diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S
index db9774c263..dd9bc036e7 100644
--- a/libavcodec/riscv/vp9_intra_rvv.S
+++ b/libavcodec/riscv/vp9_intra_rvv.S
@@ -113,3 +113,59 @@ func_dc dc_left 8 left 3 0 zve64x
func_dc dc_top 32 top 5 1 zve32x
func_dc dc_top 16 top 4 1 zve32x
func_dc dc_top 8 top 3 0 zve64x
+
+func ff_h_32x32_rvv, zve32x
+ li t0, 32
+ addi a2, a2, 31
+ vsetvli zero, t0, e8, m2, ta, ma
+
+ .rept 2
+ .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ lbu t1, (a2)
+ addi a2, a2, -1
+ vmv.v.x v\n, t1
+ .endr
+ .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ vse8.v v\n, (a0)
+ add a0, a0, a1
+ .endr
+ .endr
+
+ ret
+endfunc
+
+func ff_h_16x16_rvv, zve32x
+ addi a2, a2, 15
+ vsetivli zero, 16, e8, m1, ta, ma
+
+ .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+ lbu t1, (a2)
+ addi a2, a2, -1
+ vmv.v.x v\n, t1
+ .endr
+ .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+ vse8.v v\n, (a0)
+ add a0, a0, a1
+ .endr
+ vse8.v v23, (a0)
+
+ ret
+endfunc
+
+func ff_h_8x8_rvv, zve32x
+ addi a2, a2, 7
+ vsetivli zero, 8, e8, mf2, ta, ma
+
+ .irp n 8, 9, 10, 11, 12, 13, 14, 15
+ lbu t1, (a2)
+ addi a2, a2, -1
+ vmv.v.x v\n, t1
+ .endr
+ .irp n 8, 9, 10, 11, 12, 13, 14
+ vse8.v v\n, (a0)
+ add a0, a0, a1
+ .endr
+ vse8.v v15, (a0)
+
+ ret
+endfunc
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index b8ff282f8a..0ad961c7e0 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
+void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+ const uint8_t *a);
+void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+ const uint8_t *a);
+void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+ const uint8_t *a);
#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \
void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index c10f8bbe41..7816b13fe0 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -59,6 +59,9 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
+ dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv;
+ dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv;
+ dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
}
#endif
#endif
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 4/9] lavc/vp9dsp: R-V V ipred tm
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 2/9] lavc/vp9dsp: R-V mc copy uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor uk7b
@ 2024-05-07 7:36 ` uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 5/9] lavc/vp9dsp: R-V V mc avg uk7b
` (4 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07 7:36 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_tm_4x4_8bpp_c: 116.5
vp9_tm_4x4_8bpp_rvv_i32: 43.5
vp9_tm_8x8_8bpp_c: 416.2
vp9_tm_8x8_8bpp_rvv_i32: 86.0
vp9_tm_16x16_8bpp_c: 1665.5
vp9_tm_16x16_8bpp_rvv_i32: 187.2
vp9_tm_32x32_8bpp_c: 6974.2
vp9_tm_32x32_8bpp_rvv_i32: 625.7
---
libavcodec/riscv/vp9_intra_rvv.S | 141 +++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp.h | 8 ++
libavcodec/riscv/vp9dsp_init.c | 4 +
3 files changed, 153 insertions(+)
diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S
index dd9bc036e7..7a51aa2bf1 100644
--- a/libavcodec/riscv/vp9_intra_rvv.S
+++ b/libavcodec/riscv/vp9_intra_rvv.S
@@ -169,3 +169,144 @@ func ff_h_8x8_rvv, zve32x
ret
endfunc
+
+.macro tm_sum dst, top, offset
+ lbu t3, \offset(a2)
+ sub t3, t3, a4
+ vadd.vx \dst, \top, t3
+.endm
+
+func ff_tm_32x32_rvv, zve32x
+ lbu a4, -1(a3)
+ li t5, 32
+
+ .macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8
+ vsetvli zero, t5, e16, m4, ta, ma
+ vle8.v v8, (a3)
+ vzext.vf2 v28, v8
+
+ tm_sum v0, v28, \n1
+ tm_sum v4, v28, \n2
+ tm_sum v8, v28, \n3
+ tm_sum v12, v28, \n4
+ tm_sum v16, v28, \n5
+ tm_sum v20, v28, \n6
+ tm_sum v24, v28, \n7
+ tm_sum v28, v28, \n8
+
+ .irp n 0, 4, 8, 12, 16, 20, 24, 28
+ vmax.vx v\n, v\n, zero
+ .endr
+
+ vsetvli zero, zero, e8, m2, ta, ma
+ .irp n 0, 4, 8, 12, 16, 20, 24, 28
+ vnclipu.wi v\n, v\n, 0
+ vse8.v v\n, (a0)
+ add a0, a0, a1
+ .endr
+ .endm
+
+ tm_sum32 31, 30, 29, 28, 27, 26, 25, 24
+ tm_sum32 23, 22, 21, 20, 19, 18, 17, 16
+ tm_sum32 15, 14, 13, 12, 11, 10, 9, 8
+ tm_sum32 7, 6, 5, 4, 3, 2, 1, 0
+
+ ret
+endfunc
+
+func ff_tm_16x16_rvv, zve32x
+ vsetivli zero, 16, e16, m2, ta, ma
+ vle8.v v8, (a3)
+ vzext.vf2 v30, v8
+ lbu a4, -1(a3)
+
+ tm_sum v0, v30, 15
+ tm_sum v2, v30, 14
+ tm_sum v4, v30, 13
+ tm_sum v6, v30, 12
+ tm_sum v8, v30, 11
+ tm_sum v10, v30, 10
+ tm_sum v12, v30, 9
+ tm_sum v14, v30, 8
+ tm_sum v16, v30, 7
+ tm_sum v18, v30, 6
+ tm_sum v20, v30, 5
+ tm_sum v22, v30, 4
+ tm_sum v24, v30, 3
+ tm_sum v26, v30, 2
+ tm_sum v28, v30, 1
+ tm_sum v30, v30, 0
+
+ .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ vmax.vx v\n, v\n, zero
+ .endr
+
+ vsetvli zero, zero, e8, m1, ta, ma
+ .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28
+ vnclipu.wi v\n, v\n, 0
+ vse8.v v\n, (a0)
+ add a0, a0, a1
+ .endr
+ vnclipu.wi v30, v30, 0
+ vse8.v v30, (a0)
+
+ ret
+endfunc
+
+func ff_tm_8x8_rvv, zve32x
+ vsetivli zero, 8, e16, m1, ta, ma
+ vle8.v v8, (a3)
+ vzext.vf2 v28, v8
+ lbu a4, -1(a3)
+
+ tm_sum v16, v28, 7
+ tm_sum v17, v28, 6
+ tm_sum v18, v28, 5
+ tm_sum v19, v28, 4
+ tm_sum v20, v28, 3
+ tm_sum v21, v28, 2
+ tm_sum v22, v28, 1
+ tm_sum v23, v28, 0
+
+ .irp n 16, 17, 18, 19, 20, 21, 22, 23
+ vmax.vx v\n, v\n, zero
+ .endr
+
+ vsetvli zero, zero, e8, mf2, ta, ma
+ .irp n 16, 17, 18, 19, 20, 21, 22
+ vnclipu.wi v\n, v\n, 0
+ vse8.v v\n, (a0)
+ add a0, a0, a1
+ .endr
+ vnclipu.wi v24, v23, 0
+ vse8.v v24, (a0)
+
+ ret
+endfunc
+
+func ff_tm_4x4_rvv, zve32x
+ vsetivli zero, 4, e16, mf2, ta, ma
+ vle8.v v8, (a3)
+ vzext.vf2 v28, v8
+ lbu a4, -1(a3)
+
+ tm_sum v16, v28, 3
+ tm_sum v17, v28, 2
+ tm_sum v18, v28, 1
+ tm_sum v19, v28, 0
+
+ .irp n 16, 17, 18, 19
+ vmax.vx v\n, v\n, zero
+ .endr
+
+ vsetvli zero, zero, e8, mf4, ta, ma
+ .irp n 16, 17, 18
+ vnclipu.wi v\n, v\n, 0
+ vse8.v v\n, (a0)
+ add a0, a0, a1
+ .endr
+ vnclipu.wi v24, v19, 0
+ vse8.v v24, (a0)
+
+ ret
+endfunc
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 0ad961c7e0..79330b4968 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
+void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+ const uint8_t *a);
+void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+ const uint8_t *a);
+void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+ const uint8_t *a);
+void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+ const uint8_t *a);
#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \
void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 7816b13fe0..8023c333db 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -62,6 +62,10 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv;
dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv;
dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
+ dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv;
+ dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv;
+ dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv;
+ dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv;
}
#endif
#endif
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 5/9] lavc/vp9dsp: R-V V mc avg
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
` (2 preceding siblings ...)
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 4/9] lavc/vp9dsp: R-V V ipred tm uk7b
@ 2024-05-07 7:36 ` uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b
` (3 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07 7:36 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_avg4_8bpp_c: 1.2
vp9_avg4_8bpp_rvv_i64: 1.0
vp9_avg8_8bpp_c: 3.7
vp9_avg8_8bpp_rvv_i64: 1.5
vp9_avg16_8bpp_c: 14.7
vp9_avg16_8bpp_rvv_i64: 3.5
vp9_avg32_8bpp_c: 57.7
vp9_avg32_8bpp_rvv_i64: 10.0
vp9_avg64_8bpp_c: 229.0
vp9_avg64_8bpp_rvv_i64: 31.7
---
libavcodec/riscv/Makefile | 3 +-
libavcodec/riscv/vp9_mc_rvv.S | 58 ++++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp_init.c | 19 +++++++++++
3 files changed, 79 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 5846861bac..73c9f24d97 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -65,6 +65,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
riscv/vp9_mc_rvi.o
-RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
+RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
+ riscv/vp9_mc_rvv.o
OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
new file mode 100644
index 0000000000..81ecb49435
--- /dev/null
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 len an mn=m4
+.if \len <= 4
+ vsetivli zero, \len, e8, mf4, ta, ma
+.elseif \len <= 8
+ vsetivli zero, \len, e8, mf2, ta, ma
+.elseif \len <= 16
+ vsetivli zero, \len, e8, m1, ta, ma
+.elseif \len <= 32
+ li \an, \len
+ vsetvli zero, \an, e8, m2, ta, ma
+.elseif \len <= 64
+ li \an, \len
+ vsetvli zero, \an, e8, \mn, ta, ma
+.endif
+.endm
+
+.macro copy_avg len
+func ff_avg\len\()_rvv, zve32x
+ csrwi vxrm, 0
+ vsetvlstatic8 \len t0
+1:
+ addi a4, a4, -1
+ vle8.v v8, (a2)
+ vle8.v v16, (a0)
+ vaaddu.vv v8, v8, v16
+ vse8.v v8, (a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+ ret
+endfunc
+.endm
+
+.irp len 64, 32, 16, 8, 4
+ copy_avg \len
+.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 8023c333db..2caaf732db 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -92,6 +92,25 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
#undef init_fpel
}
+
+#if HAVE_RVV
+ if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) {
+
+#define init_fpel(idx1, sz) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][1][0][0] = ff_avg##sz##_rvv; \
+ dsp->mc[idx1][FILTER_BILINEAR ][1][0][0] = ff_avg##sz##_rvv
+
+ init_fpel(0, 64);
+ init_fpel(1, 32);
+ init_fpel(2, 16);
+ init_fpel(3, 8);
+ init_fpel(4, 4);
+
+#undef init_fpel
+ }
+#endif
#endif
}
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 6/9] lavc/vp9dsp: R-V V mc bilin h v
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
` (3 preceding siblings ...)
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 5/9] lavc/vp9dsp: R-V V mc avg uk7b
@ 2024-05-07 7:36 ` uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 7/9] lavc/vp9dsp: R-V V mc tap " uk7b
` (2 subsequent siblings)
7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07 7:36 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_avg_bilin_4h_8bpp_c: 5.2
vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2
vp9_avg_bilin_4v_8bpp_c: 5.5
vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2
vp9_avg_bilin_8h_8bpp_c: 20.0
vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5
vp9_avg_bilin_8v_8bpp_c: 21.0
vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2
vp9_avg_bilin_16h_8bpp_c: 78.2
vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0
vp9_avg_bilin_16v_8bpp_c: 82.0
vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0
vp9_avg_bilin_32h_8bpp_c: 325.5
vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2
vp9_avg_bilin_32v_8bpp_c: 326.2
vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2
vp9_avg_bilin_64h_8bpp_c: 1265.7
vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5
vp9_avg_bilin_64v_8bpp_c: 1317.0
vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2
vp9_put_bilin_4h_8bpp_c: 4.5
vp9_put_bilin_4h_8bpp_rvv_i64: 1.7
vp9_put_bilin_4v_8bpp_c: 4.7
vp9_put_bilin_4v_8bpp_rvv_i64: 1.7
vp9_put_bilin_8h_8bpp_c: 17.0
vp9_put_bilin_8h_8bpp_rvv_i64: 3.5
vp9_put_bilin_8v_8bpp_c: 18.0
vp9_put_bilin_8v_8bpp_rvv_i64: 3.5
vp9_put_bilin_16h_8bpp_c: 65.2
vp9_put_bilin_16h_8bpp_rvv_i64: 7.5
vp9_put_bilin_16v_8bpp_c: 85.7
vp9_put_bilin_16v_8bpp_rvv_i64: 7.5
vp9_put_bilin_32h_8bpp_c: 257.5
vp9_put_bilin_32h_8bpp_rvv_i64: 23.5
vp9_put_bilin_32v_8bpp_c: 274.5
vp9_put_bilin_32v_8bpp_rvv_i64: 23.5
vp9_put_bilin_64h_8bpp_c: 1040.5
vp9_put_bilin_64h_8bpp_rvv_i64: 82.5
vp9_put_bilin_64v_8bpp_c: 1108.7
vp9_put_bilin_64v_8bpp_rvv_i64: 82.2
---
libavcodec/riscv/vp9_mc_rvv.S | 43 ++++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp_init.c | 22 +++++++++++++++++
2 files changed, 65 insertions(+)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 81ecb49435..598a67fc94 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -53,6 +53,49 @@ func ff_avg\len\()_rvv, zve32x
endfunc
.endm
+.macro bilin_load dst len op type mn
+.ifc \type,v
+ add t5, a2, a3
+.elseif \type == h
+ addi t5, a2, 1
+.endif
+ vle8.v v8, (a2)
+ vle8.v v0, (t5)
+ vwmulu.vx v16, v0, \mn
+ vwmaccsu.vx v16, t1, v8
+ vwadd.wx v16, v16, t4
+ vnsra.wi v16, v16, 4
+ vadd.vv \dst, v16, v8
+.ifc \op,avg
+ vle8.v v16, (a0)
+ vaaddu.vv \dst, \dst, v16
+.endif
+.endm
+
+.macro bilin_h_v len op type mn
+func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+ vsetvlstatic8 \len t0
+ li t4, 8
+ neg t1, \mn
+1:
+ addi a4, a4, -1
+ bilin_load v0, \len, \op, \type, \mn
+ vse8.v v0, (a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len 64, 32, 16, 8, 4
copy_avg \len
+ .irp op put avg
+ bilin_h_v \len \op h a5
+ bilin_h_v \len \op v a6
+ .endr
.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 2caaf732db..cfeaa06c0a 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -109,6 +109,28 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
init_fpel(4, 4);
#undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][idxh][idxv] = \
+ ff_##type##_bilin_##sz##dir##_rvv; \
+
+#define init_subpel2(idx, idxh, idxv, dir, type) \
+ init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
+ init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
+ init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
+ init_subpel1(3, idx, idxh, idxv, 8, dir, type); \
+ init_subpel1(4, idx, idxh, idxv, 4, dir, type)
+
+#define init_subpel3(idx, type) \
+ init_subpel2(idx, 1, 0, h, type); \
+ init_subpel2(idx, 0, 1, v, type); \
+
+ init_subpel3(0, put);
+ init_subpel3(1, avg);
+
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
}
#endif
#endif
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 7/9] lavc/vp9dsp: R-V V mc tap h v
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
` (4 preceding siblings ...)
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b
@ 2024-05-07 7:36 ` uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b
7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07 7:36 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_avg_8tap_smooth_4h_8bpp_c: 13.0
vp9_avg_8tap_smooth_4h_8bpp_rvv_i64: 5.0
vp9_avg_8tap_smooth_4v_8bpp_c: 13.7
vp9_avg_8tap_smooth_4v_8bpp_rvv_i64: 5.0
vp9_avg_8tap_smooth_8h_8bpp_c: 48.7
vp9_avg_8tap_smooth_8h_8bpp_rvv_i64: 9.5
vp9_avg_8tap_smooth_8v_8bpp_c: 50.0
vp9_avg_8tap_smooth_8v_8bpp_rvv_i64: 9.5
vp9_avg_8tap_smooth_16h_8bpp_c: 192.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i64: 21.2
vp9_avg_8tap_smooth_16v_8bpp_c: 191.5
vp9_avg_8tap_smooth_16v_8bpp_rvv_i64: 21.2
vp9_avg_8tap_smooth_32h_8bpp_c: 763.7
vp9_avg_8tap_smooth_32h_8bpp_rvv_i64: 67.2
vp9_avg_8tap_smooth_32v_8bpp_c: 770.7
vp9_avg_8tap_smooth_32v_8bpp_rvv_i64: 67.2
vp9_avg_8tap_smooth_64h_8bpp_c: 3098.7
vp9_avg_8tap_smooth_64h_8bpp_rvv_i64: 283.2
vp9_avg_8tap_smooth_64v_8bpp_c: 3045.2
vp9_avg_8tap_smooth_64v_8bpp_rvv_i64: 266.7
vp9_put_8tap_smooth_4h_8bpp_c: 11.0
vp9_put_8tap_smooth_4h_8bpp_rvv_i64: 4.2
vp9_put_8tap_smooth_4v_8bpp_c: 28.5
vp9_put_8tap_smooth_4v_8bpp_rvv_i64: 4.2
vp9_put_8tap_smooth_8h_8bpp_c: 42.2
vp9_put_8tap_smooth_8h_8bpp_rvv_i64: 8.5
vp9_put_8tap_smooth_8v_8bpp_c: 43.7
vp9_put_8tap_smooth_8v_8bpp_rvv_i64: 8.5
vp9_put_8tap_smooth_16h_8bpp_c: 165.7
vp9_put_8tap_smooth_16h_8bpp_rvv_i64: 19.7
vp9_put_8tap_smooth_16v_8bpp_c: 168.5
vp9_put_8tap_smooth_16v_8bpp_rvv_i64: 19.5
vp9_put_8tap_smooth_32h_8bpp_c: 675.5
vp9_put_8tap_smooth_32h_8bpp_rvv_i64: 64.2
vp9_put_8tap_smooth_32v_8bpp_c: 664.7
vp9_put_8tap_smooth_32v_8bpp_rvv_i64: 64.2
vp9_put_8tap_smooth_64h_8bpp_c: 2680.5
vp9_put_8tap_smooth_64h_8bpp_rvv_i64: 272.0
vp9_put_8tap_smooth_64v_8bpp_c: 2692.5
vp9_put_8tap_smooth_64v_8bpp_rvv_i64: 272.0
---
libavcodec/riscv/vp9_mc_rvv.S | 238 +++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp_init.c | 8 ++
2 files changed, 246 insertions(+)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 598a67fc94..99605dfbb5 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
.endif
.endm
+.macro vsetvlstatic16 len
+.ifc \len,4
+ vsetvli zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+ vsetvli zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+ vsetvli zero, zero, e16, m2, ta, ma
+.else
+ vsetvli zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
.macro copy_avg len
func ff_avg\len\()_rvv, zve32x
csrwi vxrm, 0
@@ -92,10 +104,236 @@ func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x
endfunc
.endm
+const subpel_filters_regular
+ .byte 0, 0, 0, 128, 0, 0, 0, 0
+ .byte 0, 1, -5, 126, 8, -3, 1, 0
+ .byte -1, 3, -10, 122, 18, -6, 2, 0
+ .byte -1, 4, -13, 118, 27, -9, 3, -1
+ .byte -1, 4, -16, 112, 37, -11, 4, -1
+ .byte -1, 5, -18, 105, 48, -14, 4, -1
+ .byte -1, 5, -19, 97, 58, -16, 5, -1
+ .byte -1, 6, -19, 88, 68, -18, 5, -1
+ .byte -1, 6, -19, 78, 78, -19, 6, -1
+ .byte -1, 5, -18, 68, 88, -19, 6, -1
+ .byte -1, 5, -16, 58, 97, -19, 5, -1
+ .byte -1, 4, -14, 48, 105, -18, 5, -1
+ .byte -1, 4, -11, 37, 112, -16, 4, -1
+ .byte -1, 3, -9, 27, 118, -13, 4, -1
+ .byte 0, 2, -6, 18, 122, -10, 3, -1
+ .byte 0, 1, -3, 8, 126, -5, 1, 0
+subpel_filters_sharp:
+ .byte 0, 0, 0, 128, 0, 0, 0, 0
+ .byte -1, 3, -7, 127, 8, -3, 1, 0
+ .byte -2, 5, -13, 125, 17, -6, 3, -1
+ .byte -3, 7, -17, 121, 27, -10, 5, -2
+ .byte -4, 9, -20, 115, 37, -13, 6, -2
+ .byte -4, 10, -23, 108, 48, -16, 8, -3
+ .byte -4, 10, -24, 100, 59, -19, 9, -3
+ .byte -4, 11, -24, 90, 70, -21, 10, -4
+ .byte -4, 11, -23, 80, 80, -23, 11, -4
+ .byte -4, 10, -21, 70, 90, -24, 11, -4
+ .byte -3, 9, -19, 59, 100, -24, 10, -4
+ .byte -3, 8, -16, 48, 108, -23, 10, -4
+ .byte -2, 6, -13, 37, 115, -20, 9, -4
+ .byte -2, 5, -10, 27, 121, -17, 7, -3
+ .byte -1, 3, -6, 17, 125, -13, 5, -2
+ .byte 0, 1, -3, 8, 127, -7, 3, -1
+subpel_filters_smooth:
+ .byte 0, 0, 0, 128, 0, 0, 0, 0
+ .byte -3, -1, 32, 64, 38, 1, -3, 0
+ .byte -2, -2, 29, 63, 41, 2, -3, 0
+ .byte -2, -2, 26, 63, 43, 4, -4, 0
+ .byte -2, -3, 24, 62, 46, 5, -4, 0
+ .byte -2, -3, 21, 60, 49, 7, -4, 0
+ .byte -1, -4, 18, 59, 51, 9, -4, 0
+ .byte -1, -4, 16, 57, 53, 12, -4, -1
+ .byte -1, -4, 14, 55, 55, 14, -4, -1
+ .byte -1, -4, 12, 53, 57, 16, -4, -1
+ .byte 0, -4, 9, 51, 59, 18, -4, -1
+ .byte 0, -4, 7, 49, 60, 21, -3, -2
+ .byte 0, -4, 5, 46, 62, 24, -3, -2
+ .byte 0, -4, 4, 43, 63, 26, -2, -2
+ .byte 0, -3, 2, 41, 63, 29, -2, -2
+ .byte 0, -3, 1, 38, 64, 32, -1, -3
+endconst
+
+.macro epel_filter name type regtype
+ lla \regtype\()2, subpel_filters_\name
+ li \regtype\()1, 8
+.ifc \type,v
+ mul \regtype\()0, a6, \regtype\()1
+.elseif \type == h
+ mul \regtype\()0, a5, \regtype\()1
+.endif
+ add \regtype\()0, \regtype\()0, \regtype\()2
+ .irp n 1,2,3,4,5,6
+ lb \regtype\n, \n(\regtype\()0)
+ .endr
+.ifc \regtype,t
+ lb a7, 7(\regtype\()0)
+.elseif \regtype == s
+ lb s7, 7(\regtype\()0)
+.endif
+ lb \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst len op name type from_mem regtype
+ li a5, 64
+.ifc \from_mem, 1
+ vle8.v v22, (a2)
+.ifc \type,v
+ sub a2, a2, a3
+ vle8.v v20, (a2)
+ sh1add a2, a3, a2
+ vle8.v v24, (a2)
+ add a2, a2, a3
+ vle8.v v26, (a2)
+ add a2, a2, a3
+ vle8.v v28, (a2)
+ add a2, a2, a3
+ vle8.v v30, (a2)
+.elseif \type == h
+ addi a2, a2, -1
+ vle8.v v20, (a2)
+ addi a2, a2, 2
+ vle8.v v24, (a2)
+ addi a2, a2, 1
+ vle8.v v26, (a2)
+ addi a2, a2, 1
+ vle8.v v28, (a2)
+ addi a2, a2, 1
+ vle8.v v30, (a2)
+.endif
+
+.ifc \name,smooth
+ vwmulu.vx v16, v24, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, v20
+ vwmaccu.vx v16, \regtype\()5, v26
+ vwmaccsu.vx v16, \regtype\()6, v28
+.else
+ vwmulu.vx v16, v28, \regtype\()6
+ vwmaccsu.vx v16, \regtype\()2, v20
+ vwmaccsu.vx v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+ vwmaccsu.vx v16, a7, v30
+.elseif \regtype == s
+ vwmaccsu.vx v16, s7, v30
+.endif
+
+.ifc \type,v
+ .rept 6
+ sub a2, a2, a3
+ .endr
+ vle8.v v28, (a2)
+ sub a2, a2, a3
+ vle8.v v26, (a2)
+ sh1add a2, a3, a2
+ add a2, a2, a3
+.elseif \type == h
+ addi a2, a2, -6
+ vle8.v v28, (a2)
+ addi a2, a2, -1
+ vle8.v v26, (a2)
+ addi a2, a2, 3
+.endif
+
+.ifc \name,smooth
+ vwmaccsu.vx v16, \regtype\()1, v28
+.else
+ vwmaccu.vx v16, \regtype\()1, v28
+ vwmulu.vx v28, v24, \regtype\()4
+.endif
+ vwmaccsu.vx v16, \regtype\()0, v26
+ vwmulu.vx v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+ vwmulu.vx v16, v8, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, v4
+ vwmaccu.vx v16, \regtype\()5, v10
+ vwmaccsu.vx v16, \regtype\()6, v12
+ vwmaccsu.vx v16, \regtype\()1, v2
+.else
+ vwmulu.vx v16, v2, \regtype\()1
+ vwmaccu.vx v16, \regtype\()6, v12
+ vwmaccsu.vx v16, \regtype\()5, v10
+ vwmaccsu.vx v16, \regtype\()2, v4
+ vwmulu.vx v28, v8, \regtype\()4
+.endif
+ vwmaccsu.vx v16, \regtype\()0, v0
+ vwmulu.vx v20, v6, \regtype\()3
+
+.ifc \regtype,t
+ vwmaccsu.vx v16, a7, v14
+.elseif \regtype == s
+ vwmaccsu.vx v16, s7, v14
+.endif
+
+.endif
+ vwadd.wx v16, v16, a5
+ vsetvlstatic16 \len
+
+.ifc \name,smooth
+ vwadd.vv v24, v16, v20
+.else
+ vwadd.vv v24, v16, v28
+ vwadd.wv v24, v24, v20
+.endif
+ vnsra.wi v24, v24, 7
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \len, zero, m2
+
+ vnclipu.wi \dst, v24, 0
+.ifc \op,avg
+ vle8.v v24, (a0)
+ vaaddu.vv \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst len op name type from_mem regtype
+ epel_load \dst \len \op \name \type \from_mem \regtype
+ add a2, a2, a3
+.endm
+
+.macro epel len op name type
+func ff_\op\()_8tap_\name\()_\len\()\type\()_rvv, zve32x
+ epel_filter \name \type t
+ vsetvlstatic8 \len a5 m2
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+
+1:
+ addi a4, a4, -1
+ epel_load v30 \len \op \name \type 1 t
+ vse8.v v30, (a0)
+.ifc \len,64
+ addi a0, a0, 32
+ addi a2, a2, 32
+ epel_load v30 \len \op \name \type 1 t
+ vse8.v v30, (a0)
+ addi a0, a0, -32
+ addi a2, a2, -32
+.endif
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len 64, 32, 16, 8, 4
copy_avg \len
.irp op put avg
bilin_h_v \len \op h a5
bilin_h_v \len \op v a6
+ .irp name regular sharp smooth
+ .irp type h v
+ epel \len \op \name \type
+ .endr
+ .endr
.endr
.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index cfeaa06c0a..a45aea530d 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -113,6 +113,12 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
dsp->mc[idx1][FILTER_BILINEAR ][idx2][idxh][idxv] = \
ff_##type##_bilin_##sz##dir##_rvv; \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+ ff_##type##_8tap_smooth_##sz##dir##_rvv; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+ ff_##type##_8tap_regular_##sz##dir##_rvv; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
+ ff_##type##_8tap_sharp_##sz##dir##_rvv;
#define init_subpel2(idx, idxh, idxv, dir, type) \
init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
@@ -123,7 +129,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
#define init_subpel3(idx, type) \
init_subpel2(idx, 1, 0, h, type); \
+ if (flags & AV_CPU_FLAG_RVB_ADDR) { \
init_subpel2(idx, 0, 1, v, type); \
+ }
init_subpel3(0, put);
init_subpel3(1, avg);
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 8/9] lavc/vp9dsp: R-V V mc bilin hv
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
` (5 preceding siblings ...)
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 7/9] lavc/vp9dsp: R-V V mc tap " uk7b
@ 2024-05-07 7:36 ` uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b
7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07 7:36 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_avg_bilin_4hv_8bpp_c: 11.0
vp9_avg_bilin_4hv_8bpp_rvv_i64: 3.7
vp9_avg_bilin_8hv_8bpp_c: 38.7
vp9_avg_bilin_8hv_8bpp_rvv_i64: 7.2
vp9_avg_bilin_16hv_8bpp_c: 147.0
vp9_avg_bilin_16hv_8bpp_rvv_i64: 14.2
vp9_avg_bilin_32hv_8bpp_c: 574.5
vp9_avg_bilin_32hv_8bpp_rvv_i64: 42.7
vp9_avg_bilin_64hv_8bpp_c: 2311.5
vp9_avg_bilin_64hv_8bpp_rvv_i64: 201.7
vp9_put_bilin_4hv_8bpp_c: 10.0
vp9_put_bilin_4hv_8bpp_rvv_i64: 3.2
vp9_put_bilin_8hv_8bpp_c: 35.2
vp9_put_bilin_8hv_8bpp_rvv_i64: 6.5
vp9_put_bilin_16hv_8bpp_c: 133.7
vp9_put_bilin_16hv_8bpp_rvv_i64: 13.0
vp9_put_bilin_32hv_8bpp_c: 538.2
vp9_put_bilin_32hv_8bpp_rvv_i64: 39.7
vp9_put_bilin_64hv_8bpp_c: 2114.0
vp9_put_bilin_64hv_8bpp_rvv_i64: 153.7
---
libavcodec/riscv/vp9_mc_rvv.S | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 99605dfbb5..01404bbde5 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -104,6 +104,39 @@ func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x
endfunc
.endm
+.macro bilin_hv len op /* Emits ff_<op>_bilin_<len>hv_rvv: blends each bilin_load row with the previous one, weighted by a6, and stores the result at (a0). */
+func ff_\op\()_bilin_\len\()hv_rvv, zve32x
+.ifc \op,avg
+ csrwi vxrm, 0 /* select fixed-point rounding mode 0 for the vaaddu.vv below */
+.endif
+ vsetvlstatic8 \len t0
+ neg t1, a5 /* t1 = -a5; not referenced again in this macro, presumably consumed inside bilin_load - TODO confirm */
+ neg t2, a6 /* t2 = -a6, used as the weight of the previous row in the blend */
+ li t4, 8 /* rounding bias added before the shift right by 4 */
+ bilin_load v24, \len, put, h, a5 /* prime v24 with the first row; bilin_load is called with put even when op is avg */
+ add a2, a2, a3
+1:
+ addi a4, a4, -1 /* a4 is the remaining iteration count */
+ bilin_load v4, \len, put, h, a5 /* v4 = next row, same bilin_load arguments as the priming row */
+ vwmulu.vx v16, v4, a6 /* v16 = a6 * v4 (widening, unsigned) */
+ vwmaccsu.vx v16, t2, v24 /* v16 += (-a6) * v24, so v16 = a6 * (v4 - v24) */
+ vwadd.wx v16, v16, t4 /* + 8 */
+ vnsra.wi v16, v16, 4 /* narrowing arithmetic shift right by 4 */
+ vadd.vv v0, v16, v24 /* v0 = v24 + ((a6 * (v4 - v24) + 8) >> 4) */
+.ifc \op,avg
+ vle8.v v16, (a0) /* avg: load the bytes currently at (a0) ... */
+ vaaddu.vv v0, v0, v16 /* ... and take the averaging add with the new result */
+.endif
+ vse8.v v0, (a0)
+ vmv.v.v v24, v4 /* the current row becomes the previous row for the next iteration */
+ add a2, a2, a3 /* a2 += a3; presumably source pointer += source stride - TODO confirm */
+ add a0, a0, a1 /* a0 += a1: move the store address to the next output row */
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
const subpel_filters_regular
.byte 0, 0, 0, 128, 0, 0, 0, 0
.byte 0, 1, -5, 126, 8, -3, 1, 0
@@ -330,6 +363,7 @@ endfunc
.irp op put avg
bilin_h_v \len \op h a5
bilin_h_v \len \op v a6
+ bilin_hv \len \op
.irp name regular sharp smooth
.irp type h v
epel \len \op \name \type
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 9/9] lavc/vp9dsp: R-V V mc tap hv
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
` (6 preceding siblings ...)
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-05-07 7:36 ` uk7b
7 siblings, 0 replies; 10+ messages in thread
From: uk7b @ 2024-05-07 7:36 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_avg_8tap_smooth_4hv_8bpp_c: 32.0
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i64: 15.0
vp9_avg_8tap_smooth_8hv_8bpp_c: 114.5
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i64: 40.5
vp9_avg_8tap_smooth_16hv_8bpp_c: 338.7
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i64: 46.5
vp9_avg_8tap_smooth_32hv_8bpp_c: 1270.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i64: 134.0
vp9_avg_8tap_smooth_64hv_8bpp_c: 4923.5
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i64: 523.5
vp9_put_8tap_smooth_4hv_8bpp_c: 30.5
vp9_put_8tap_smooth_4hv_8bpp_rvv_i64: 14.2
vp9_put_8tap_smooth_8hv_8bpp_c: 91.7
vp9_put_8tap_smooth_8hv_8bpp_rvv_i64: 22.7
vp9_put_8tap_smooth_16hv_8bpp_c: 328.7
vp9_put_8tap_smooth_16hv_8bpp_rvv_i64: 45.0
vp9_put_8tap_smooth_32hv_8bpp_c: 1166.7
vp9_put_8tap_smooth_32hv_8bpp_rvv_i64: 131.0
vp9_put_8tap_smooth_64hv_8bpp_c: 4532.5
vp9_put_8tap_smooth_64hv_8bpp_rvv_i64: 512.5
---
libavcodec/riscv/vp9_mc_rvv.S | 94 ++++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp_init.c | 3 +-
2 files changed, 96 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 01404bbde5..0dcec94bbf 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -358,6 +358,99 @@ func ff_\op\()_8tap_\name\()_\len\()\type\()_rvv, zve32x
endfunc
.endm
+.macro epel_hv_once len name op /* Two-pass filter body: h-type epel_load fills an 8-row window in v0..v14, v-type epel_load combines the window into v30, which is stored at (a0). */
+ sub a2, a2, a3 /* these three subtractions rewind a2 by 3 * a3 before the window is primed */
+ sub a2, a2, a3
+ sub a2, a2, a3
+ .irp n 0 2 4 6 8 10 12 14
+ epel_load_inc v\n \len put \name h 1 t /* prime the window: eight h-type rows into v0, v2, ..., v14, advancing a2 after each */
+ .endr
+ addi a4, a4, -1 /* one output row is produced after the loop, so the loop runs a4 - 1 times; NOTE(review): a4 == 1 on entry would reach bnez with -1 - confirm callers never pass it */
+1:
+ addi a4, a4, -1
+ epel_load v30 \len \op \name v 0 s /* v-type pass with from_mem = 0 and regtype = s; result in v30 */
+ vse8.v v30, (a0)
+ vmv.v.v v0, v2 /* slide the window down by one row: v0 <- v2 <- v4 <- ... <- v14 */
+ vmv.v.v v2, v4
+ vmv.v.v v4, v6
+ vmv.v.v v6, v8
+ vmv.v.v v8, v10
+ vmv.v.v v10, v12
+ vmv.v.v v12, v14
+ epel_load v14 \len put \name h 1 t /* refill the newest window slot with the next h-type row */
+ add a2, a2, a3 /* a2 += a3; presumably source pointer += source stride - TODO confirm */
+ add a0, a0, a1 /* a0 += a1: move the store address to the next output row */
+ bnez a4, 1b
+ epel_load v30 \len \op \name v 0 s /* final output row, taken from the window left by the last iteration */
+ vse8.v v30, (a0)
+.endm
+
+.macro epel_hv op name len /* Emits ff_<op>_8tap_<name>_<len>hv_rvv: saves s0-s7, sets up both filters, runs epel_hv_once (twice for len 64), then restores s0-s7. */
+func ff_\op\()_8tap_\name\()_\len\()hv_rvv, zve32x
+#if __riscv_xlen == 64
+ addi sp, sp, -64 /* reserve 8 x 8 bytes and spill s0-s7, which are loaded with the v filter below */
+ .irp n 0,1,2,3,4,5,6,7
+ sd s\n, \n\()<<3(sp)
+ .endr
+#else
+ addi sp, sp, -32 /* RV32 variant: 8 x 4 bytes for s0-s7 */
+ .irp n 0,1,2,3,4,5,6,7
+ sw s\n, \n\()<<2(sp)
+ .endr
+#endif
+.ifc \len,64
+#if __riscv_xlen == 64
+ addi sp, sp, -48 /* len 64 only: also spill a0-a5 so they can be reloaded for the second 32-byte half */
+ .irp n 0,1,2,3,4,5
+ sd a\n, \n\()<<3(sp)
+ .endr
+#else
+ addi sp, sp, -24 /* NOTE(review): 24 is not a multiple of 16, so sp is not 16-byte aligned here on RV32 - confirm this is acceptable */
+ .irp n 0,1,2,3,4,5
+ sw a\n, \n\()<<2(sp)
+ .endr
+#endif
+.endif
+.ifc \op,avg
+ csrwi vxrm, 0 /* select fixed-point rounding mode 0 for the vaaddu.vv that epel_load issues in the avg case */
+.endif
+ epel_filter \name h t /* epel_filter is defined outside this view; presumably h taps go to the t registers and v taps to the s registers - TODO confirm */
+ epel_filter \name v s
+ vsetvlstatic8 \len a6 m2
+ epel_hv_once \len \name \op
+.ifc \len,64
+#if __riscv_xlen == 64
+ .irp n 0,1,2,3,4,5
+ ld a\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 48 /* a0-a5 are back at their entry values; release their spill area */
+#else
+ .irp n 0,1,2,3,4,5
+ lw a\n, \n\()<<2(sp)
+ .endr
+ addi sp, sp, 24
+#endif
+ addi a0, a0, 32 /* second half of a 64-wide block: both pointers start 32 bytes further on */
+ addi a2, a2, 32
+ epel_filter \name h t /* only the h filter is set up again; presumably the s registers still hold the v taps - TODO confirm */
+ epel_hv_once \len \name \op
+.endif
+#if __riscv_xlen == 64
+ .irp n 0,1,2,3,4,5,6,7
+ ld s\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 64 /* s0-s7 restored; release their spill area */
+#else
+ .irp n 0,1,2,3,4,5,6,7
+ lw s\n, \n\()<<2(sp)
+ .endr
+ addi sp, sp, 32
+#endif
+
+ ret
+endfunc
+.endm
+
.irp len 64, 32, 16, 8, 4
copy_avg \len
.irp op put avg
@@ -368,6 +461,7 @@ endfunc
.irp type h v
epel \len \op \name \type
.endr
+ epel_hv \op \name \len
.endr
.endr
.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index a45aea530d..554fcefa6e 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -131,7 +131,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
init_subpel2(idx, 1, 0, h, type); \
if (flags & AV_CPU_FLAG_RVB_ADDR) { \
init_subpel2(idx, 0, 1, v, type); \
- }
+ } \
+ init_subpel2(idx, 1, 1, hv, type)
init_subpel3(0, put);
init_subpel3(1, avg);
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor uk7b
@ 2024-05-07 16:08 ` Rémi Denis-Courmont
2024-05-07 18:42 ` flow gg
0 siblings, 1 reply; 10+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-07 16:08 UTC (permalink / raw)
To: ffmpeg-devel
Le tiistaina 7. toukokuuta 2024, 10.36.07 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908:
> vp9_hor_8x8_8bpp_c: 74.7
> vp9_hor_8x8_8bpp_rvv_i32: 35.7
> vp9_hor_16x16_8bpp_c: 175.5
> vp9_hor_16x16_8bpp_rvv_i32: 80.2
> vp9_hor_32x32_8bpp_c: 510.2
> vp9_hor_32x32_8bpp_rvv_i32: 264.0
> ---
> libavcodec/riscv/vp9_intra_rvv.S | 56 ++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp.h | 6 ++++
> libavcodec/riscv/vp9dsp_init.c | 3 ++
> 3 files changed, 65 insertions(+)
>
> diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> b/libavcodec/riscv/vp9_intra_rvv.S index db9774c263..dd9bc036e7 100644
> --- a/libavcodec/riscv/vp9_intra_rvv.S
> +++ b/libavcodec/riscv/vp9_intra_rvv.S
> @@ -113,3 +113,59 @@ func_dc dc_left 8 left 3 0 zve64x
> func_dc dc_top 32 top 5 1 zve32x
> func_dc dc_top 16 top 4 1 zve32x
> func_dc dc_top 8 top 3 0 zve64x
> +
> +func ff_h_32x32_rvv, zve32x
> + li t0, 32
> + addi a2, a2, 31
> + vsetvli zero, t0, e8, m2, ta, ma
> +
> + .rept 2
> + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> + lbu t1, (a2)
> + addi a2, a2, -1
> + vmv.v.x v\n, t1
> + .endr
> + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> + vse8.v v\n, (a0)
> + add a0, a0, a1
> + .endr
> + .endr
Do you gain much by unrolling all the way to 16x? Given that you have the
counter value already in t0, it should not make much difference to just unroll
2x or maybe 4x and then loop.
It might also be faster to use lhu or lwu and shift to reduce scalar loads, at
least if the vector is suitably aligned.
> +
> + ret
> +endfunc
> +
> +func ff_h_16x16_rvv, zve32x
> + addi a2, a2, 15
> + vsetivli zero, 16, e8, m1, ta, ma
> +
> + .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
> + lbu t1, (a2)
> + addi a2, a2, -1
> + vmv.v.x v\n, t1
> + .endr
> + .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
> + vse8.v v\n, (a0)
> + add a0, a0, a1
> + .endr
> + vse8.v v23, (a0)
> +
> + ret
> +endfunc
> +
> +func ff_h_8x8_rvv, zve32x
> + addi a2, a2, 7
> + vsetivli zero, 8, e8, mf2, ta, ma
> +
> + .irp n 8, 9, 10, 11, 12, 13, 14, 15
> + lbu t1, (a2)
> + addi a2, a2, -1
> + vmv.v.x v\n, t1
> + .endr
> + .irp n 8, 9, 10, 11, 12, 13, 14
> + vse8.v v\n, (a0)
> + add a0, a0, a1
> + .endr
> + vse8.v v15, (a0)
> +
> + ret
> +endfunc
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index b8ff282f8a..0ad961c7e0 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, const uint8_t *a);
> void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
> +void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> + const uint8_t *a);
> +void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> + const uint8_t *a);
> +void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> + const uint8_t *a);
>
> #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
> \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c index c10f8bbe41..7816b13fe0 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -59,6 +59,9 @@ static av_cold void
> vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
> dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
> dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
> dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv; +
> dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; +
> dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; +
> dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
> }
> #endif
> #endif
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor
2024-05-07 16:08 ` Rémi Denis-Courmont
@ 2024-05-07 18:42 ` flow gg
0 siblings, 0 replies; 10+ messages in thread
From: flow gg @ 2024-05-07 18:42 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> Do you gain much by unrolling all the way to 16x? Given that you have the
> counter value already in t0, it should not make much difference to just
> unroll 2x or maybe 4x and then loop.
I chose this simple method because I think the effect is about the same.
Do I need to change it?
> It might also be faster to use lhu or lwu and shift to reduce scalar
> loads, at least if the vector is suitably aligned.
I just tested ff_h_16x16_rvv; the lbu version is faster (lbu * 16 version:
80.2, lwu * 4 version: 117.2).
Rémi Denis-Courmont <remi@remlab.net> 于2024年5月8日周三 00:10写道:
> Le tiistaina 7. toukokuuta 2024, 10.36.07 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > C908:
> > vp9_hor_8x8_8bpp_c: 74.7
> > vp9_hor_8x8_8bpp_rvv_i32: 35.7
> > vp9_hor_16x16_8bpp_c: 175.5
> > vp9_hor_16x16_8bpp_rvv_i32: 80.2
> > vp9_hor_32x32_8bpp_c: 510.2
> > vp9_hor_32x32_8bpp_rvv_i32: 264.0
> > ---
> > libavcodec/riscv/vp9_intra_rvv.S | 56 ++++++++++++++++++++++++++++++++
> > libavcodec/riscv/vp9dsp.h | 6 ++++
> > libavcodec/riscv/vp9dsp_init.c | 3 ++
> > 3 files changed, 65 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> > b/libavcodec/riscv/vp9_intra_rvv.S index db9774c263..dd9bc036e7 100644
> > --- a/libavcodec/riscv/vp9_intra_rvv.S
> > +++ b/libavcodec/riscv/vp9_intra_rvv.S
> > @@ -113,3 +113,59 @@ func_dc dc_left 8 left 3 0 zve64x
> > func_dc dc_top 32 top 5 1 zve32x
> > func_dc dc_top 16 top 4 1 zve32x
> > func_dc dc_top 8 top 3 0 zve64x
> > +
> > +func ff_h_32x32_rvv, zve32x
> > + li t0, 32
> > + addi a2, a2, 31
> > + vsetvli zero, t0, e8, m2, ta, ma
> > +
> > + .rept 2
> > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> > + lbu t1, (a2)
> > + addi a2, a2, -1
> > + vmv.v.x v\n, t1
> > + .endr
> > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> > + vse8.v v\n, (a0)
> > + add a0, a0, a1
> > + .endr
> > + .endr
>
> Do you gain much by unrolling all the way to 16x? Given that you have the
> counter value already in t0, it should not make much difference to just
> unroll
> 2x or maybe 4x and then loop.
>
> It might also be faster to use lhu or lwu and shift to reduce scalar
> loads, at
> least if the vector is suitably aligned.
>
> > +
> > + ret
> > +endfunc
> > +
> > +func ff_h_16x16_rvv, zve32x
> > + addi a2, a2, 15
> > + vsetivli zero, 16, e8, m1, ta, ma
> > +
> > + .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
> 22, 23
> > + lbu t1, (a2)
> > + addi a2, a2, -1
> > + vmv.v.x v\n, t1
> > + .endr
> > + .irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
> > + vse8.v v\n, (a0)
> > + add a0, a0, a1
> > + .endr
> > + vse8.v v23, (a0)
> > +
> > + ret
> > +endfunc
> > +
> > +func ff_h_8x8_rvv, zve32x
> > + addi a2, a2, 7
> > + vsetivli zero, 8, e8, mf2, ta, ma
> > +
> > + .irp n 8, 9, 10, 11, 12, 13, 14, 15
> > + lbu t1, (a2)
> > + addi a2, a2, -1
> > + vmv.v.x v\n, t1
> > + .endr
> > + .irp n 8, 9, 10, 11, 12, 13, 14
> > + vse8.v v\n, (a0)
> > + add a0, a0, a1
> > + .endr
> > + vse8.v v15, (a0)
> > +
> > + ret
> > +endfunc
> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> > index b8ff282f8a..0ad961c7e0 100644
> > --- a/libavcodec/riscv/vp9dsp.h
> > +++ b/libavcodec/riscv/vp9dsp.h
> > @@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride,
> const
> > uint8_t *l, const uint8_t *a);
> > void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > const uint8_t *a);
> > +void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > + const uint8_t *a);
> > +void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > + const uint8_t *a);
> > +void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > + const uint8_t *a);
> >
> > #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>
> > \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> > dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c
> > b/libavcodec/riscv/vp9dsp_init.c index c10f8bbe41..7816b13fe0 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -59,6 +59,9 @@ static av_cold void
> > vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
> > dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
> > dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
> > dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv; +
>
> > dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; +
> > dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; +
> > dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv;
> > }
> > #endif
> > #endif
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2024-05-07 18:43 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20240507073613.2871668-1-uk7b@foxmail.com>
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 2/9] lavc/vp9dsp: R-V mc copy uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor uk7b
2024-05-07 16:08 ` Rémi Denis-Courmont
2024-05-07 18:42 ` flow gg
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 4/9] lavc/vp9dsp: R-V V ipred tm uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 5/9] lavc/vp9dsp: R-V V mc avg uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 6/9] lavc/vp9dsp: R-V V mc bilin h v uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 7/9] lavc/vp9dsp: R-V V mc tap " uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 8/9] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-05-07 7:36 ` [FFmpeg-devel] [PATCH v2 9/9] lavc/vp9dsp: R-V V mc tap hv uk7b
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git