* [FFmpeg-devel] [PATCH v3 2/5] lavc/vp9dsp: R-V V mc bilin h v
[not found] <20240529171540.911099-1-uk7b@foxmail.com>
@ 2024-05-29 17:15 ` uk7b
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv uk7b
` (2 subsequent siblings)
3 siblings, 0 replies; 7+ messages in thread
From: uk7b @ 2024-05-29 17:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_avg_bilin_4h_8bpp_c: 5.2
vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2
vp9_avg_bilin_4v_8bpp_c: 5.5
vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2
vp9_avg_bilin_8h_8bpp_c: 20.0
vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5
vp9_avg_bilin_8v_8bpp_c: 21.0
vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2
vp9_avg_bilin_16h_8bpp_c: 78.2
vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0
vp9_avg_bilin_16v_8bpp_c: 82.0
vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0
vp9_avg_bilin_32h_8bpp_c: 325.5
vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2
vp9_avg_bilin_32v_8bpp_c: 326.2
vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2
vp9_avg_bilin_64h_8bpp_c: 1265.7
vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5
vp9_avg_bilin_64v_8bpp_c: 1317.0
vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2
vp9_put_bilin_4h_8bpp_c: 4.5
vp9_put_bilin_4h_8bpp_rvv_i64: 1.7
vp9_put_bilin_4v_8bpp_c: 4.7
vp9_put_bilin_4v_8bpp_rvv_i64: 1.7
vp9_put_bilin_8h_8bpp_c: 17.0
vp9_put_bilin_8h_8bpp_rvv_i64: 3.5
vp9_put_bilin_8v_8bpp_c: 18.0
vp9_put_bilin_8v_8bpp_rvv_i64: 3.5
vp9_put_bilin_16h_8bpp_c: 65.2
vp9_put_bilin_16h_8bpp_rvv_i64: 7.5
vp9_put_bilin_16v_8bpp_c: 85.7
vp9_put_bilin_16v_8bpp_rvv_i64: 7.5
vp9_put_bilin_32h_8bpp_c: 257.5
vp9_put_bilin_32h_8bpp_rvv_i64: 23.5
vp9_put_bilin_32v_8bpp_c: 274.5
vp9_put_bilin_32v_8bpp_rvv_i64: 23.5
vp9_put_bilin_64h_8bpp_c: 1040.5
vp9_put_bilin_64h_8bpp_rvv_i64: 82.5
vp9_put_bilin_64v_8bpp_c: 1108.7
vp9_put_bilin_64v_8bpp_rvv_i64: 82.2
---
libavcodec/riscv/vp9_mc_rvv.S | 60 ++++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp.h | 12 +++----
libavcodec/riscv/vp9dsp_init.c | 21 ++++++++++++
3 files changed, 87 insertions(+), 6 deletions(-)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 7cb38ec94a..9611aba0ed 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -53,6 +53,66 @@ func ff_vp9_avg\len\()_rvv, zve32x
endfunc
.endm
+.macro bilin_load dst, op, type, mn
+.ifc \type,v
+ add t5, a2, a3
+.else
+ addi t5, a2, 1
+.endif
+ vle8.v v8, (a2)
+ vle8.v v0, (t5)
+ vwmulu.vx v16, v0, \mn
+ vwmaccsu.vx v16, t1, v8
+ vwadd.wx v16, v16, t4
+ vnsra.wi v16, v16, 4
+ vadd.vv \dst, v16, v8
+.ifc \op,avg
+ vle8.v v16, (a0)
+ vaaddu.vv \dst, \dst, v16
+.endif
+.endm
+
+.macro bilin_h_v op, type, mn
+func ff_\op\()_vp9_bilin_4\type\()_rvv, zve32x
+ vsetvlstatic8 4, t0, 64
+.Lbilin_\type\op:
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+ li t4, 8
+ neg t1, \mn
+1:
+ addi a4, a4, -1
+ bilin_load v0, \op, \type, \mn
+ vse8.v v0, (a0)
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
.endr
+
+bilin_h_v put, h, a5
+bilin_h_v avg, h, a5
+bilin_h_v put, v, a6
+bilin_h_v avg, v, a6
+
+.macro func_bilin_h_v len, op, type
+func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
+ vsetvlstatic8 \len, t0, 64
+ j .Lbilin_\type\()\op
+endfunc
+.endm
+
+.irp len, 64, 32, 16, 8
+ .irp op, put, avg
+ .irp type, h, v
+ func_bilin_h_v \len, \op, \type
+ .endr
+ .endr
+.endr
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index ff8431591c..8fb326dae0 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -113,27 +113,27 @@ void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
int h, int mx, int my);
#define VP9_BILINEAR_RISCV_RVV_FUNC(SIZE) \
-void ff_put_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_put_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_put_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_bilin_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_bilin_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_bilin_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 454dcd963f..9606d8545f 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -63,6 +63,27 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
init_fpel(3, 8);
init_fpel(4, 4);
+ dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_64v_rvv;
+ dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_64h_rvv;
+ dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_64v_rvv;
+ dsp->mc[0][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_64h_rvv;
+ dsp->mc[1][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_32v_rvv;
+ dsp->mc[1][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_32h_rvv;
+ dsp->mc[1][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_32v_rvv;
+ dsp->mc[1][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_32h_rvv;
+ dsp->mc[2][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_16v_rvv;
+ dsp->mc[2][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_16h_rvv;
+ dsp->mc[2][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_16v_rvv;
+ dsp->mc[2][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_16h_rvv;
+ dsp->mc[3][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_8v_rvv;
+ dsp->mc[3][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_8h_rvv;
+ dsp->mc[3][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_8v_rvv;
+ dsp->mc[3][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_8h_rvv;
+ dsp->mc[4][FILTER_BILINEAR ][0][0][1] = ff_put_vp9_bilin_4v_rvv;
+ dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
+ dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
+ dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+
#undef init_fpel
}
#endif
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv
[not found] <20240529171540.911099-1-uk7b@foxmail.com>
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b
@ 2024-05-29 17:15 ` uk7b
2024-05-29 20:07 ` Rémi Denis-Courmont
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b
3 siblings, 1 reply; 7+ messages in thread
From: uk7b @ 2024-05-29 17:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908:
vp9_avg_bilin_4hv_8bpp_c: 11.0
vp9_avg_bilin_4hv_8bpp_rvv_i64: 3.7
vp9_avg_bilin_8hv_8bpp_c: 38.7
vp9_avg_bilin_8hv_8bpp_rvv_i64: 7.2
vp9_avg_bilin_16hv_8bpp_c: 147.0
vp9_avg_bilin_16hv_8bpp_rvv_i64: 14.2
vp9_avg_bilin_32hv_8bpp_c: 574.5
vp9_avg_bilin_32hv_8bpp_rvv_i64: 42.7
vp9_avg_bilin_64hv_8bpp_c: 2311.5
vp9_avg_bilin_64hv_8bpp_rvv_i64: 201.7
vp9_put_bilin_4hv_8bpp_c: 10.0
vp9_put_bilin_4hv_8bpp_rvv_i64: 3.2
vp9_put_bilin_8hv_8bpp_c: 35.2
vp9_put_bilin_8hv_8bpp_rvv_i64: 6.5
vp9_put_bilin_16hv_8bpp_c: 133.7
vp9_put_bilin_16hv_8bpp_rvv_i64: 13.0
vp9_put_bilin_32hv_8bpp_c: 538.2
vp9_put_bilin_32hv_8bpp_rvv_i64: 39.7
vp9_put_bilin_64hv_8bpp_c: 2114.0
vp9_put_bilin_64hv_8bpp_rvv_i64: 153.7
---
libavcodec/riscv/vp9_mc_rvv.S | 38 +++++++++++++++++++++++++++++++++-
libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
2 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 9611aba0ed..990271736b 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -93,6 +93,40 @@ func ff_\op\()_vp9_bilin_4\type\()_rvv, zve32x
endfunc
.endm
+.macro bilin_hv op
+func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
+ vsetvlstatic8 4, t0, 64
+.Lbilin_hv\op:
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+ neg t1, a5
+ neg t2, a6
+ li t4, 8
+ bilin_load v24, put, h, a5
+ add a2, a2, a3
+1:
+ addi a4, a4, -1
+ bilin_load v4, put, h, a5
+ vwmulu.vx v16, v4, a6
+ vwmaccsu.vx v16, t2, v24
+ vwadd.wx v16, v16, t4
+ vnsra.wi v16, v16, 4
+ vadd.vv v0, v16, v24
+.ifc \op,avg
+ vle8.v v16, (a0)
+ vaaddu.vv v0, v0, v16
+.endif
+ vse8.v v0, (a0)
+ vmv.v.v v24, v4
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
.endr
@@ -101,6 +135,8 @@ bilin_h_v put, h, a5
bilin_h_v avg, h, a5
bilin_h_v put, v, a6
bilin_h_v avg, v, a6
+bilin_hv put
+bilin_hv avg
.macro func_bilin_h_v len, op, type
func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
@@ -111,7 +147,7 @@ endfunc
.irp len, 64, 32, 16, 8
.irp op, put, avg
- .irp type, h, v
+ .irp type, h, v, hv
func_bilin_h_v \len, \op, \type
.endr
.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 9606d8545f..b3700dfb08 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+ dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
+ dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
+ dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
+ dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
+ dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
+ dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
+ dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
+ dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
+ dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
+ dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
#undef init_fpel
}
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v
[not found] <20240529171540.911099-1-uk7b@foxmail.com>
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-05-29 17:15 ` uk7b
2024-05-29 17:19 ` flow gg
2024-05-29 20:29 ` Rémi Denis-Courmont
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b
3 siblings, 2 replies; 7+ messages in thread
From: uk7b @ 2024-05-29 17:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2
vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2
vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5
vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7
vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0
vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0
vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0
vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2
vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0
vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0
vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7
vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0
vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7
vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7
vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5
vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7
vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2
vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2
vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2
vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7
vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
---
libavcodec/riscv/vp9_mc_rvv.S | 204 +++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp.h | 72 ++++++++----
libavcodec/riscv/vp9dsp_init.c | 37 +++++-
3 files changed, 288 insertions(+), 25 deletions(-)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 990271736b..53dd833dac 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
.endif
.endm
+.macro vsetvlstatic16 len
+.ifc \len,4
+ vsetvli zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+ vsetvli zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+ vsetvli zero, zero, e16, m2, ta, ma
+.else
+ vsetvli zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
.macro copy_avg len
func ff_vp9_avg\len\()_rvv, zve32x
csrwi vxrm, 0
@@ -127,8 +139,200 @@ func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
endfunc
.endm
+.macro epel_filter name, type, regtype
+ lla \regtype\()2, ff_vp9_subpel_filters
+
+.ifc \name,regular
+ addi \regtype\()2, \regtype\()2, 16*8*2
+.endif
+.ifc \name,sharp
+ addi \regtype\()2, \regtype\()2, 16*8*2*2
+.endif
+
+.ifc \type,v
+ slli \regtype\()0, a6, 4
+.else
+ slli \regtype\()0, a5, 4
+.endif
+ add \regtype\()0, \regtype\()0, \regtype\()2
+
+ lh \regtype\()1, 2(\regtype\()0)
+ lh \regtype\()2, 4(\regtype\()0)
+ lh \regtype\()3, 6(\regtype\()0)
+ lh \regtype\()4, 8(\regtype\()0)
+ lh \regtype\()5, 10(\regtype\()0)
+ lh \regtype\()6, 12(\regtype\()0)
+
+.ifc \regtype,t
+ lh a7, 14(\regtype\()0)
+.else
+ lh s7, 14(\regtype\()0)
+.endif
+ lh \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst, len, op, name, type, from_mem, regtype
+ li a5, 64
+.ifc \from_mem, 1
+ vle8.v v22, (a2)
+.ifc \type,v
+ sub a2, a2, a3
+ vle8.v v20, (a2)
+ sh1add a2, a3, a2
+ vle8.v v24, (a2)
+ add a2, a2, a3
+ vle8.v v26, (a2)
+ add a2, a2, a3
+ vle8.v v28, (a2)
+ add a2, a2, a3
+ vle8.v v30, (a2)
+.else
+ addi a2, a2, -1
+ vle8.v v20, (a2)
+ addi a2, a2, 2
+ vle8.v v24, (a2)
+ addi a2, a2, 1
+ vle8.v v26, (a2)
+ addi a2, a2, 1
+ vle8.v v28, (a2)
+ addi a2, a2, 1
+ vle8.v v30, (a2)
+.endif
+
+.ifc \name,smooth
+ vwmulu.vx v16, v24, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, v20
+ vwmaccu.vx v16, \regtype\()5, v26
+ vwmaccsu.vx v16, \regtype\()6, v28
+.else
+ vwmulu.vx v16, v28, \regtype\()6
+ vwmaccsu.vx v16, \regtype\()2, v20
+ vwmaccsu.vx v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+ vwmaccsu.vx v16, a7, v30
+.else
+ vwmaccsu.vx v16, s7, v30
+.endif
+
+.ifc \type,v
+ .rept 6
+ sub a2, a2, a3
+ .endr
+ vle8.v v28, (a2)
+ sub a2, a2, a3
+ vle8.v v26, (a2)
+ sh1add a2, a3, a2
+ add a2, a2, a3
+.else
+ addi a2, a2, -6
+ vle8.v v28, (a2)
+ addi a2, a2, -1
+ vle8.v v26, (a2)
+ addi a2, a2, 3
+.endif
+
+.ifc \name,smooth
+ vwmaccsu.vx v16, \regtype\()1, v28
+.else
+ vwmaccu.vx v16, \regtype\()1, v28
+ vwmulu.vx v28, v24, \regtype\()4
+.endif
+ vwmaccsu.vx v16, \regtype\()0, v26
+ vwmulu.vx v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+ vwmulu.vx v16, v8, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, v4
+ vwmaccu.vx v16, \regtype\()5, v10
+ vwmaccsu.vx v16, \regtype\()6, v12
+ vwmaccsu.vx v16, \regtype\()1, v2
+.else
+ vwmulu.vx v16, v2, \regtype\()1
+ vwmaccu.vx v16, \regtype\()6, v12
+ vwmaccsu.vx v16, \regtype\()5, v10
+ vwmaccsu.vx v16, \regtype\()2, v4
+ vwmulu.vx v28, v8, \regtype\()4
+.endif
+ vwmaccsu.vx v16, \regtype\()0, v0
+ vwmulu.vx v20, v6, \regtype\()3
+
+.ifc \regtype,t
+ vwmaccsu.vx v16, a7, v14
+.else
+ vwmaccsu.vx v16, s7, v14
+.endif
+
+.endif
+ vwadd.wx v16, v16, a5
+ vsetvlstatic16 \len
+
+.ifc \name,smooth
+ vwadd.vv v24, v16, v20
+.else
+ vwadd.vv v24, v16, v28
+ vwadd.wv v24, v24, v20
+.endif
+ vnsra.wi v24, v24, 7
+ vmax.vx v24, v24, zero
+ vsetvlstatic8 \len, zero, 32, m2
+
+ vnclipu.wi \dst, v24, 0
+.ifc \op,avg
+ vle8.v v24, (a0)
+ vaaddu.vv \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
+ epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
+ add a2, a2, a3
+.endm
+
+.macro epel len, op, name, type, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+ epel_filter \name, \type, t
+.if \vlen < 256
+ vsetvlstatic8 \len, a5, 32, m2
+.else
+ vsetvlstatic8 \len, a5, 64, m2
+.endif
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+
+1:
+ addi a4, a4, -1
+ epel_load v30, \len, \op, \name, \type, 1, t
+ vse8.v v30, (a0)
+.if \len == 64 && \vlen < 256
+ addi a0, a0, 32
+ addi a2, a2, 32
+ epel_load v30, \len, \op, \name, \type, 1, t
+ vse8.v v30, (a0)
+ addi a0, a0, -32
+ addi a2, a2, -32
+.endif
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+
+ ret
+endfunc
+.endm
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
+ .irp op, put, avg
+ .irp name, regular, sharp, smooth
+ .irp type, h, v
+ epel \len, \op, \name, \type, 128
+ epel \len, \op, \name, \type, 256
+ .endr
+ .endr
+ .endr
.endr
bilin_h_v put, h, a5
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen) \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my); \
\
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst, \
+ ptrdiff_t dststride, \
const uint8_t *src, \
ptrdiff_t srcstride, \
int h, int mx, int my);
@@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \
const uint8_t *src, ptrdiff_t srcstride, \
int h, int mx, int my);
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
VP9_BILINEAR_RISCV_RVV_FUNC(64);
VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index b3700dfb08..5f759e6bc8 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
# endif
#if HAVE_RVV
- if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+ if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+ if (ff_rv_vlen_least(128)) {
#define init_fpel(idx1, sz) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
@@ -95,6 +96,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
#undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
+ ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
+ init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
+ init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
+ init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
+ init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
+ init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
+
+ init_subpel2(0, 1, 0, h, put, 128);
+ init_subpel2(1, 1, 0, h, avg, 128);
+
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ init_subpel2(0, 0, 1, v, put, 128);
+ init_subpel2(1, 0, 1, v, avg, 128);
+ }
+
+ }
+ if (ff_rv_vlen_least(256)) {
+ init_subpel2(0, 1, 0, h, put, 256);
+ init_subpel2(1, 1, 0, h, avg, 256);
+
+ if (flags & AV_CPU_FLAG_RVB_ADDR) {
+ init_subpel2(0, 0, 1, v, put, 256);
+ init_subpel2(1, 0, 1, v, avg, 256);
+ }
+ }
}
#endif
#endif
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] [PATCH v3 5/5] lavc/vp9dsp: R-V V mc tap hv
[not found] <20240529171540.911099-1-uk7b@foxmail.com>
` (2 preceding siblings ...)
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-05-29 17:15 ` uk7b
3 siblings, 0 replies; 7+ messages in thread
From: uk7b @ 2024-05-29 17:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: sunyuechi
From: sunyuechi <sunyuechi@iscas.ac.cn>
C908 X60
vp9_avg_8tap_smooth_4hv_8bpp_c : 32.0 28.2
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32 : 15.0 13.2
vp9_avg_8tap_smooth_8hv_8bpp_c : 98.0 86.2
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32 : 23.7 21.0
vp9_avg_8tap_smooth_16hv_8bpp_c : 355.5 297.0
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32 : 62.7 41.2
vp9_avg_8tap_smooth_32hv_8bpp_c : 1273.0 1099.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32 : 133.7 119.2
vp9_avg_8tap_smooth_64hv_8bpp_c : 4933.0 4240.5
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32 : 506.7 227.0
vp9_put_8tap_smooth_4hv_8bpp_c : 30.2 27.0
vp9_put_8tap_smooth_4hv_8bpp_rvv_i32 : 14.5 12.7
vp9_put_8tap_smooth_8hv_8bpp_c : 91.2 81.2
vp9_put_8tap_smooth_8hv_8bpp_rvv_i32 : 22.7 20.2
vp9_put_8tap_smooth_16hv_8bpp_c : 329.2 277.7
vp9_put_8tap_smooth_16hv_8bpp_rvv_i32 : 44.7 40.0
vp9_put_8tap_smooth_32hv_8bpp_c : 1183.7 1022.7
vp9_put_8tap_smooth_32hv_8bpp_rvv_i32 : 130.7 116.5
vp9_put_8tap_smooth_64hv_8bpp_c : 4502.7 3954.5
vp9_put_8tap_smooth_64hv_8bpp_rvv_i32 : 496.0 224.7
---
libavcodec/riscv/vp9_mc_rvv.S | 75 ++++++++++++++++++++++++++++++++++
libavcodec/riscv/vp9dsp_init.c | 8 ++++
2 files changed, 83 insertions(+)
diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 53dd833dac..fed698c802 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -323,6 +323,77 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
endfunc
.endm
+#if __riscv_xlen == 64
+.macro epel_hv_once len, name, op
+ sub a2, a2, a3
+ sub a2, a2, a3
+ sub a2, a2, a3
+ .irp n,0,2,4,6,8,10,12,14
+ epel_load_inc v\n, \len, put, \name, h, 1, t
+ .endr
+ addi a4, a4, -1
+1:
+ addi a4, a4, -1
+ epel_load v30, \len, \op, \name, v, 0, s
+ vse8.v v30, (a0)
+ vmv.v.v v0, v2
+ vmv.v.v v2, v4
+ vmv.v.v v4, v6
+ vmv.v.v v6, v8
+ vmv.v.v v8, v10
+ vmv.v.v v10, v12
+ vmv.v.v v12, v14
+ epel_load v14, \len, put, \name, h, 1, t
+ add a2, a2, a3
+ add a0, a0, a1
+ bnez a4, 1b
+ epel_load v30, \len, \op, \name, v, 0, s
+ vse8.v v30, (a0)
+.endm
+
+.macro epel_hv op, name, len, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
+ addi sp, sp, -64
+ .irp n,0,1,2,3,4,5,6,7
+ sd s\n, \n\()<<3(sp)
+ .endr
+.if \len == 64 && \vlen < 256
+ addi sp, sp, -48
+ .irp n,0,1,2,3,4,5
+ sd a\n, \n\()<<3(sp)
+ .endr
+.endif
+.ifc \op,avg
+ csrwi vxrm, 0
+.endif
+ epel_filter \name, h, t
+ epel_filter \name, v, s
+.if \vlen < 256
+ vsetvlstatic8 \len, a6, 32, m2
+.else
+ vsetvlstatic8 \len, a6, 64, m2
+.endif
+ epel_hv_once \len, \name, \op
+.if \len == 64 && \vlen < 256
+ .irp n,0,1,2,3,4,5
+ ld a\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 48
+ addi a0, a0, 32
+ addi a2, a2, 32
+ epel_filter \name, h, t
+ epel_hv_once \len, \name, \op
+.endif
+ .irp n,0,1,2,3,4,5,6,7
+ ld s\n, \n\()<<3(sp)
+ .endr
+ addi sp, sp, 64
+
+ ret
+endfunc
+.endm
+#endif
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
.irp op, put, avg
@@ -331,6 +402,10 @@ endfunc
epel \len, \op, \name, \type, 128
epel \len, \op, \name, \type, 256
.endr
+ #if __riscv_xlen == 64
+ epel_hv \op, \name, \len, 128
+ epel_hv \op, \name, \len, 256
+ #endif
.endr
.endr
.endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 5f759e6bc8..6b39ad8ee0 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -118,6 +118,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
init_subpel2(0, 0, 1, v, put, 128);
init_subpel2(1, 0, 1, v, avg, 128);
+# if __riscv_xlen == 64
+ init_subpel2(0, 1, 1, hv, put, 128);
+ init_subpel2(1, 1, 1, hv, avg, 128);
+# endif
}
}
@@ -128,6 +132,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
init_subpel2(0, 0, 1, v, put, 256);
init_subpel2(1, 0, 1, v, avg, 256);
+# if __riscv_xlen == 64
+ init_subpel2(0, 1, 1, hv, put, 256);
+ init_subpel2(1, 1, 1, hv, avg, 256);
+# endif
}
}
}
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-05-29 17:19 ` flow gg
2024-05-29 20:29 ` Rémi Denis-Courmont
1 sibling, 0 replies; 7+ messages in thread
From: flow gg @ 2024-05-29 17:19 UTC (permalink / raw)
To: FFmpeg development discussions and patches
A portion has been modified according to the previous review, but there are
still some parts that haven't been updated
> Similarly, it
> should be possible to share most of the horizontal and vertical code
(maybe
> also for bilinear. not just EPel) with separate load/store then inner
> procedures. The H.263 loop filter already does that though with almost no
> overhead, though
> H.263 is obviously simpler than VP9.
>
> A French philosopher famously said that Perfect is the ennemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.
Here, bilin is modified with reference to your vp8 modification method, but
there are some issues with epel. I want to share most of the horizontal and
vertical code like h263, but because there are different types
(op/name/len), such changes seem hard. Trying to make similar modifications
for bilin also seems somewhat hard, so maybe I will leave it for future
optimization :'(
> It should be possible to spare one ADDI by using just AUIPC here, and
folding
> the immediate offset into the LB's below (see also H.263 loop filter).
I'm not sure where the problem lies: it works for smooth, but for
sharp and regular it gives this error:
dangerous relocation: %pcrel_lo overflow with an addend, the value of
%pcrel_hi is 0xa5000 without any addend, but may be 0xa6000 after adding
the %pcrel_lo addend
<uk7b@foxmail.com> 于2024年5月30日周四 01:16写道:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2
> vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2
> vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5
> vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7
> vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0
> vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0
> vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0
> vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2
> vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0
> vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0
> vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0
> vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5
> vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
> vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2
> vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2
> vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2
> vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 204 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp.h | 72 ++++++++----
> libavcodec/riscv/vp9dsp_init.c | 37 +++++-
> 3 files changed, 288 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 990271736b..53dd833dac 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
> .endif
> .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> + vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> + vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> + vsetvli zero, zero, e16, m2, ta, ma
> +.else
> + vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
> .macro copy_avg len
> func ff_vp9_avg\len\()_rvv, zve32x
> csrwi vxrm, 0
> @@ -127,8 +139,200 @@ func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
> endfunc
> .endm
>
> +.macro epel_filter name, type, regtype
> + lla \regtype\()2, ff_vp9_subpel_filters
> +
> +.ifc \name,regular
> + addi \regtype\()2, \regtype\()2, 16*8*2
> +.endif
> +.ifc \name,sharp
> + addi \regtype\()2, \regtype\()2, 16*8*2*2
> +.endif
> +
> +.ifc \type,v
> + slli \regtype\()0, a6, 4
> +.else
> + slli \regtype\()0, a5, 4
> +.endif
> + add \regtype\()0, \regtype\()0, \regtype\()2
> +
> + lh \regtype\()1, 2(\regtype\()0)
> + lh \regtype\()2, 4(\regtype\()0)
> + lh \regtype\()3, 6(\regtype\()0)
> + lh \regtype\()4, 8(\regtype\()0)
> + lh \regtype\()5, 10(\regtype\()0)
> + lh \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> + lh a7, 14(\regtype\()0)
> +.else
> + lh s7, 14(\regtype\()0)
> +.endif
> + lh \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> + li a5, 64
> +.ifc \from_mem, 1
> + vle8.v v22, (a2)
> +.ifc \type,v
> + sub a2, a2, a3
> + vle8.v v20, (a2)
> + sh1add a2, a3, a2
> + vle8.v v24, (a2)
> + add a2, a2, a3
> + vle8.v v26, (a2)
> + add a2, a2, a3
> + vle8.v v28, (a2)
> + add a2, a2, a3
> + vle8.v v30, (a2)
> +.else
> + addi a2, a2, -1
> + vle8.v v20, (a2)
> + addi a2, a2, 2
> + vle8.v v24, (a2)
> + addi a2, a2, 1
> + vle8.v v26, (a2)
> + addi a2, a2, 1
> + vle8.v v28, (a2)
> + addi a2, a2, 1
> + vle8.v v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> + vwmulu.vx v16, v24, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v20
> + vwmaccu.vx v16, \regtype\()5, v26
> + vwmaccsu.vx v16, \regtype\()6, v28
> +.else
> + vwmulu.vx v16, v28, \regtype\()6
> + vwmaccsu.vx v16, \regtype\()2, v20
> + vwmaccsu.vx v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v30
> +.else
> + vwmaccsu.vx v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> + .rept 6
> + sub a2, a2, a3
> + .endr
> + vle8.v v28, (a2)
> + sub a2, a2, a3
> + vle8.v v26, (a2)
> + sh1add a2, a3, a2
> + add a2, a2, a3
> +.else
> + addi a2, a2, -6
> + vle8.v v28, (a2)
> + addi a2, a2, -1
> + vle8.v v26, (a2)
> + addi a2, a2, 3
> +.endif
> +
> +.ifc \name,smooth
> + vwmaccsu.vx v16, \regtype\()1, v28
> +.else
> + vwmaccu.vx v16, \regtype\()1, v28
> + vwmulu.vx v28, v24, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v26
> + vwmulu.vx v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> + vwmulu.vx v16, v8, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v4
> + vwmaccu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()1, v2
> +.else
> + vwmulu.vx v16, v2, \regtype\()1
> + vwmaccu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()2, v4
> + vwmulu.vx v28, v8, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v0
> + vwmulu.vx v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v14
> +.else
> + vwmaccsu.vx v16, s7, v14
> +.endif
> +
> +.endif
> + vwadd.wx v16, v16, a5
> + vsetvlstatic16 \len
> +
> +.ifc \name,smooth
> + vwadd.vv v24, v16, v20
> +.else
> + vwadd.vv v24, v16, v28
> + vwadd.wv v24, v24, v20
> +.endif
> + vnsra.wi v24, v24, 7
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \len, zero, 32, m2
> +
> + vnclipu.wi \dst, v24, 0
> +.ifc \op,avg
> + vle8.v v24, (a0)
> + vaaddu.vv \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> + epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
> + add a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> + epel_filter \name, \type, t
> +.if \vlen < 256
> + vsetvlstatic8 \len, a5, 32, m2
> +.else
> + vsetvlstatic8 \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> +
> +1:
> + addi a4, a4, -1
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> +.if \len == 64 && \vlen < 256
> + addi a0, a0, 32
> + addi a2, a2, 32
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> + addi a0, a0, -32
> + addi a2, a2, -32
> +.endif
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> + .irp op, put, avg
> + .irp name, regular, sharp, smooth
> + .irp type, h, v
> + epel \len, \op, \name, \type, 128
> + epel \len, \op, \name, \type, 256
> + .endr
> + .endr
> + .endr
> .endr
>
> bilin_h_v put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const uint8_t *l,
> void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
> \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)
> \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> \
>
> \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \
> + ptrdiff_t dststride,
> \
> const uint8_t *src,
> \
> ptrdiff_t srcstride,
> \
> int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride, \
> const uint8_t *src, ptrdiff_t srcstride, \
> int h, int mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
> VP9_BILINEAR_RISCV_RVV_FUNC(64);
> VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..5f759e6bc8 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
> # endif
>
> #if HAVE_RVV
> - if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128)) {
> + if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> + if (ff_rv_vlen_least(128)) {
>
> #define init_fpel(idx1, sz) \
> dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
> @@ -95,6 +96,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
> dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
> #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
> + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
> + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
> + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
> + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
> + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
> + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
> +
> + init_subpel2(0, 1, 0, h, put, 128);
> + init_subpel2(1, 1, 0, h, avg, 128);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 128);
> + init_subpel2(1, 0, 1, v, avg, 128);
> + }
> +
> + }
> + if (ff_rv_vlen_least(256)) {
> + init_subpel2(0, 1, 0, h, put, 256);
> + init_subpel2(1, 1, 0, h, avg, 256);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 256);
> + init_subpel2(1, 0, 1, v, avg, 256);
> + }
> + }
> }
> #endif
> #endif
> --
> 2.45.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-05-29 20:07 ` Rémi Denis-Courmont
0 siblings, 0 replies; 7+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-29 20:07 UTC (permalink / raw)
To: ffmpeg-devel
Le keskiviikkona 29. toukokuuta 2024, 20.15.38 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908:
> vp9_avg_bilin_4hv_8bpp_c: 11.0
> vp9_avg_bilin_4hv_8bpp_rvv_i64: 3.7
> vp9_avg_bilin_8hv_8bpp_c: 38.7
> vp9_avg_bilin_8hv_8bpp_rvv_i64: 7.2
> vp9_avg_bilin_16hv_8bpp_c: 147.0
> vp9_avg_bilin_16hv_8bpp_rvv_i64: 14.2
> vp9_avg_bilin_32hv_8bpp_c: 574.5
> vp9_avg_bilin_32hv_8bpp_rvv_i64: 42.7
> vp9_avg_bilin_64hv_8bpp_c: 2311.5
> vp9_avg_bilin_64hv_8bpp_rvv_i64: 201.7
> vp9_put_bilin_4hv_8bpp_c: 10.0
> vp9_put_bilin_4hv_8bpp_rvv_i64: 3.2
> vp9_put_bilin_8hv_8bpp_c: 35.2
> vp9_put_bilin_8hv_8bpp_rvv_i64: 6.5
> vp9_put_bilin_16hv_8bpp_c: 133.7
> vp9_put_bilin_16hv_8bpp_rvv_i64: 13.0
> vp9_put_bilin_32hv_8bpp_c: 538.2
> vp9_put_bilin_32hv_8bpp_rvv_i64: 39.7
> vp9_put_bilin_64hv_8bpp_c: 2114.0
> vp9_put_bilin_64hv_8bpp_rvv_i64: 153.7
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 38 +++++++++++++++++++++++++++++++++-
> libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
> 2 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 9611aba0ed..990271736b 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -93,6 +93,40 @@ func ff_\op\()_vp9_bilin_4\type\()_rvv, zve32x
> endfunc
> .endm
>
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
> + vsetvlstatic8 4, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> + neg t1, a5
> + neg t2, a6
> + li t4, 8
> + bilin_load v24, put, h, a5
> + add a2, a2, a3
> +1:
> + addi a4, a4, -1
> + bilin_load v4, put, h, a5
> + vwmulu.vx v16, v4, a6
> + vwmaccsu.vx v16, t2, v24
> + vwadd.wx v16, v16, t4
> + vnsra.wi v16, v16, 4
> + vadd.vv v0, v16, v24
> +.ifc \op,avg
> + vle8.v v16, (a0)
> + vaaddu.vv v0, v0, v16
> +.endif
> + vse8.v v0, (a0)
> + vmv.v.v v24, v4
Copying vectors is rarely justified - mostly only before destructive
instructions such as FMA. If a4 is even (?), this should be faster by
unrolling to two rows per iteration.
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> .endr
> @@ -101,6 +135,8 @@ bilin_h_v put, h, a5
> bilin_h_v avg, h, a5
> bilin_h_v put, v, a6
> bilin_h_v avg, v, a6
> +bilin_hv put
> +bilin_hv avg
>
> .macro func_bilin_h_v len, op, type
> func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -111,7 +147,7 @@ endfunc
>
> .irp len, 64, 32, 16, 8
> .irp op, put, avg
> - .irp type, h, v
> + .irp type, h, v, hv
> func_bilin_h_v \len, \op, \type
> .endr
> .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..b3700dfb08 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][0][1][0] =
> ff_put_vp9_bilin_4h_rvv; dsp->mc[4][FILTER_BILINEAR ][1][0][1] =
> ff_avg_vp9_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] =
> ff_avg_vp9_bilin_4h_rvv; + dsp->mc[0][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_64hv_rvv; + dsp->mc[0][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_64hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_32hv_rvv; + dsp->mc[1][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_32hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_16hv_rvv; + dsp->mc[2][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_16hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_8hv_rvv; + dsp->mc[3][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_8hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][0][1][1] =
> ff_put_vp9_bilin_4hv_rvv; + dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_4hv_rvv;
>
> #undef init_fpel
> }
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-05-29 17:19 ` flow gg
@ 2024-05-29 20:29 ` Rémi Denis-Courmont
1 sibling, 0 replies; 7+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-29 20:29 UTC (permalink / raw)
To: ffmpeg-devel
Le keskiviikkona 29. toukokuuta 2024, 20.15.39 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> C908 X60
> vp9_avg_8tap_smooth_4h_8bpp_c : 13.0 11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 : 5.0 4.2
> vp9_avg_8tap_smooth_4v_8bpp_c : 13.7 12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 : 5.0 4.2
> vp9_avg_8tap_smooth_8h_8bpp_c : 49.5 42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 : 9.2 8.5
> vp9_avg_8tap_smooth_8v_8bpp_c : 66.5 45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 : 9.5 8.5
> vp9_avg_8tap_smooth_16h_8bpp_c : 192.7 166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32 : 21.2 18.7
> vp9_avg_8tap_smooth_16v_8bpp_c : 192.2 175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32 : 21.5 19.0
> vp9_avg_8tap_smooth_32h_8bpp_c : 780.2 663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32 : 83.5 60.0
> vp9_avg_8tap_smooth_32v_8bpp_c : 770.5 689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32 : 67.2 60.0
> vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32 : 283.5 119.2
> vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32 : 305.2 119.0
> vp9_put_8tap_smooth_4h_8bpp_c : 11.2 9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32 : 4.2 4.0
> vp9_put_8tap_smooth_4v_8bpp_c : 11.7 10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32 : 4.2 4.0
> vp9_put_8tap_smooth_8h_8bpp_c : 42.0 37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_8v_8bpp_c : 44.2 38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32 : 8.5 7.7
> vp9_put_8tap_smooth_16h_8bpp_c : 165.7 147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32 : 19.5 17.5
> vp9_put_8tap_smooth_16v_8bpp_c : 169.0 149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32 : 19.7 17.5
> vp9_put_8tap_smooth_32h_8bpp_c : 659.7 586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32 : 64.2 57.2
> vp9_put_8tap_smooth_32v_8bpp_c : 680.5 591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32 : 64.2 57.2
> vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32 : 255.5 114.2
> vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32 : 255.5 114.0
> ---
> libavcodec/riscv/vp9_mc_rvv.S | 204 +++++++++++++++++++++++++++++++++
> libavcodec/riscv/vp9dsp.h | 72 ++++++++----
> libavcodec/riscv/vp9dsp_init.c | 37 +++++-
> 3 files changed, 288 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 990271736b..53dd833dac 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
> .endif
> .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> + vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> + vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> + vsetvli zero, zero, e16, m2, ta, ma
> +.else
> + vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
> .macro copy_avg len
> func ff_vp9_avg\len\()_rvv, zve32x
> csrwi vxrm, 0
> @@ -127,8 +139,200 @@ func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
> endfunc
> .endm
>
> +.macro epel_filter name, type, regtype
> + lla \regtype\()2, ff_vp9_subpel_filters
> +
> +.ifc \name,regular
> + addi \regtype\()2, \regtype\()2, 16*8*2
You can directly LLA filters + 16 * 8 * 2 and save one add. Same below. You can
also use .equ to alias the filter addresses, and avoid if's.
> +.endif
> +.ifc \name,sharp
> + addi \regtype\()2, \regtype\()2, 16*8*2*2
> +.endif
> +
> +.ifc \type,v
> + slli \regtype\()0, a6, 4
> +.else
> + slli \regtype\()0, a5, 4
> +.endif
Use a macro parameter for the stride register.
> + add \regtype\()0, \regtype\()0, \regtype\()2
> +
> + lh \regtype\()1, 2(\regtype\()0)
> + lh \regtype\()2, 4(\regtype\()0)
> + lh \regtype\()3, 6(\regtype\()0)
> + lh \regtype\()4, 8(\regtype\()0)
> + lh \regtype\()5, 10(\regtype\()0)
> + lh \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> + lh a7, 14(\regtype\()0)
> +.else
> + lh s7, 14(\regtype\()0)
> +.endif
> + lh \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> + li a5, 64
> +.ifc \from_mem, 1
> + vle8.v v22, (a2)
> +.ifc \type,v
> + sub a2, a2, a3
> + vle8.v v20, (a2)
> + sh1add a2, a3, a2
> + vle8.v v24, (a2)
> + add a2, a2, a3
> + vle8.v v26, (a2)
> + add a2, a2, a3
> + vle8.v v28, (a2)
> + add a2, a2, a3
> + vle8.v v30, (a2)
> +.else
> + addi a2, a2, -1
> + vle8.v v20, (a2)
> + addi a2, a2, 2
> + vle8.v v24, (a2)
> + addi a2, a2, 1
> + vle8.v v26, (a2)
> + addi a2, a2, 1
> + vle8.v v28, (a2)
> + addi a2, a2, 1
> + vle8.v v30, (a2)
That's a lot of address dependencies, which is going to hurt performance. It
might help to just spill more S registers if needed.
> +.endif
> +
> +.ifc \name,smooth
> + vwmulu.vx v16, v24, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v20
> + vwmaccu.vx v16, \regtype\()5, v26
> + vwmaccsu.vx v16, \regtype\()6, v28
> +.else
> + vwmulu.vx v16, v28, \regtype\()6
> + vwmaccsu.vx v16, \regtype\()2, v20
> + vwmaccsu.vx v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v30
> +.else
> + vwmaccsu.vx v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> + .rept 6
> + sub a2, a2, a3
> + .endr
This can be done in 3 instructions, even without mul. Of course you'll again
need a spare register.
> + vle8.v v28, (a2)
> + sub a2, a2, a3
> + vle8.v v26, (a2)
> + sh1add a2, a3, a2
> + add a2, a2, a3
> +.else
> + addi a2, a2, -6
> + vle8.v v28, (a2)
> + addi a2, a2, -1
> + vle8.v v26, (a2)
> + addi a2, a2, 3
> +.endif
> +
> +.ifc \name,smooth
> + vwmaccsu.vx v16, \regtype\()1, v28
> +.else
> + vwmaccu.vx v16, \regtype\()1, v28
> + vwmulu.vx v28, v24, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v26
> + vwmulu.vx v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> + vwmulu.vx v16, v8, \regtype\()4
> + vwmaccu.vx v16, \regtype\()2, v4
> + vwmaccu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()1, v2
> +.else
> + vwmulu.vx v16, v2, \regtype\()1
> + vwmaccu.vx v16, \regtype\()6, v12
> + vwmaccsu.vx v16, \regtype\()5, v10
> + vwmaccsu.vx v16, \regtype\()2, v4
> + vwmulu.vx v28, v8, \regtype\()4
> +.endif
> + vwmaccsu.vx v16, \regtype\()0, v0
> + vwmulu.vx v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> + vwmaccsu.vx v16, a7, v14
> +.else
> + vwmaccsu.vx v16, s7, v14
> +.endif
> +
> +.endif
> + vwadd.wx v16, v16, a5
> + vsetvlstatic16 \len
> +
> +.ifc \name,smooth
> + vwadd.vv v24, v16, v20
> +.else
> + vwadd.vv v24, v16, v28
> + vwadd.wv v24, v24, v20
> +.endif
> + vnsra.wi v24, v24, 7
> + vmax.vx v24, v24, zero
> + vsetvlstatic8 \len, zero, 32, m2
> +
> + vnclipu.wi \dst, v24, 0
> +.ifc \op,avg
> + vle8.v v24, (a0)
> + vaaddu.vv \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> + epel_load \dst, \len, \op, \name, \type, \from_mem, \regtype
> + add a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> + epel_filter \name, \type, t
> +.if \vlen < 256
> + vsetvlstatic8 \len, a5, 32, m2
> +.else
> + vsetvlstatic8 \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> + csrwi vxrm, 0
> +.endif
> +
> +1:
> + addi a4, a4, -1
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> +.if \len == 64 && \vlen < 256
> + addi a0, a0, 32
> + addi a2, a2, 32
> + epel_load v30, \len, \op, \name, \type, 1, t
> + vse8.v v30, (a0)
> + addi a0, a0, -32
> + addi a2, a2, -32
> +.endif
> + add a2, a2, a3
> + add a0, a0, a1
> + bnez a4, 1b
> +
> + ret
> +endfunc
> +.endm
> +
> .irp len, 64, 32, 16, 8, 4
> copy_avg \len
> + .irp op, put, avg
> + .irp name, regular, sharp, smooth
> + .irp type, h, v
> + epel \len, \op, \name, \type, 128
> + epel \len, \op, \name, \type, 256
> + .endr
> + .endr
> + .endr
> .endr
>
> bilin_h_v put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
> \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride, \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> min_vlen) \ +void
> ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst, \ +
> ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my); \ \ -void
> ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
> \ + ptrdiff_t dststride,
> \ const uint8_t *src, \ ptrdiff_t srcstride,
> \ int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride, \ const uint8_t *src, ptrdiff_t srcstride, \ int h, int
> mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
> VP9_BILINEAR_RISCV_RVV_FUNC(64);
> VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..5f759e6bc8 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
> # endif
>
> #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +        if (ff_rv_vlen_least(128)) {
>
> #define init_fpel(idx1, sz) \
> dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv; \
> @@ -95,6 +96,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>          dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
> #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen) \
> + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen; \
> + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \
> + ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen) \
> + init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen); \
> + init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen); \
> + init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen); \
> + init_subpel1(3, idx, idxh, idxv, 8, dir, type, vlen); \
> + init_subpel1(4, idx, idxh, idxv, 4, dir, type, vlen)
> +
> + init_subpel2(0, 1, 0, h, put, 128);
> + init_subpel2(1, 1, 0, h, avg, 128);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 128);
> + init_subpel2(1, 0, 1, v, avg, 128);
> + }
> +
> + }
> + if (ff_rv_vlen_least(256)) {
> + init_subpel2(0, 1, 0, h, put, 256);
> + init_subpel2(1, 1, 0, h, avg, 256);
> +
> + if (flags & AV_CPU_FLAG_RVB_ADDR) {
> + init_subpel2(0, 0, 1, v, put, 256);
> + init_subpel2(1, 0, 1, v, avg, 256);
> + }
> + }
> }
> #endif
> #endif
--
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2024-05-29 20:29 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20240529171540.911099-1-uk7b@foxmail.com>
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 2/5] lavc/vp9dsp: R-V V mc bilin h v uk7b
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 3/5] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-05-29 20:07 ` Rémi Denis-Courmont
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-05-29 17:19 ` flow gg
2024-05-29 20:29 ` Rémi Denis-Courmont
2024-05-29 17:15 ` [FFmpeg-devel] [PATCH v3 5/5] lavc/vp9dsp: R-V V mc tap hv uk7b
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git