* [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
@ 2023-06-09 7:17 Arnie Chang
2023-06-10 14:55 ` Lynne
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Arnie Chang @ 2023-06-09 7:17 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Arnie Chang
Optimize the put and avg filtering for 4xH and 2xH blocks
Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
checkasm: using random seed 3475799765
RVVi32:
- h264chroma.chroma_mc [OK]
checkasm: all 6 tests passed
avg_h264_chroma_mc1_8_c: 1821.5
avg_h264_chroma_mc1_8_rvv_i32: 466.5
avg_h264_chroma_mc2_8_c: 939.2
avg_h264_chroma_mc2_8_rvv_i32: 466.5
avg_h264_chroma_mc4_8_c: 502.2
avg_h264_chroma_mc4_8_rvv_i32: 466.5
put_h264_chroma_mc1_8_c: 1436.5
put_h264_chroma_mc1_8_rvv_i32: 382.5
put_h264_chroma_mc2_8_c: 824.2
put_h264_chroma_mc2_8_rvv_i32: 382.5
put_h264_chroma_mc4_8_c: 431.2
put_h264_chroma_mc4_8_rvv_i32: 382.5
libavcodec/riscv/h264_chroma_init_riscv.c | 8 +
libavcodec/riscv/h264_mc_chroma.S | 216 ++++++++++++++--------
2 files changed, 144 insertions(+), 80 deletions(-)
diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
index 7c905edfcd..9f95150ea3 100644
--- a/libavcodec/riscv/h264_chroma_init_riscv.c
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -27,6 +27,10 @@
void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
{
@@ -36,6 +40,10 @@ av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 16) {
c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+ c->put_h264_chroma_pixels_tab[1] = h264_put_chroma_mc4_rvv;
+ c->avg_h264_chroma_pixels_tab[1] = h264_avg_chroma_mc4_rvv;
+ c->put_h264_chroma_pixels_tab[2] = h264_put_chroma_mc2_rvv;
+ c->avg_h264_chroma_pixels_tab[2] = h264_avg_chroma_mc2_rvv;
}
#endif
}
diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
index 364bc3156e..c97cdbad86 100644
--- a/libavcodec/riscv/h264_mc_chroma.S
+++ b/libavcodec/riscv/h264_mc_chroma.S
@@ -19,8 +19,7 @@
*/
#include "libavutil/riscv/asm.S"
-.macro h264_chroma_mc8 type
-func h264_\type\()_chroma_mc8_rvv, zve32x
+.macro do_chroma_mc type width unroll
csrw vxrm, zero
slli t2, a5, 3
mul t1, a5, a4
@@ -30,94 +29,104 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
sub a7, a4, t1
addi a6, a5, 64
sub t0, t2, t1
- vsetivli t3, 8, e8, m1, ta, mu
+ vsetivli t3, \width, e8, m1, ta, mu
beqz t1, 2f
blez a3, 8f
li t4, 0
li t2, 0
li t5, 1
addi a5, t3, 1
+ .ifc \unroll,1
slli t3, a2, 2
+ .else
+ slli t3, a2, 1
+ .endif
1: # if (xy != 0)
add a4, a1, t4
vsetvli zero, a5, e8, m1, ta, ma
+ .ifc \unroll,1
addi t2, t2, 4
+ .else
+ addi t2, t2, 2
+ .endif
vle8.v v10, (a4)
add a4, a4, a2
vslide1down.vx v11, v10, t5
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
vwmulu.vx v8, v10, a6
vwmaccu.vx v8, a7, v11
vsetvli zero, a5, e8, m1, ta, ma
vle8.v v12, (a4)
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
add a4, a4, a2
vwmaccu.vx v8, t0, v12
vsetvli zero, a5, e8, m1, ta, ma
vslide1down.vx v13, v12, t5
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
vwmulu.vx v10, v12, a6
vwmaccu.vx v8, t1, v13
vwmaccu.vx v10, a7, v13
vsetvli zero, a5, e8, m1, ta, ma
vle8.v v14, (a4)
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
add a4, a4, a2
vwmaccu.vx v10, t0, v14
vsetvli zero, a5, e8, m1, ta, ma
vslide1down.vx v15, v14, t5
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
vwmulu.vx v12, v14, a6
vwmaccu.vx v10, t1, v15
vwmaccu.vx v12, a7, v15
+ vnclipu.wi v15, v8, 6
+ .ifc \type,avg
+ vle8.v v9, (a0)
+ vaaddu.vv v15, v15, v9
+ .endif
+ vse8.v v15, (a0)
+ add a0, a0, a2
+ vnclipu.wi v8, v10, 6
+ .ifc \type,avg
+ vle8.v v9, (a0)
+ vaaddu.vv v8, v8, v9
+ .endif
+ add t4, t4, t3
+ vse8.v v8, (a0)
+ add a0, a0, a2
+ .ifc \unroll,1
vsetvli zero, a5, e8, m1, ta, ma
vle8.v v14, (a4)
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
add a4, a4, a2
vwmaccu.vx v12, t0, v14
vsetvli zero, a5, e8, m1, ta, ma
vslide1down.vx v15, v14, t5
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
vwmulu.vx v16, v14, a6
vwmaccu.vx v12, t1, v15
vwmaccu.vx v16, a7, v15
vsetvli zero, a5, e8, m1, ta, ma
vle8.v v14, (a4)
- vsetivli zero, 8, e8, m1, ta, ma
- add a4, a0, t4
- add t4, t4, t3
+ vsetivli zero, \width, e8, m1, ta, ma
vwmaccu.vx v16, t0, v14
vsetvli zero, a5, e8, m1, ta, ma
vslide1down.vx v14, v14, t5
- vsetivli zero, 8, e8, m1, ta, ma
- vnclipu.wi v15, v8, 6
+ vsetivli zero, \width, e8, m1, ta, ma
vwmaccu.vx v16, t1, v14
- .ifc \type,avg
- vle8.v v9, (a4)
- vaaddu.vv v15, v15, v9
- .endif
- vse8.v v15, (a4)
- add a4, a4, a2
- vnclipu.wi v8, v10, 6
- .ifc \type,avg
- vle8.v v9, (a4)
- vaaddu.vv v8, v8, v9
- .endif
- vse8.v v8, (a4)
- add a4, a4, a2
vnclipu.wi v8, v12, 6
.ifc \type,avg
- vle8.v v9, (a4)
+ vle8.v v9, (a0)
vaaddu.vv v8, v8, v9
.endif
- vse8.v v8, (a4)
- add a4, a4, a2
+ vse8.v v8, (a0)
+ add a0, a0, a2
vnclipu.wi v8, v16, 6
.ifc \type,avg
- vle8.v v9, (a4)
+ vle8.v v9, (a0)
vaaddu.vv v8, v8, v9
.endif
- vse8.v v8, (a4)
+ vse8.v v8, (a0)
+ add a0, a0, a2
+ .endif
blt t2, a3, 1b
j 8f
2:
@@ -126,11 +135,19 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
blez a3, 8f
li a4, 0
li t1, 0
+ .ifc \unroll,1
slli a7, a2, 2
+ .else
+ slli a7, a2, 1
+ .endif
3: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
add a5, a1, a4
vsetvli zero, zero, e8, m1, ta, ma
+ .ifc \unroll,1
addi t1, t1, 4
+ .else
+ addi t1, t1, 2
+ .endif
vle8.v v8, (a5)
add a5, a5, a2
add t2, a5, a2
@@ -141,42 +158,44 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
add t2, t2, a2
add a5, t2, a2
vwmaccu.vx v10, t0, v8
- vle8.v v8, (t2)
- vle8.v v14, (a5)
- add a5, a0, a4
add a4, a4, a7
vwmaccu.vx v12, t0, v9
vnclipu.wi v15, v10, 6
vwmulu.vx v10, v9, a6
+ vnclipu.wi v9, v12, 6
.ifc \type,avg
- vle8.v v16, (a5)
+ vle8.v v16, (a0)
vaaddu.vv v15, v15, v16
.endif
- vse8.v v15, (a5)
- add a5, a5, a2
- vnclipu.wi v9, v12, 6
- vwmaccu.vx v10, t0, v8
- vwmulu.vx v12, v8, a6
+ vse8.v v15, (a0)
+ add a0, a0, a2
.ifc \type,avg
- vle8.v v16, (a5)
+ vle8.v v16, (a0)
vaaddu.vv v9, v9, v16
.endif
- vse8.v v9, (a5)
- add a5, a5, a2
+ vse8.v v9, (a0)
+ add a0, a0, a2
+ .ifc \unroll,1
+ vle8.v v8, (t2)
+ vle8.v v14, (a5)
+ vwmaccu.vx v10, t0, v8
+ vwmulu.vx v12, v8, a6
vnclipu.wi v8, v10, 6
vwmaccu.vx v12, t0, v14
.ifc \type,avg
- vle8.v v16, (a5)
+ vle8.v v16, (a0)
vaaddu.vv v8, v8, v16
.endif
- vse8.v v8, (a5)
- add a5, a5, a2
+ vse8.v v8, (a0)
+ add a0, a0, a2
vnclipu.wi v8, v12, 6
.ifc \type,avg
- vle8.v v16, (a5)
+ vle8.v v16, (a0)
vaaddu.vv v8, v8, v16
.endif
- vse8.v v8, (a5)
+ vse8.v v8, (a0)
+ add a0, a0, a2
+ .endif
blt t1, a3, 3b
j 8f
4:
@@ -186,87 +205,103 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
li a4, 0
li t2, 0
addi t0, t3, 1
+ .ifc \unroll,1
slli t1, a2, 2
+ .else
+ slli t1, a2, 1
+ .endif
5: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
add a5, a1, a4
vsetvli zero, t0, e8, m1, ta, ma
+ .ifc \unroll,1
addi t2, t2, 4
+ .else
+ addi t2, t2, 2
+ .endif
vle8.v v8, (a5)
add a5, a5, a2
vslide1down.vx v9, v8, t5
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
vwmulu.vx v10, v8, a6
vwmaccu.vx v10, a7, v9
vsetvli zero, t0, e8, m1, ta, ma
vle8.v v8, (a5)
add a5, a5, a2
vslide1down.vx v9, v8, t5
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
vwmulu.vx v12, v8, a6
vwmaccu.vx v12, a7, v9
+ vnclipu.wi v16, v10, 6
+ .ifc \type,avg
+ vle8.v v18, (a0)
+ vaaddu.vv v16, v16, v18
+ .endif
+ vse8.v v16, (a0)
+ add a0, a0, a2
+ vnclipu.wi v10, v12, 6
+ .ifc \type,avg
+ vle8.v v18, (a0)
+ vaaddu.vv v10, v10, v18
+ .endif
+ add a4, a4, t1
+ vse8.v v10, (a0)
+ add a0, a0, a2
+ .ifc \unroll,1
vsetvli zero, t0, e8, m1, ta, ma
vle8.v v8, (a5)
add a5, a5, a2
vslide1down.vx v9, v8, t5
- vsetivli zero, 8, e8, m1, ta, ma
+ vsetivli zero, \width, e8, m1, ta, ma
vwmulu.vx v14, v8, a6
vwmaccu.vx v14, a7, v9
vsetvli zero, t0, e8, m1, ta, ma
vle8.v v8, (a5)
- add a5, a0, a4
- add a4, a4, t1
vslide1down.vx v9, v8, t5
- vsetivli zero, 8, e8, m1, ta, ma
- vnclipu.wi v16, v10, 6
- .ifc \type,avg
- vle8.v v18, (a5)
- vaaddu.vv v16, v16, v18
- .endif
- vse8.v v16, (a5)
- add a5, a5, a2
- vnclipu.wi v10, v12, 6
+ vsetivli zero, \width, e8, m1, ta, ma
vwmulu.vx v12, v8, a6
- .ifc \type,avg
- vle8.v v18, (a5)
- vaaddu.vv v10, v10, v18
- .endif
- vse8.v v10, (a5)
- add a5, a5, a2
vnclipu.wi v8, v14, 6
vwmaccu.vx v12, a7, v9
.ifc \type,avg
- vle8.v v18, (a5)
+ vle8.v v18, (a0)
vaaddu.vv v8, v8, v18
.endif
- vse8.v v8, (a5)
- add a5, a5, a2
+ vse8.v v8, (a0)
+ add a0, a0, a2
vnclipu.wi v8, v12, 6
.ifc \type,avg
- vle8.v v18, (a5)
+ vle8.v v18, (a0)
vaaddu.vv v8, v8, v18
.endif
- vse8.v v8, (a5)
+ vse8.v v8, (a0)
+ add a0, a0, a2
+ .endif
blt t2, a3, 5b
j 8f
6:
blez a3, 8f
li a4, 0
li t2, 0
+ .ifc \unroll,1
slli a7, a2, 2
+ .else
+ slli a7, a2, 1
+ .endif
7: # the final else, none of the above conditions are met
add t0, a1, a4
vsetvli zero, zero, e8, m1, ta, ma
add a5, a0, a4
add a4, a4, a7
+ .ifc \unroll,1
addi t2, t2, 4
+ .else
+ addi t2, t2, 2
+ .endif
vle8.v v8, (t0)
add t0, t0, a2
add t1, t0, a2
vwmulu.vx v10, v8, a6
vle8.v v8, (t0)
add t0, t1, a2
- vle8.v v9, (t1)
- vle8.v v12, (t0)
vnclipu.wi v13, v10, 6
vwmulu.vx v10, v8, a6
.ifc \type,avg
@@ -276,13 +311,16 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
vse8.v v13, (a5)
add a5, a5, a2
vnclipu.wi v8, v10, 6
- vwmulu.vx v10, v9, a6
.ifc \type,avg
vle8.v v18, (a5)
vaaddu.vv v8, v8, v18
.endif
vse8.v v8, (a5)
add a5, a5, a2
+ .ifc \unroll,1
+ vle8.v v9, (t1)
+ vle8.v v12, (t0)
+ vwmulu.vx v10, v9, a6
vnclipu.wi v8, v10, 6
vwmulu.vx v10, v12, a6
.ifc \type,avg
@@ -297,11 +335,29 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
vaaddu.vv v8, v8, v18
.endif
vse8.v v8, (a5)
+ .endif
blt t2, a3, 7b
8:
ret
+.endm
+
+.macro h264_chroma_mc type width
+func h264_\type\()_chroma_mc\width\()_rvv, zve32x
+ .ifc \width,8
+ do_chroma_mc \type 8 1
+ .else
+ li a7, 3
+ blt a3, a7, 11f
+ do_chroma_mc \type \width 1
+11:
+ do_chroma_mc \type \width 0
+ .endif
endfunc
.endm
-h264_chroma_mc8 put
-h264_chroma_mc8 avg
+h264_chroma_mc put 8
+h264_chroma_mc avg 8
+h264_chroma_mc put 4
+h264_chroma_mc avg 4
+h264_chroma_mc put 2
+h264_chroma_mc avg 2
--
2.17.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
2023-06-09 7:17 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
@ 2023-06-10 14:55 ` Lynne
2023-06-10 16:43 ` Arnie Chang
2023-06-12 14:59 ` Rémi Denis-Courmont
2023-06-14 15:57 ` Rémi Denis-Courmont
2 siblings, 1 reply; 9+ messages in thread
From: Lynne @ 2023-06-10 14:55 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Jun 9, 2023, 09:17 by arnie.chang-at-sifive.com@ffmpeg.org:
> Optimize the put and avg filtering for 4xH and 2xH blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> checkasm: using random seed 3475799765
> RVVi32:
> - h264chroma.chroma_mc [OK]
> checkasm: all 6 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 466.5
> avg_h264_chroma_mc2_8_c: 939.2
> avg_h264_chroma_mc2_8_rvv_i32: 466.5
> avg_h264_chroma_mc4_8_c: 502.2
> avg_h264_chroma_mc4_8_rvv_i32: 466.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 382.5
> put_h264_chroma_mc2_8_c: 824.2
> put_h264_chroma_mc2_8_rvv_i32: 382.5
> put_h264_chroma_mc4_8_c: 431.2
> put_h264_chroma_mc4_8_rvv_i32: 382.5
>
Why do they all have the same timing?
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
2023-06-10 14:55 ` Lynne
@ 2023-06-10 16:43 ` Arnie Chang
0 siblings, 0 replies; 9+ messages in thread
From: Arnie Chang @ 2023-06-10 16:43 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Sat, Jun 10, 2023 at 10:55 PM Lynne <dev@lynne.ee> wrote:
> Why do they all have the same timing?
>
The processing procedure for these workloads is the same,
except for the difference in block width. (8xH, 4xH, 2xH)
So, the number of instructions remains constant.
Since these workloads handle a small amount of data each time,
the cycle count also stays the same.
However, if the quantity increases, instructions may involve more
micro-operations,
which affects the cycle count (workloads with larger block width may take
more cycles).
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
2023-06-09 7:17 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
2023-06-10 14:55 ` Lynne
@ 2023-06-12 14:59 ` Rémi Denis-Courmont
2023-06-12 15:28 ` Arnie Chang
2023-06-14 15:57 ` Rémi Denis-Courmont
2 siblings, 1 reply; 9+ messages in thread
From: Rémi Denis-Courmont @ 2023-06-12 14:59 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Arnie Chang
Le perjantaina 9. kesäkuuta 2023, 10.17.27 EEST Arnie Chang a écrit :
> Optimize the put and avg filtering for 4xH and 2xH blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> diff --git a/libavcodec/riscv/h264_mc_chroma.S
> b/libavcodec/riscv/h264_mc_chroma.S index 364bc3156e..c97cdbad86 100644
> --- a/libavcodec/riscv/h264_mc_chroma.S
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -19,8 +19,7 @@
> */
> #include "libavutil/riscv/asm.S"
>
> -.macro h264_chroma_mc8 type
> -func h264_\type\()_chroma_mc8_rvv, zve32x
> +.macro do_chroma_mc type width unroll
> csrw vxrm, zero
> slli t2, a5, 3
> mul t1, a5, a4
> @@ -30,94 +29,104 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> sub a7, a4, t1
> addi a6, a5, 64
> sub t0, t2, t1
> - vsetivli t3, 8, e8, m1, ta, mu
> + vsetivli t3, \width, e8, m1, ta, mu
> beqz t1, 2f
> blez a3, 8f
> li t4, 0
> li t2, 0
> li t5, 1
> addi a5, t3, 1
> + .ifc \unroll,1
It would seem simpler and more intuitive to just use `.if` here. (Ditto
below.)
--
レミ・デニ-クールモン
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
2023-06-12 14:59 ` Rémi Denis-Courmont
@ 2023-06-12 15:28 ` Arnie Chang
2023-06-12 15:29 ` Rémi Denis-Courmont
0 siblings, 1 reply; 9+ messages in thread
From: Arnie Chang @ 2023-06-12 15:28 UTC (permalink / raw)
To: Rémi Denis-Courmont; +Cc: ffmpeg-devel
On Mon, Jun 12, 2023 at 10:59 PM Rémi Denis-Courmont <remi@remlab.net>
wrote:
> It would seem more simpler and more intuitive to just use `.if` here.
> (Ditto
> below.)
>
hi,
Do you mean using .if to modify this line of code?
+ vsetivli t3, \width, e8, m1, ta, mu
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
2023-06-12 15:28 ` Arnie Chang
@ 2023-06-12 15:29 ` Rémi Denis-Courmont
0 siblings, 0 replies; 9+ messages in thread
From: Rémi Denis-Courmont @ 2023-06-12 15:29 UTC (permalink / raw)
To: Arnie Chang; +Cc: ffmpeg-devel
Le maanantaina 12. kesäkuuta 2023, 18.28.34 EEST Arnie Chang a écrit :
> On Mon, Jun 12, 2023 at 10:59 PM Rémi Denis-Courmont <remi@remlab.net>
>
> wrote:
> > It would seem more simpler and more intuitive to just use `.if` here.
> > (Ditto
> > below.)
>
> hi,
> Do you mean using .if to modify this line of code?
> + vsetivli t3, \width, e8, m1, ta, mu
No. I mean that I don't see a reason to use .ifc as opposed to just .if.
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
2023-06-09 7:17 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
2023-06-10 14:55 ` Lynne
2023-06-12 14:59 ` Rémi Denis-Courmont
@ 2023-06-14 15:57 ` Rémi Denis-Courmont
2023-06-15 14:58 ` Arnie Chang
2 siblings, 1 reply; 9+ messages in thread
From: Rémi Denis-Courmont @ 2023-06-14 15:57 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Arnie Chang
Le perjantaina 9. kesäkuuta 2023, 10.17.27 EEST Arnie Chang a écrit :
> Optimize the put and avg filtering for 4xH and 2xH blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> checkasm: using random seed 3475799765
> RVVi32:
> - h264chroma.chroma_mc [OK]
> checkasm: all 6 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 466.5
> avg_h264_chroma_mc2_8_c: 939.2
> avg_h264_chroma_mc2_8_rvv_i32: 466.5
> avg_h264_chroma_mc4_8_c: 502.2
> avg_h264_chroma_mc4_8_rvv_i32: 466.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 382.5
> put_h264_chroma_mc2_8_c: 824.2
> put_h264_chroma_mc2_8_rvv_i32: 382.5
> put_h264_chroma_mc4_8_c: 431.2
> put_h264_chroma_mc4_8_rvv_i32: 382.5
>
> libavcodec/riscv/h264_chroma_init_riscv.c | 8 +
> libavcodec/riscv/h264_mc_chroma.S | 216 ++++++++++++++--------
> 2 files changed, 144 insertions(+), 80 deletions(-)
>
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c
> b/libavcodec/riscv/h264_chroma_init_riscv.c index 7c905edfcd..9f95150ea3
> 100644
> --- a/libavcodec/riscv/h264_chroma_init_riscv.c
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -27,6 +27,10 @@
>
> void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y); void
> h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y); +void h264_put_chroma_mc4_rvv(uint8_t *p_dst,
> const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y); +void
> h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y); +void h264_put_chroma_mc2_rvv(uint8_t *p_dst,
> const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y); +void
> h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y);
>
> av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> {
> @@ -36,6 +40,10 @@ av_cold void ff_h264chroma_init_riscv(H264ChromaContext
> *c, int bit_depth) if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_get_rv_vlenb() >= 16) { c->put_h264_chroma_pixels_tab[0] =
> h264_put_chroma_mc8_rvv; c->avg_h264_chroma_pixels_tab[0] =
> h264_avg_chroma_mc8_rvv; + c->put_h264_chroma_pixels_tab[1] =
> h264_put_chroma_mc4_rvv; + c->avg_h264_chroma_pixels_tab[1] =
> h264_avg_chroma_mc4_rvv; + c->put_h264_chroma_pixels_tab[2] =
> h264_put_chroma_mc2_rvv; + c->avg_h264_chroma_pixels_tab[2] =
> h264_avg_chroma_mc2_rvv; }
> #endif
> }
> diff --git a/libavcodec/riscv/h264_mc_chroma.S
> b/libavcodec/riscv/h264_mc_chroma.S index 364bc3156e..c97cdbad86 100644
> --- a/libavcodec/riscv/h264_mc_chroma.S
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -19,8 +19,7 @@
> */
> #include "libavutil/riscv/asm.S"
>
> -.macro h264_chroma_mc8 type
> -func h264_\type\()_chroma_mc8_rvv, zve32x
> +.macro do_chroma_mc type width unroll
It looks like \width is only ever used as AVL. You could advantageously pass
it as a run-time argument to an internal function, and spare the instruction
cache, instead of instantiating otherwise identical code thrice.
> csrw vxrm, zero
> slli t2, a5, 3
> mul t1, a5, a4
> @@ -30,94 +29,104 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> sub a7, a4, t1
> addi a6, a5, 64
> sub t0, t2, t1
> - vsetivli t3, 8, e8, m1, ta, mu
> + vsetivli t3, \width, e8, m1, ta, mu
> beqz t1, 2f
> blez a3, 8f
> li t4, 0
> li t2, 0
> li t5, 1
> addi a5, t3, 1
> + .ifc \unroll,1
> slli t3, a2, 2
> + .else
> + slli t3, a2, 1
> + .endif
Note that all those 5-line conditional shift blocks could be simplified by
folding, e.g.:
slli t3, a2, (1 + \unroll)
Though I wonder if we could leverage SH*ADD instructions in some cases instead
of SLLI?
(..)
> +.endm
> +
> +.macro h264_chroma_mc type width
> +func h264_\type\()_chroma_mc\width\()_rvv, zve32x
> + .ifc \width,8
> + do_chroma_mc \type 8 1
> + .else
> + li a7, 3
> + blt a3, a7, 11f
> + do_chroma_mc \type \width 1
> +11:
> + do_chroma_mc \type \width 0
> + .endif
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
2023-06-14 15:57 ` Rémi Denis-Courmont
@ 2023-06-15 14:58 ` Arnie Chang
2023-06-15 18:48 ` Rémi Denis-Courmont
0 siblings, 1 reply; 9+ messages in thread
From: Arnie Chang @ 2023-06-15 14:58 UTC (permalink / raw)
To: Rémi Denis-Courmont; +Cc: ffmpeg-devel
On Wed, Jun 14, 2023 at 11:57 PM Rémi Denis-Courmont <remi@remlab.net>
wrote:
> It looks like \width is only ever used as AVL. You could advantageously
> pass
> it as a run-time argument to an internal function, and spare the
> instruction
> cache, instead of instantiating otherwise identical code thrice.
>
Since these functions are called frequently, I prefer instantiating similar
code several times
rather than calling another internal function, as that may introduce
additional function-call overhead.
The NEON and MMX code takes the same approach.
> slli t3, a2, (1 + \unroll)
>
Brilliant suggestion, I will fix it in the next patch.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
2023-06-15 14:58 ` Arnie Chang
@ 2023-06-15 18:48 ` Rémi Denis-Courmont
0 siblings, 0 replies; 9+ messages in thread
From: Rémi Denis-Courmont @ 2023-06-15 18:48 UTC (permalink / raw)
To: ffmpeg-devel
Le torstaina 15. kesäkuuta 2023, 17.58.37 EEST Arnie Chang a écrit :
> Since these functions are frequently called, I prefer instantiating similar
> code many times
> rather than calling another internal function, as it may introduce
> additional function call overhead.
This works both ways. Smaller code reduces IC overhead and the risk of its own
eviction or that of some other frequently used code.
Here, we would just add one `li` to the 8x case, and a pair of `li` and `j`
to the 2x and 4x cases (like we already do for the Opus postfilter). Indeed,
since this is assembler, we can enforce (and therefore count on) tail-call
optimisation, so the extra cost really is just one `li` and one `j` added to
the 2x and 4x cases.
Not that I could measure the actual impact of either approach.
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2023-06-15 18:48 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-09 7:17 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
2023-06-10 14:55 ` Lynne
2023-06-10 16:43 ` Arnie Chang
2023-06-12 14:59 ` Rémi Denis-Courmont
2023-06-12 15:28 ` Arnie Chang
2023-06-12 15:29 ` Rémi Denis-Courmont
2023-06-14 15:57 ` Rémi Denis-Courmont
2023-06-15 14:58 ` Arnie Chang
2023-06-15 18:48 ` Rémi Denis-Courmont
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git