* [FFmpeg-devel] [PATCH v2] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
@ 2023-06-19 13:06 Arnie Chang
  2023-07-25  3:37 ` Arnie Chang
  0 siblings, 1 reply; 3+ messages in thread

From: Arnie Chang @ 2023-06-19 13:06 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Arnie Chang

Optimize the put and avg filtering for 4xH and 2xH blocks

Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
V2:
1. Change \width from a macro parameter to a run-time argument
2. Call an internal function instead of instantiating similar code
   three times

RVVi32:
 - h264chroma.chroma_mc [OK]
checkasm: all 6 tests passed
avg_h264_chroma_mc1_8_c:       1821.5
avg_h264_chroma_mc1_8_rvv_i32:  466.5
avg_h264_chroma_mc2_8_c:        939.2
avg_h264_chroma_mc2_8_rvv_i32:  466.5
avg_h264_chroma_mc4_8_c:        502.2
avg_h264_chroma_mc4_8_rvv_i32:  466.5
put_h264_chroma_mc1_8_c:       1436.5
put_h264_chroma_mc1_8_rvv_i32:  382.5
put_h264_chroma_mc2_8_c:        824.2
put_h264_chroma_mc2_8_rvv_i32:  382.5
put_h264_chroma_mc4_8_c:        431.2
put_h264_chroma_mc4_8_rvv_i32:  382.5

 libavcodec/riscv/h264_chroma_init_riscv.c |   8 +
 libavcodec/riscv/h264_mc_chroma.S         | 237 ++++++++++++++--------
 2 files changed, 160 insertions(+), 85 deletions(-)

diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
index 7c905edfcd..9f95150ea3 100644
--- a/libavcodec/riscv/h264_chroma_init_riscv.c
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -27,6 +27,10 @@
 
 void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
 void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
 
 av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
 {
@@ -36,6 +40,10 @@ av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
     if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 16) {
         c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
         c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+        c->put_h264_chroma_pixels_tab[1] = h264_put_chroma_mc4_rvv;
+        c->avg_h264_chroma_pixels_tab[1] = h264_avg_chroma_mc4_rvv;
+        c->put_h264_chroma_pixels_tab[2] = h264_put_chroma_mc2_rvv;
+        c->avg_h264_chroma_pixels_tab[2] = h264_avg_chroma_mc2_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
index 364bc3156e..ce99bda44d 100644
--- a/libavcodec/riscv/h264_mc_chroma.S
+++ b/libavcodec/riscv/h264_mc_chroma.S
@@ -19,8 +19,7 @@
  */
 #include "libavutil/riscv/asm.S"
 
-.macro  h264_chroma_mc8 type
-func h264_\type\()_chroma_mc8_rvv, zve32x
+.macro  do_chroma_mc type unroll
         csrw            vxrm, zero
         slli            t2, a5, 3
         mul             t1, a5, a4
@@ -30,94 +29,100 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         sub             a7, a4, t1
         addi            a6, a5, 64
         sub             t0, t2, t1
-        vsetivli        t3, 8, e8, m1, ta, mu
+        vsetvli         t3, t6, e8, m1, ta, mu
         beqz            t1, 2f
         blez            a3, 8f
         li              t4, 0
         li              t2, 0
         li              t5, 1
         addi            a5, t3, 1
-        slli            t3, a2, 2
+        slli            t3, a2, (1 + \unroll)
 1:                                # if (xy != 0)
         add             a4, a1, t4
         vsetvli         zero, a5, e8, m1, ta, ma
+        .ifc \unroll,1
         addi            t2, t2, 4
+        .else
+        addi            t2, t2, 2
+        .endif
         vle8.v          v10, (a4)
         add             a4, a4, a2
         vslide1down.vx  v11, v10, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v8, v10, a6
         vwmaccu.vx      v8, a7, v11
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v12, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v8, t0, v12
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v13, v12, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v10, v12, a6
         vwmaccu.vx      v8, t1, v13
         vwmaccu.vx      v10, a7, v13
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v10, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v15, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v12, v14, a6
         vwmaccu.vx      v10, t1, v15
         vwmaccu.vx      v12, a7, v15
+        vnclipu.wi      v15, v8, 6
+        .ifc \type,avg
+        vle8.v          v9, (a0)
+        vaaddu.vv       v15, v15, v9
+        .endif
+        vse8.v          v15, (a0)
+        add             a0, a0, a2
+        vnclipu.wi      v8, v10, 6
+        .ifc \type,avg
+        vle8.v          v9, (a0)
+        vaaddu.vv       v8, v8, v9
+        .endif
+        add             t4, t4, t3
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+        .ifc \unroll,1
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v12, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v15, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v16, v14, a6
         vwmaccu.vx      v12, t1, v15
         vwmaccu.vx      v16, a7, v15
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
-        add             a4, a0, t4
-        add             t4, t4, t3
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmaccu.vx      v16, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v14, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
-        vnclipu.wi      v15, v8, 6
+        vsetvli         zero, t6, e8, m1, ta, ma
        vwmaccu.vx      v16, t1, v14
-        .ifc \type,avg
-        vle8.v          v9, (a4)
-        vaaddu.vv       v15, v15, v9
-        .endif
-        vse8.v          v15, (a4)
-        add             a4, a4, a2
-        vnclipu.wi      v8, v10, 6
-        .ifc \type,avg
-        vle8.v          v9, (a4)
-        vaaddu.vv       v8, v8, v9
-        .endif
-        vse8.v          v8, (a4)
-        add             a4, a4, a2
         vnclipu.wi      v8, v12, 6
         .ifc \type,avg
-        vle8.v          v9, (a4)
+        vle8.v          v9, (a0)
         vaaddu.vv       v8, v8, v9
         .endif
-        vse8.v          v8, (a4)
-        add             a4, a4, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v16, 6
         .ifc \type,avg
-        vle8.v          v9, (a4)
+        vle8.v          v9, (a0)
         vaaddu.vv       v8, v8, v9
         .endif
-        vse8.v          v8, (a4)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+        .endif
         blt             t2, a3, 1b
         j               8f
 2:
@@ -126,11 +131,15 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         blez            a3, 8f
         li              a4, 0
         li              t1, 0
-        slli            a7, a2, 2
+        slli            a7, a2, (1 + \unroll)
 3:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
         add             a5, a1, a4
         vsetvli         zero, zero, e8, m1, ta, ma
+        .ifc \unroll,1
         addi            t1, t1, 4
+        .else
+        addi            t1, t1, 2
+        .endif
         vle8.v          v8, (a5)
         add             a5, a5, a2
         add             t2, a5, a2
@@ -141,42 +150,44 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         add             t2, t2, a2
         add             a5, t2, a2
         vwmaccu.vx      v10, t0, v8
-        vle8.v          v8, (t2)
-        vle8.v          v14, (a5)
-        add             a5, a0, a4
         add             a4, a4, a7
         vwmaccu.vx      v12, t0, v9
         vnclipu.wi      v15, v10, 6
         vwmulu.vx       v10, v9, a6
+        vnclipu.wi      v9, v12, 6
         .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v15, v15, v16
         .endif
-        vse8.v          v15, (a5)
-        add             a5, a5, a2
-        vnclipu.wi      v9, v12, 6
-        vwmaccu.vx      v10, t0, v8
-        vwmulu.vx       v12, v8, a6
+        vse8.v          v15, (a0)
+        add             a0, a0, a2
         .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v9, v9, v16
         .endif
-        vse8.v          v9, (a5)
-        add             a5, a5, a2
+        vse8.v          v9, (a0)
+        add             a0, a0, a2
+        .ifc \unroll,1
+        vle8.v          v8, (t2)
+        vle8.v          v14, (a5)
+        vwmaccu.vx      v10, t0, v8
+        vwmulu.vx       v12, v8, a6
         vnclipu.wi      v8, v10, 6
         vwmaccu.vx      v12, t0, v14
         .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v8, v8, v16
         .endif
-        vse8.v          v8, (a5)
-        add             a5, a5, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v12, 6
         .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v8, v8, v16
         .endif
-        vse8.v          v8, (a5)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+        .endif
         blt             t1, a3, 3b
         j               8f
 4:
@@ -186,87 +197,95 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         li              a4, 0
         li              t2, 0
         addi            t0, t3, 1
-        slli            t1, a2, 2
+        slli            t1, a2, (1 + \unroll)
 5:                                # if ((x8 - xy) != 0 && (y8 -xy) == 0)
         add             a5, a1, a4
         vsetvli         zero, t0, e8, m1, ta, ma
+        .ifc \unroll,1
         addi            t2, t2, 4
+        .else
+        addi            t2, t2, 2
+        .endif
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v10, v8, a6
         vwmaccu.vx      v10, a7, v9
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v12, v8, a6
         vwmaccu.vx      v12, a7, v9
+        vnclipu.wi      v16, v10, 6
+        .ifc \type,avg
+        vle8.v          v18, (a0)
+        vaaddu.vv       v16, v16, v18
+        .endif
+        vse8.v          v16, (a0)
+        add             a0, a0, a2
+        vnclipu.wi      v10, v12, 6
+        .ifc \type,avg
+        vle8.v          v18, (a0)
+        vaaddu.vv       v10, v10, v18
+        .endif
+        add             a4, a4, t1
+        vse8.v          v10, (a0)
+        add             a0, a0, a2
+        .ifc \unroll,1
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v14, v8, a6
         vwmaccu.vx      v14, a7, v9
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
-        add             a5, a0, a4
-        add             a4, a4, t1
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
-        vnclipu.wi      v16, v10, 6
-        .ifc \type,avg
-        vle8.v          v18, (a5)
-        vaaddu.vv       v16, v16, v18
-        .endif
-        vse8.v          v16, (a5)
-        add             a5, a5, a2
-        vnclipu.wi      v10, v12, 6
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v12, v8, a6
-        .ifc \type,avg
-        vle8.v          v18, (a5)
-        vaaddu.vv       v10, v10, v18
-        .endif
-        vse8.v          v10, (a5)
-        add             a5, a5, a2
         vnclipu.wi      v8, v14, 6
         vwmaccu.vx      v12, a7, v9
         .ifc \type,avg
-        vle8.v          v18, (a5)
+        vle8.v          v18, (a0)
         vaaddu.vv       v8, v8, v18
         .endif
-        vse8.v          v8, (a5)
-        add             a5, a5, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v12, 6
         .ifc \type,avg
-        vle8.v          v18, (a5)
+        vle8.v          v18, (a0)
         vaaddu.vv       v8, v8, v18
         .endif
-        vse8.v          v8, (a5)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+        .endif
         blt             t2, a3, 5b
         j               8f
 6:
         blez            a3, 8f
         li              a4, 0
         li              t2, 0
-        slli            a7, a2, 2
+        slli            a7, a2, (1 + \unroll)
 7:                                # the final else, none of the above conditions are met
         add             t0, a1, a4
         vsetvli         zero, zero, e8, m1, ta, ma
         add             a5, a0, a4
         add             a4, a4, a7
+        .ifc \unroll,1
         addi            t2, t2, 4
+        .else
+        addi            t2, t2, 2
+        .endif
         vle8.v          v8, (t0)
         add             t0, t0, a2
         add             t1, t0, a2
         vwmulu.vx       v10, v8, a6
         vle8.v          v8, (t0)
         add             t0, t1, a2
-        vle8.v          v9, (t1)
-        vle8.v          v12, (t0)
         vnclipu.wi      v13, v10, 6
         vwmulu.vx       v10, v8, a6
         .ifc \type,avg
@@ -276,13 +295,16 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         vse8.v          v13, (a5)
         add             a5, a5, a2
         vnclipu.wi      v8, v10, 6
-        vwmulu.vx       v10, v9, a6
         .ifc \type,avg
         vle8.v          v18, (a5)
         vaaddu.vv       v8, v8, v18
         .endif
         vse8.v          v8, (a5)
         add             a5, a5, a2
+        .ifc \unroll,1
+        vle8.v          v9, (t1)
+        vle8.v          v12, (t0)
+        vwmulu.vx       v10, v9, a6
         vnclipu.wi      v8, v10, 6
         vwmulu.vx       v10, v12, a6
         .ifc \type,avg
@@ -297,11 +319,56 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         vaaddu.vv       v8, v8, v18
         .endif
         vse8.v          v8, (a5)
+        .endif
         blt             t2, a3, 7b
 8:
         ret
-endfunc
 .endm
 
-h264_chroma_mc8 put
-h264_chroma_mc8 avg
+func h264_put_chroma_mc_rvv, zve32x
+11:
+        li              a7, 3
+        blt             a3, a7, 12f
+        do_chroma_mc    put 1
+12:
+        do_chroma_mc    put 0
+endfunc
+
+func h264_avg_chroma_mc_rvv, zve32x
+21:
+        li              a7, 3
+        blt             a3, a7, 22f
+        do_chroma_mc    avg 1
+22:
+        do_chroma_mc    avg 0
+endfunc
+
+func h264_put_chroma_mc8_rvv, zve32x
+        li              t6, 8
+        j               11b
+endfunc
+
+func h264_put_chroma_mc4_rvv, zve32x
+        li              t6, 4
+        j               11b
+endfunc
+
+func h264_put_chroma_mc2_rvv, zve32x
+        li              t6, 2
+        j               11b
+endfunc
+
+func h264_avg_chroma_mc8_rvv, zve32x
+        li              t6, 8
+        j               21b
+endfunc
+
+func h264_avg_chroma_mc4_rvv, zve32x
+        li              t6, 4
+        j               21b
+endfunc
+
+func h264_avg_chroma_mc2_rvv, zve32x
+        li              t6, 2
+        j               21b
+endfunc
-- 
2.17.1
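For reference, all six entry points compute the standard H.264 chroma
bilinear interpolation, with x and y being the 1/8-pel fractional offsets
in 0..7. Below is a minimal scalar sketch of the put variant; the function
name and the explicit w parameter are illustrative, and the generic C
version lives in libavcodec/h264chroma_template.c:

#include <stddef.h>
#include <stdint.h>

/* Scalar reference for put_h264_chroma_mc{8,4,2}: w is the fixed block
 * width of the entry point (8, 4 or 2), h the block height. */
static void put_chroma_mc_ref(uint8_t *dst, const uint8_t *src,
                              ptrdiff_t stride, int w, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;           /* A + B + C + D == 64 */

    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++)
            dst[i] = (A * src[i]          + B * src[i + 1] +
                      C * src[i + stride] + D * src[i + stride + 1] +
                      32) >> 6;
        dst += stride;
        src += stride;
    }
}

The avg variants differ only in the final store: the filtered result t is
combined with the existing destination as (dst[i] + t + 1) >> 1. In the
vector code above, the (+32) >> 6 rounding is what vnclipu.wi with shift 6
performs once vxrm is cleared (round-to-nearest-up), and the averaging
store is vaaddu.vv under the same rounding mode.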
* Re: [FFmpeg-devel] [PATCH v2] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-19 13:06 [FFmpeg-devel] [PATCH v2] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
@ 2023-07-25  3:37 ` Arnie Chang
  2023-07-25  6:24   ` Rémi Denis-Courmont
  0 siblings, 1 reply; 3+ messages in thread

From: Arnie Chang @ 2023-07-25 3:37 UTC (permalink / raw)
To: ffmpeg-devel

It appears that all the issues raised during the review have been fixed,
and there have been no additional comments for over a month. Could I
kindly request assistance in pushing the patch?

On Mon, Jun 19, 2023 at 9:06 PM Arnie Chang <arnie.chang@sifive.com> wrote:
> [v2 patch quoted in full; trimmed here, identical to the original
> message above]
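For readers tracing the numbered branches in the assembly (labels 1, 3, 5
and 7): they specialize on which interpolation coefficients vanish, the
same case split the generic C template makes. A hedged sketch of that
dispatch, reusing put_chroma_mc_ref from the note above (the wrapper name
is hypothetical):

/* Case split behind the asm labels: xy != 0 takes the full 2x2 filter,
 * otherwise the kernel degenerates to a 2-tap filter or a plain copy. */
static void put_chroma_mc_cases(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride, int w, int h, int x, int y)
{
    if (x && y) {                      /* label 1: full bilinear */
        put_chroma_mc_ref(dst, src, stride, w, h, x, y);
        return;
    }
    if (x || y) {                      /* labels 3 and 5: 2-tap filter */
        const int E = 8 * (x + y);     /* the other offset is zero */
        const ptrdiff_t step = y ? stride : 1;
        for (int j = 0; j < h; j++) {
            for (int i = 0; i < w; i++)
                dst[i] = ((64 - E) * src[i] + E * src[i + step] + 32) >> 6;
            dst += stride;
            src += stride;
        }
        return;
    }
    for (int j = 0; j < h; j++) {      /* label 7: plain rounded copy */
        for (int i = 0; i < w; i++)
            dst[i] = src[i];
        dst += stride;
        src += stride;
    }
}

On top of this, the put/avg wrappers in the patch pick between the two
macro instantiations at run time: blocks with h >= 3 take the body that
emits four rows per iteration (unroll=1), while the 2-row case takes the
2-rows-per-iteration body (unroll=0).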
* Re: [FFmpeg-devel] [PATCH v2] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-07-25  3:37 ` Arnie Chang
@ 2023-07-25  6:24   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 3+ messages in thread

From: Rémi Denis-Courmont @ 2023-07-25 6:24 UTC (permalink / raw)
To: FFmpeg development discussions and patches

Hi,

Sorry, I totally missed the last version. I'll see if I can dig it out of
the archives or patchwork.