Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
@ 2023-06-09  7:17 Arnie Chang
  2023-06-10 14:55 ` Lynne
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Arnie Chang @ 2023-06-09  7:17 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Arnie Chang

Optimize the put and avg filtering for 4xH and 2xH blocks

Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
checkasm: using random seed 3475799765
RVVi32:
 - h264chroma.chroma_mc [OK]
checkasm: all 6 tests passed
avg_h264_chroma_mc1_8_c: 1821.5
avg_h264_chroma_mc1_8_rvv_i32: 466.5
avg_h264_chroma_mc2_8_c: 939.2
avg_h264_chroma_mc2_8_rvv_i32: 466.5
avg_h264_chroma_mc4_8_c: 502.2
avg_h264_chroma_mc4_8_rvv_i32: 466.5
put_h264_chroma_mc1_8_c: 1436.5
put_h264_chroma_mc1_8_rvv_i32: 382.5
put_h264_chroma_mc2_8_c: 824.2
put_h264_chroma_mc2_8_rvv_i32: 382.5
put_h264_chroma_mc4_8_c: 431.2
put_h264_chroma_mc4_8_rvv_i32: 382.5

 libavcodec/riscv/h264_chroma_init_riscv.c |   8 +
 libavcodec/riscv/h264_mc_chroma.S         | 216 ++++++++++++++--------
 2 files changed, 144 insertions(+), 80 deletions(-)

diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
index 7c905edfcd..9f95150ea3 100644
--- a/libavcodec/riscv/h264_chroma_init_riscv.c
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -27,6 +27,10 @@
 
 void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
 void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
 
 av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
 {
@@ -36,6 +40,10 @@ av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
     if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 16) {
         c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
         c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+        c->put_h264_chroma_pixels_tab[1] = h264_put_chroma_mc4_rvv;
+        c->avg_h264_chroma_pixels_tab[1] = h264_avg_chroma_mc4_rvv;
+        c->put_h264_chroma_pixels_tab[2] = h264_put_chroma_mc2_rvv;
+        c->avg_h264_chroma_pixels_tab[2] = h264_avg_chroma_mc2_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
index 364bc3156e..c97cdbad86 100644
--- a/libavcodec/riscv/h264_mc_chroma.S
+++ b/libavcodec/riscv/h264_mc_chroma.S
@@ -19,8 +19,7 @@
  */
 #include "libavutil/riscv/asm.S"
 
-.macro  h264_chroma_mc8 type
-func h264_\type\()_chroma_mc8_rvv, zve32x
+.macro  do_chroma_mc type width unroll
         csrw            vxrm, zero
         slli            t2, a5, 3
         mul             t1, a5, a4
@@ -30,94 +29,104 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         sub             a7, a4, t1
         addi            a6, a5, 64
         sub             t0, t2, t1
-        vsetivli        t3, 8, e8, m1, ta, mu
+        vsetivli        t3, \width, e8, m1, ta, mu
         beqz            t1, 2f
         blez            a3, 8f
         li              t4, 0
         li              t2, 0
         li              t5, 1
         addi            a5, t3, 1
+  .ifc \unroll,1
         slli            t3, a2, 2
+  .else
+        slli            t3, a2, 1
+  .endif
 1:                                # if (xy != 0)
         add             a4, a1, t4
         vsetvli         zero, a5, e8, m1, ta, ma
+  .ifc \unroll,1
         addi            t2, t2, 4
+  .else
+        addi            t2, t2, 2
+  .endif
         vle8.v          v10, (a4)
         add             a4, a4, a2
         vslide1down.vx  v11, v10, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmulu.vx       v8, v10, a6
         vwmaccu.vx      v8, a7, v11
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v12, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v8, t0, v12
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v13, v12, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmulu.vx       v10, v12, a6
         vwmaccu.vx      v8, t1, v13
         vwmaccu.vx      v10, a7, v13
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v10, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v15, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmulu.vx       v12, v14, a6
         vwmaccu.vx      v10, t1, v15
         vwmaccu.vx      v12, a7, v15
+        vnclipu.wi      v15, v8, 6
+  .ifc \type,avg
+        vle8.v          v9, (a0)
+        vaaddu.vv       v15, v15, v9
+  .endif
+        vse8.v          v15, (a0)
+        add             a0, a0, a2
+        vnclipu.wi      v8, v10, 6
+  .ifc \type,avg
+        vle8.v          v9, (a0)
+        vaaddu.vv       v8, v8, v9
+  .endif
+        add             t4, t4, t3
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+  .ifc \unroll,1
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v12, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v15, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmulu.vx       v16, v14, a6
         vwmaccu.vx      v12, t1, v15
         vwmaccu.vx      v16, a7, v15
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
-        add             a4, a0, t4
-        add             t4, t4, t3
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmaccu.vx      v16, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v14, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
-        vnclipu.wi      v15, v8, 6
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmaccu.vx      v16, t1, v14
-  .ifc \type,avg
-        vle8.v          v9, (a4)
-        vaaddu.vv       v15, v15, v9
-  .endif
-        vse8.v          v15, (a4)
-        add             a4, a4, a2
-        vnclipu.wi      v8, v10, 6
-  .ifc \type,avg
-        vle8.v          v9, (a4)
-        vaaddu.vv       v8, v8, v9
-  .endif
-        vse8.v          v8, (a4)
-        add             a4, a4, a2
         vnclipu.wi      v8, v12, 6
   .ifc \type,avg
-        vle8.v          v9, (a4)
+        vle8.v          v9, (a0)
         vaaddu.vv       v8, v8, v9
   .endif
-        vse8.v          v8, (a4)
-        add             a4, a4, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v16, 6
   .ifc \type,avg
-        vle8.v          v9, (a4)
+        vle8.v          v9, (a0)
         vaaddu.vv       v8, v8, v9
   .endif
-        vse8.v          v8, (a4)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+  .endif
         blt             t2, a3, 1b
         j               8f
 2:
@@ -126,11 +135,19 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         blez            a3, 8f
         li              a4, 0
         li              t1, 0
+  .ifc \unroll,1
         slli            a7, a2, 2
+  .else
+        slli            a7, a2, 1
+  .endif
 3:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
         add             a5, a1, a4
         vsetvli         zero, zero, e8, m1, ta, ma
+  .ifc \unroll,1
         addi            t1, t1, 4
+  .else
+        addi            t1, t1, 2
+  .endif
         vle8.v          v8, (a5)
         add             a5, a5, a2
         add             t2, a5, a2
@@ -141,42 +158,44 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         add             t2, t2, a2
         add             a5, t2, a2
         vwmaccu.vx      v10, t0, v8
-        vle8.v          v8, (t2)
-        vle8.v          v14, (a5)
-        add             a5, a0, a4
         add             a4, a4, a7
         vwmaccu.vx      v12, t0, v9
         vnclipu.wi      v15, v10, 6
         vwmulu.vx       v10, v9, a6
+        vnclipu.wi      v9, v12, 6
   .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v15, v15, v16
   .endif
-        vse8.v          v15, (a5)
-        add             a5, a5, a2
-        vnclipu.wi      v9, v12, 6
-        vwmaccu.vx      v10, t0, v8
-        vwmulu.vx       v12, v8, a6
+        vse8.v          v15, (a0)
+        add             a0, a0, a2
   .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v9, v9, v16
   .endif
-        vse8.v          v9, (a5)
-        add             a5, a5, a2
+        vse8.v          v9, (a0)
+        add             a0, a0, a2
+  .ifc \unroll,1
+        vle8.v          v8, (t2)
+        vle8.v          v14, (a5)
+        vwmaccu.vx      v10, t0, v8
+        vwmulu.vx       v12, v8, a6
         vnclipu.wi      v8, v10, 6
         vwmaccu.vx      v12, t0, v14
   .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v8, v8, v16
   .endif
-        vse8.v          v8, (a5)
-        add             a5, a5, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v12, 6
   .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v8, v8, v16
   .endif
-        vse8.v          v8, (a5)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+  .endif
         blt             t1, a3, 3b
         j               8f
 4:
@@ -186,87 +205,103 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         li              a4, 0
         li              t2, 0
         addi            t0, t3, 1
+  .ifc \unroll,1
         slli            t1, a2, 2
+  .else
+        slli            t1, a2, 1
+  .endif
 5:                               # if ((x8 - xy) != 0 && (y8 -xy) == 0)
         add             a5, a1, a4
         vsetvli         zero, t0, e8, m1, ta, ma
+  .ifc \unroll,1
         addi            t2, t2, 4
+  .else
+        addi            t2, t2, 2
+  .endif
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmulu.vx       v10, v8, a6
         vwmaccu.vx      v10, a7, v9
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmulu.vx       v12, v8, a6
         vwmaccu.vx      v12, a7, v9
+        vnclipu.wi      v16, v10, 6
+  .ifc \type,avg
+        vle8.v          v18, (a0)
+        vaaddu.vv       v16, v16, v18
+  .endif
+        vse8.v          v16, (a0)
+        add             a0, a0, a2
+        vnclipu.wi      v10, v12, 6
+  .ifc \type,avg
+        vle8.v          v18, (a0)
+        vaaddu.vv       v10, v10, v18
+  .endif
+        add             a4, a4, t1
+        vse8.v          v10, (a0)
+        add             a0, a0, a2
+  .ifc \unroll,1
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmulu.vx       v14, v8, a6
         vwmaccu.vx      v14, a7, v9
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
-        add             a5, a0, a4
-        add             a4, a4, t1
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
-        vnclipu.wi      v16, v10, 6
-  .ifc \type,avg
-        vle8.v          v18, (a5)
-        vaaddu.vv       v16, v16, v18
-  .endif
-        vse8.v          v16, (a5)
-        add             a5, a5, a2
-        vnclipu.wi      v10, v12, 6
+        vsetivli        zero, \width, e8, m1, ta, ma
         vwmulu.vx       v12, v8, a6
-  .ifc \type,avg
-        vle8.v          v18, (a5)
-        vaaddu.vv       v10, v10, v18
-  .endif
-        vse8.v          v10, (a5)
-        add             a5, a5, a2
         vnclipu.wi      v8, v14, 6
         vwmaccu.vx      v12, a7, v9
   .ifc \type,avg
-        vle8.v          v18, (a5)
+        vle8.v          v18, (a0)
         vaaddu.vv       v8, v8, v18
   .endif
-        vse8.v          v8, (a5)
-        add             a5, a5, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v12, 6
   .ifc \type,avg
-        vle8.v          v18, (a5)
+        vle8.v          v18, (a0)
         vaaddu.vv       v8, v8, v18
   .endif
-        vse8.v          v8, (a5)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+  .endif
         blt             t2, a3, 5b
         j               8f
 6:
         blez            a3, 8f
         li              a4, 0
         li              t2, 0
+  .ifc \unroll,1
         slli            a7, a2, 2
+  .else
+        slli            a7, a2, 1
+  .endif
 7:                               # the final else, none of the above conditions are met
         add             t0, a1, a4
         vsetvli         zero, zero, e8, m1, ta, ma
         add             a5, a0, a4
         add             a4, a4, a7
+  .ifc \unroll,1
         addi            t2, t2, 4
+  .else
+        addi            t2, t2, 2
+  .endif
         vle8.v          v8, (t0)
         add             t0, t0, a2
         add             t1, t0, a2
         vwmulu.vx       v10, v8, a6
         vle8.v          v8, (t0)
         add             t0, t1, a2
-        vle8.v          v9, (t1)
-        vle8.v          v12, (t0)
         vnclipu.wi      v13, v10, 6
         vwmulu.vx       v10, v8, a6
   .ifc \type,avg
@@ -276,13 +311,16 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         vse8.v          v13, (a5)
         add             a5, a5, a2
         vnclipu.wi      v8, v10, 6
-        vwmulu.vx       v10, v9, a6
   .ifc \type,avg
         vle8.v          v18, (a5)
         vaaddu.vv       v8, v8, v18
   .endif
         vse8.v          v8, (a5)
         add             a5, a5, a2
+  .ifc \unroll,1
+        vle8.v          v9, (t1)
+        vle8.v          v12, (t0)
+        vwmulu.vx       v10, v9, a6
         vnclipu.wi      v8, v10, 6
         vwmulu.vx       v10, v12, a6
   .ifc \type,avg
@@ -297,11 +335,29 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
         vaaddu.vv       v8, v8, v18
   .endif
         vse8.v          v8, (a5)
+  .endif
         blt             t2, a3, 7b
 8:
         ret
+.endm
+
+.macro  h264_chroma_mc type width
+func h264_\type\()_chroma_mc\width\()_rvv, zve32x
+  .ifc \width,8
+        do_chroma_mc \type 8 1
+  .else
+        li      a7, 3
+        blt     a3, a7, 11f
+        do_chroma_mc \type \width 1
+11:
+        do_chroma_mc \type \width 0
+  .endif
 endfunc
 .endm
 
-h264_chroma_mc8 put
-h264_chroma_mc8 avg
+h264_chroma_mc put 8
+h264_chroma_mc avg 8
+h264_chroma_mc put 4
+h264_chroma_mc avg 4
+h264_chroma_mc put 2
+h264_chroma_mc avg 2
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-09  7:17 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
@ 2023-06-10 14:55 ` Lynne
  2023-06-10 16:43   ` Arnie Chang
  2023-06-12 14:59 ` Rémi Denis-Courmont
  2023-06-14 15:57 ` Rémi Denis-Courmont
  2 siblings, 1 reply; 9+ messages in thread
From: Lynne @ 2023-06-10 14:55 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Jun 9, 2023, 09:17 by arnie.chang-at-sifive.com@ffmpeg.org:

> Optimize the put and avg filtering for 4xH and 2xH blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> checkasm: using random seed 3475799765
> RVVi32:
>  - h264chroma.chroma_mc [OK]
> checkasm: all 6 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 466.5
> avg_h264_chroma_mc2_8_c: 939.2
> avg_h264_chroma_mc2_8_rvv_i32: 466.5
> avg_h264_chroma_mc4_8_c: 502.2
> avg_h264_chroma_mc4_8_rvv_i32: 466.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 382.5
> put_h264_chroma_mc2_8_c: 824.2
> put_h264_chroma_mc2_8_rvv_i32: 382.5
> put_h264_chroma_mc4_8_c: 431.2
> put_h264_chroma_mc4_8_rvv_i32: 382.5
>

Why do they all have the same timing?
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-10 14:55 ` Lynne
@ 2023-06-10 16:43   ` Arnie Chang
  0 siblings, 0 replies; 9+ messages in thread
From: Arnie Chang @ 2023-06-10 16:43 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Sat, Jun 10, 2023 at 10:55 PM Lynne <dev@lynne.ee> wrote:

> Why do they all have the same timing?
>

The processing procedure for these workloads is the same,
except for the difference in block width. (8xH, 4xH, 2xH)
So, the number of instructions remains constant.
Since these workloads handle a small amount of data each time,
the cycle count also stays the same.
However, if the quantity increases, instructions may involve more
micro-operations,
which affects the cycle count (workloads with larger block width may take
more cycles).
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-09  7:17 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
  2023-06-10 14:55 ` Lynne
@ 2023-06-12 14:59 ` Rémi Denis-Courmont
  2023-06-12 15:28   ` Arnie Chang
  2023-06-14 15:57 ` Rémi Denis-Courmont
  2 siblings, 1 reply; 9+ messages in thread
From: Rémi Denis-Courmont @ 2023-06-12 14:59 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Arnie Chang

Le perjantaina 9. kesäkuuta 2023, 10.17.27 EEST Arnie Chang a écrit :
> Optimize the put and avg filtering for 4xH and 2xH blocks
> 
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> diff --git a/libavcodec/riscv/h264_mc_chroma.S
> b/libavcodec/riscv/h264_mc_chroma.S index 364bc3156e..c97cdbad86 100644
> --- a/libavcodec/riscv/h264_mc_chroma.S
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -19,8 +19,7 @@
>   */
>  #include "libavutil/riscv/asm.S"
> 
> -.macro  h264_chroma_mc8 type
> -func h264_\type\()_chroma_mc8_rvv, zve32x
> +.macro  do_chroma_mc type width unroll
>          csrw            vxrm, zero
>          slli            t2, a5, 3
>          mul             t1, a5, a4
> @@ -30,94 +29,104 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
>          sub             a7, a4, t1
>          addi            a6, a5, 64
>          sub             t0, t2, t1
> -        vsetivli        t3, 8, e8, m1, ta, mu
> +        vsetivli        t3, \width, e8, m1, ta, mu
>          beqz            t1, 2f
>          blez            a3, 8f
>          li              t4, 0
>          li              t2, 0
>          li              t5, 1
>          addi            a5, t3, 1
> +  .ifc \unroll,1

It would seem simpler and more intuitive to just use `.if` here. (Ditto 
below.)

-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-12 14:59 ` Rémi Denis-Courmont
@ 2023-06-12 15:28   ` Arnie Chang
  2023-06-12 15:29     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 9+ messages in thread
From: Arnie Chang @ 2023-06-12 15:28 UTC (permalink / raw)
  To: Rémi Denis-Courmont; +Cc: ffmpeg-devel

On Mon, Jun 12, 2023 at 10:59 PM Rémi Denis-Courmont <remi@remlab.net>
wrote:

> It would seem more simpler and more intuitive to just use `.if` here.
> (Ditto
> below.)
>

hi,
Do you mean using .if to modify this line of code?
+    vsetivli        t3, \width, e8, m1, ta, mu
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-12 15:28   ` Arnie Chang
@ 2023-06-12 15:29     ` Rémi Denis-Courmont
  0 siblings, 0 replies; 9+ messages in thread
From: Rémi Denis-Courmont @ 2023-06-12 15:29 UTC (permalink / raw)
  To: Arnie Chang; +Cc: ffmpeg-devel

Le maanantaina 12. kesäkuuta 2023, 18.28.34 EEST Arnie Chang a écrit :
> On Mon, Jun 12, 2023 at 10:59 PM Rémi Denis-Courmont <remi@remlab.net>
> 
> wrote:
> > It would seem more simpler and more intuitive to just use `.if` here.
> > (Ditto
> > below.)
> 
> hi,
> Do you mean using .if to modify this line of code?
> +    vsetivli        t3, \width, e8, m1, ta, mu

No. I mean that I don't see a reason to use .ifc as opposed to just .if.


-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-09  7:17 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
  2023-06-10 14:55 ` Lynne
  2023-06-12 14:59 ` Rémi Denis-Courmont
@ 2023-06-14 15:57 ` Rémi Denis-Courmont
  2023-06-15 14:58   ` Arnie Chang
  2 siblings, 1 reply; 9+ messages in thread
From: Rémi Denis-Courmont @ 2023-06-14 15:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Arnie Chang

Le perjantaina 9. kesäkuuta 2023, 10.17.27 EEST Arnie Chang a écrit :
> Optimize the put and avg filtering for 4xH and 2xH blocks
> 
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> checkasm: using random seed 3475799765
> RVVi32:
>  - h264chroma.chroma_mc [OK]
> checkasm: all 6 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 466.5
> avg_h264_chroma_mc2_8_c: 939.2
> avg_h264_chroma_mc2_8_rvv_i32: 466.5
> avg_h264_chroma_mc4_8_c: 502.2
> avg_h264_chroma_mc4_8_rvv_i32: 466.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 382.5
> put_h264_chroma_mc2_8_c: 824.2
> put_h264_chroma_mc2_8_rvv_i32: 382.5
> put_h264_chroma_mc4_8_c: 431.2
> put_h264_chroma_mc4_8_rvv_i32: 382.5
> 
>  libavcodec/riscv/h264_chroma_init_riscv.c |   8 +
>  libavcodec/riscv/h264_mc_chroma.S         | 216 ++++++++++++++--------
>  2 files changed, 144 insertions(+), 80 deletions(-)
> 
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c
> b/libavcodec/riscv/h264_chroma_init_riscv.c index 7c905edfcd..9f95150ea3
> 100644
> --- a/libavcodec/riscv/h264_chroma_init_riscv.c
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -27,6 +27,10 @@
> 
>  void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y); void
> h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y); +void h264_put_chroma_mc4_rvv(uint8_t *p_dst,
> const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y); +void
> h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y); +void h264_put_chroma_mc2_rvv(uint8_t *p_dst,
> const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y); +void
> h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t
> stride, int h, int x, int y);
> 
>  av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
>  {
> @@ -36,6 +40,10 @@ av_cold void ff_h264chroma_init_riscv(H264ChromaContext
> *c, int bit_depth) if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_get_rv_vlenb() >= 16) { c->put_h264_chroma_pixels_tab[0] =
> h264_put_chroma_mc8_rvv; c->avg_h264_chroma_pixels_tab[0] =
> h264_avg_chroma_mc8_rvv; +        c->put_h264_chroma_pixels_tab[1] =
> h264_put_chroma_mc4_rvv; +        c->avg_h264_chroma_pixels_tab[1] =
> h264_avg_chroma_mc4_rvv; +        c->put_h264_chroma_pixels_tab[2] =
> h264_put_chroma_mc2_rvv; +        c->avg_h264_chroma_pixels_tab[2] =
> h264_avg_chroma_mc2_rvv; }
>  #endif
>  }
> diff --git a/libavcodec/riscv/h264_mc_chroma.S
> b/libavcodec/riscv/h264_mc_chroma.S index 364bc3156e..c97cdbad86 100644
> --- a/libavcodec/riscv/h264_mc_chroma.S
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -19,8 +19,7 @@
>   */
>  #include "libavutil/riscv/asm.S"
> 
> -.macro  h264_chroma_mc8 type
> -func h264_\type\()_chroma_mc8_rvv, zve32x
> +.macro  do_chroma_mc type width unroll

It looks like \width is only ever used as AVL. You could advantageously pass 
it as a run-time argument to an internal function, and spare the instruction 
cache, instead of instantiating otherwise identical code thrice.

>          csrw            vxrm, zero
>          slli            t2, a5, 3
>          mul             t1, a5, a4
> @@ -30,94 +29,104 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
>          sub             a7, a4, t1
>          addi            a6, a5, 64
>          sub             t0, t2, t1
> -        vsetivli        t3, 8, e8, m1, ta, mu
> +        vsetivli        t3, \width, e8, m1, ta, mu
>          beqz            t1, 2f
>          blez            a3, 8f
>          li              t4, 0
>          li              t2, 0
>          li              t5, 1
>          addi            a5, t3, 1
> +  .ifc \unroll,1
>          slli            t3, a2, 2
> +  .else
> +        slli            t3, a2, 1
> +  .endif

Note that all those 5-line conditional shift blocks could be simplified by 
folding, e.g.:

    slli t3, a2, (1 + \unroll)

Though I wonder if we could leverage SH*ADD instructions in some cases instead 
of SLLI?

(..)

> +.endm
> +
> +.macro  h264_chroma_mc type width
> +func h264_\type\()_chroma_mc\width\()_rvv, zve32x
> +  .ifc \width,8
> +        do_chroma_mc \type 8 1
> +  .else
> +        li      a7, 3
> +        blt     a3, a7, 11f
> +        do_chroma_mc \type \width 1
> +11:
> +        do_chroma_mc \type \width 0
> +  .endif


-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-14 15:57 ` Rémi Denis-Courmont
@ 2023-06-15 14:58   ` Arnie Chang
  2023-06-15 18:48     ` Rémi Denis-Courmont
  0 siblings, 1 reply; 9+ messages in thread
From: Arnie Chang @ 2023-06-15 14:58 UTC (permalink / raw)
  To: Rémi Denis-Courmont; +Cc: ffmpeg-devel

On Wed, Jun 14, 2023 at 11:57 PM Rémi Denis-Courmont <remi@remlab.net>
wrote:

> It looks like \width is only ever used as AVL. You could advantageously
> pass
> it as a run-time argument to an internal function, and spare the
> instruction
> cache, instead of instantiating otherwise identical code thrice.
>

Since these functions are frequently called, I prefer instantiating similar
code many times
rather than calling another internal function, as it may introduce
additional function call overhead.
Neon and MMX code also apply the same approach.


    slli t3, a2, (1 + \unroll)
>

Brilliant suggestion, I will fix it in the next patch.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
  2023-06-15 14:58   ` Arnie Chang
@ 2023-06-15 18:48     ` Rémi Denis-Courmont
  0 siblings, 0 replies; 9+ messages in thread
From: Rémi Denis-Courmont @ 2023-06-15 18:48 UTC (permalink / raw)
  To: ffmpeg-devel

Le torstaina 15. kesäkuuta 2023, 17.58.37 EEST Arnie Chang a écrit :
> Since these functions are frequently called, I prefer instantiating similar
> code many times
> rather than calling another internal function, as it may introduce
> additional function call overhead.

This works both ways. Smaller code reduces IC overhead and the risk of its own 
eviction or that of some other frequently used code.

Here, we would just add one `li` to the 8x cases, and a pair of `li` and `j` 
to the 2x and 4x cases (like we already do for Opus postfilter). Indeed, since 
this is assembler, we can enforce tail-call optimisation.

Since this is assembler, you can count on tail-call optimisation. This is 
really just one `li` and `j` added on the 2 and 4.

Not that I could measure the actual impact of either approach.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-06-15 18:48 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-09  7:17 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks Arnie Chang
2023-06-10 14:55 ` Lynne
2023-06-10 16:43   ` Arnie Chang
2023-06-12 14:59 ` Rémi Denis-Courmont
2023-06-12 15:28   ` Arnie Chang
2023-06-12 15:29     ` Rémi Denis-Courmont
2023-06-14 15:57 ` Rémi Denis-Courmont
2023-06-15 14:58   ` Arnie Chang
2023-06-15 18:48     ` Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git