[FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv
       [not found] <20240615115034.3891490-1-uk7b@foxmail.com>
@ 2024-06-15 11:50 ` uk7b
  2024-06-15 11:52   ` flow gg
  2024-06-24 20:07   ` Rémi Denis-Courmont
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
  2 siblings, 2 replies; 15+ messages in thread
From: uk7b @ 2024-06-15 11:50 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
---
 libavcodec/riscv/vp9_mc_rvv.S  | 38 +++++++++++++++++++++++++++++++++-
 libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index fb7377048a..5241562531 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
 endfunc
 .endm
 
+.macro bilin_hv op
+func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
+        vsetvlstatic8   64, t0, 64
+.Lbilin_hv\op:
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        neg             t1, a5
+        neg             t2, a6
+        li              t4, 8
+        bilin_load_h    v24, put, a5
+        add             a2, a2, a3
+1:
+        addi            a4, a4, -1
+        bilin_load_h    v4, put, a5
+        vwmulu.vx       v16, v4, a6
+        vwmaccsu.vx     v16, t2, v24
+        vwadd.wx        v16, v16, t4
+        vnsra.wi        v16, v16, 4
+        vadd.vv         v0, v16, v24
+.ifc \op,avg
+        vle8.v          v16, (a0)
+        vaaddu.vv       v0, v0, v16
+.endif
+        vse8.v          v0, (a0)
+        vmv.v.v         v24, v4
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
 .endr
@@ -155,6 +189,8 @@ bilin_h_v  put, h, a5
 bilin_h_v  avg, h, a5
 bilin_h_v  put, v, a6
 bilin_h_v  avg, v, a6
+bilin_hv   put
+bilin_hv   avg
 
 .macro func_bilin_h_v len, op, type
 func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
@@ -165,7 +201,7 @@ endfunc
 
 .irp len, 32, 16, 8, 4
         .irp op, put, avg
-                .irp type, h, v
+                .irp type, h, v, hv
                         func_bilin_h_v \len, \op, \type
                 .endr
         .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 9606d8545f..b3700dfb08 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
     dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
+    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
+    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
+    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
+    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
+    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
     }
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-06-15 11:52   ` flow gg
  2024-06-24 20:07   ` Rémi Denis-Courmont
  1 sibling, 0 replies; 15+ messages in thread
From: flow gg @ 2024-06-15 11:52 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

> Copying vectors is rarely justified - mostly only before destructive
> instructions such as FMA.

This case is slightly different from VP8. In VP8, many of the scalar values
are positive, so the related calculations can easily be replaced. In VP9,
however, t2 is a negative number, so vwmaccsu is required. Therefore,
unlike in the VP8 code, we cannot issue vwmulu.vx before bilin_load to
avoid the vmv.


<uk7b@foxmail.com> 于2024年6月15日周六 19:51写道：

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
>                                                      C908   X60
> vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
> vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
> vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
> vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
> vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
> vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
> vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
> vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
> vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
> vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
> vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 38 +++++++++++++++++++++++++++++++++-
>  libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
>  2 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index fb7377048a..5241562531 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> +        vsetvlstatic8   64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +        neg             t1, a5
> +        neg             t2, a6
> +        li              t4, 8
> +        bilin_load_h    v24, put, a5
> +        add             a2, a2, a3
> +1:
> +        addi            a4, a4, -1
> +        bilin_load_h    v4, put, a5
> +        vwmulu.vx       v16, v4, a6
> +        vwmaccsu.vx     v16, t2, v24
> +        vwadd.wx        v16, v16, t4
> +        vnsra.wi        v16, v16, 4
> +        vadd.vv         v0, v16, v24
> +.ifc \op,avg
> +        vle8.v          v16, (a0)
> +        vaaddu.vv       v0, v0, v16
> +.endif
> +        vse8.v          v0, (a0)
> +        vmv.v.v         v24, v4
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
>  .endr
> @@ -155,6 +189,8 @@ bilin_h_v  put, h, a5
>  bilin_h_v  avg, h, a5
>  bilin_h_v  put, v, a6
>  bilin_h_v  avg, v, a6
> +bilin_hv   put
> +bilin_hv   avg
>
>  .macro func_bilin_h_v len, op, type
>  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -165,7 +201,7 @@ endfunc
>
>  .irp len, 32, 16, 8, 4
>          .irp op, put, avg
> -                .irp type, h, v
> +                .irp type, h, v, hv
>                          func_bilin_h_v \len, \op, \type
>                  .endr
>          .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..b3700dfb08 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
>      dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
>      dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> +    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> +    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> +    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> +    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> +    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> +    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> +    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> +    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
> +    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
> +    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
>  #undef init_fpel
>      }
> --
> 2.45.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
  2024-06-15 11:52   ` flow gg
@ 2024-06-24 20:07   ` Rémi Denis-Courmont
  2024-06-30 11:39     ` flow gg
  1 sibling, 1 reply; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-24 20:07 UTC (permalink / raw)
  To: ffmpeg-devel

Le lauantaina 15. kesäkuuta 2024, 14.50.32 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
>                                                      C908   X60
> vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
> vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
> vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
> vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
> vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
> vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
> vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
> vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
> vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
> vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
> vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 38 +++++++++++++++++++++++++++++++++-
>  libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
>  2 files changed, 47 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index fb7377048a..5241562531 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
>  endfunc
>  .endm
> 
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> +        vsetvlstatic8   64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +        neg             t1, a5
> +        neg             t2, a6
> +        li              t4, 8
> +        bilin_load_h    v24, put, a5
> +        add             a2, a2, a3
> +1:
> +        addi            a4, a4, -1
> +        bilin_load_h    v4, put, a5
> +        vwmulu.vx       v16, v4, a6
> +        vwmaccsu.vx     v16, t2, v24
> +        vwadd.wx        v16, v16, t4
> +        vnsra.wi        v16, v16, 4

Why round manually?
It looks like vnclip.wi would be more straightforward here.

> +        vadd.vv         v0, v16, v24
> +.ifc \op,avg
> +        vle8.v          v16, (a0)
> +        vaaddu.vv       v0, v0, v16
> +.endif
> +        vse8.v          v0, (a0)
> +        vmv.v.v         v24, v4
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
>  .endr
> @@ -155,6 +189,8 @@ bilin_h_v  put, h, a5
>  bilin_h_v  avg, h, a5
>  bilin_h_v  put, v, a6
>  bilin_h_v  avg, v, a6
> +bilin_hv   put
> +bilin_hv   avg
> 
>  .macro func_bilin_h_v len, op, type
>  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -165,7 +201,7 @@ endfunc
> 
>  .irp len, 32, 16, 8, 4
>          .irp op, put, avg
> -                .irp type, h, v
> +                .irp type, h, v, hv
>                          func_bilin_h_v \len, \op, \type
>                  .endr
>          .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..b3700dfb08 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
>      dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
>      dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> +    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> +    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> +    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> +    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> +    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> +    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> +    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> +    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
> +    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
> +    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
> 
>  #undef init_fpel
>      }


-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv
  2024-06-24 20:07   ` Rémi Denis-Courmont
@ 2024-06-30 11:39     ` flow gg
  0 siblings, 0 replies; 15+ messages in thread
From: flow gg @ 2024-06-30 11:39 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Initially, I tried using `vnclip.wi`, following the h264 code:
-        vwadd.wx        v16, v16, t4
-        vnsra.wi        v16, v16, 4
+        vnclip.wi       v16, v16, 4

but I could not find a way to make it work correctly. I think there may be
overflow issues that I do not fully understand. How do you think it should
be replaced?

Rémi Denis-Courmont <remi@remlab.net> 于2024年6月25日周二 04:07写道：

> Le lauantaina 15. kesäkuuta 2024, 14.50.32 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> >                                                      C908   X60
> > vp9_avg_bilin_4hv_8bpp_c                           :   10.7    9.5
> > vp9_avg_bilin_4hv_8bpp_rvv_i32                     :    4.0    3.5
> > vp9_avg_bilin_8hv_8bpp_c                           :   38.5   34.2
> > vp9_avg_bilin_8hv_8bpp_rvv_i32                     :    7.2    6.5
> > vp9_avg_bilin_16hv_8bpp_c                          :  147.2  130.5
> > vp9_avg_bilin_16hv_8bpp_rvv_i32                    :   14.5   12.7
> > vp9_avg_bilin_32hv_8bpp_c                          :  574.2  509.7
> > vp9_avg_bilin_32hv_8bpp_rvv_i32                    :   42.5   38.0
> > vp9_avg_bilin_64hv_8bpp_c                          : 2321.2 2017.7
> > vp9_avg_bilin_64hv_8bpp_rvv_i32                    :  163.5  131.0
> > vp9_put_bilin_4hv_8bpp_c                           :   10.0    8.7
> > vp9_put_bilin_4hv_8bpp_rvv_i32                     :    3.5    3.0
> > vp9_put_bilin_8hv_8bpp_c                           :   35.2   31.2
> > vp9_put_bilin_8hv_8bpp_rvv_i32                     :    6.5    5.7
> > vp9_put_bilin_16hv_8bpp_c                          :  134.0  119.0
> > vp9_put_bilin_16hv_8bpp_rvv_i32                    :   12.7   11.5
> > vp9_put_bilin_32hv_8bpp_c                          :  538.5  464.2
> > vp9_put_bilin_32hv_8bpp_rvv_i32                    :   39.7   35.2
> > vp9_put_bilin_64hv_8bpp_c                          : 2111.7 1833.2
> > vp9_put_bilin_64hv_8bpp_rvv_i32                    :  138.5  122.5
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 38 +++++++++++++++++++++++++++++++++-
> >  libavcodec/riscv/vp9dsp_init.c | 10 +++++++++
> >  2 files changed, 47 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index fb7377048a..5241562531 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.macro bilin_hv op
> > +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> > +        vsetvlstatic8   64, t0, 64
> > +.Lbilin_hv\op:
> > +.ifc \op,avg
> > +        csrwi           vxrm, 0
> > +.endif
> > +        neg             t1, a5
> > +        neg             t2, a6
> > +        li              t4, 8
> > +        bilin_load_h    v24, put, a5
> > +        add             a2, a2, a3
> > +1:
> > +        addi            a4, a4, -1
> > +        bilin_load_h    v4, put, a5
> > +        vwmulu.vx       v16, v4, a6
> > +        vwmaccsu.vx     v16, t2, v24
> > +        vwadd.wx        v16, v16, t4
> > +        vnsra.wi        v16, v16, 4
>
> Why round manually?
> It looks like vnclip.wi would be more straightforward here.
>
> > +        vadd.vv         v0, v16, v24
> > +.ifc \op,avg
> > +        vle8.v          v16, (a0)
> > +        vaaddu.vv       v0, v0, v16
> > +.endif
> > +        vse8.v          v0, (a0)
> > +        vmv.v.v         v24, v4
> > +        add             a2, a2, a3
> > +        add             a0, a0, a1
> > +        bnez            a4, 1b
> > +
> > +        ret
> > +endfunc
> > +.endm
> > +
> >  .irp len, 64, 32, 16, 8, 4
> >          copy_avg \len
> >  .endr
> > @@ -155,6 +189,8 @@ bilin_h_v  put, h, a5
> >  bilin_h_v  avg, h, a5
> >  bilin_h_v  put, v, a6
> >  bilin_h_v  avg, v, a6
> > +bilin_hv   put
> > +bilin_hv   avg
> >
> >  .macro func_bilin_h_v len, op, type
> >  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> > @@ -165,7 +201,7 @@ endfunc
> >
> >  .irp len, 32, 16, 8, 4
> >          .irp op, put, avg
> > -                .irp type, h, v
> > +                .irp type, h, v, hv
> >                          func_bilin_h_v \len, \op, \type
> >                  .endr
> >          .endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 9606d8545f..b3700dfb08 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
> >      dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
> >      dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
> >      dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> > +    dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> > +    dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> > +    dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> > +    dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> > +    dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> > +    dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> > +    dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> > +    dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
> > +    dsp->mc[4][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_4hv_rvv;
> > +    dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
> >
> >  #undef init_fpel
> >      }
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
       [not found] <20240615115034.3891490-1-uk7b@foxmail.com>
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
@ 2024-06-15 11:50 ` uk7b
  2024-06-15 11:52   ` flow gg
  2024-07-13  9:02   ` Rémi Denis-Courmont
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
  2 siblings, 2 replies; 15+ messages in thread
From: uk7b @ 2024-06-15 11:50 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
---
 libavcodec/riscv/vp9_mc_rvv.S  | 200 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
 libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
 3 files changed, 285 insertions(+), 25 deletions(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 5241562531..5e81301aa5 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
 .endif
 .endm
 
+.macro vsetvlstatic16 len
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
 .macro copy_avg len
 func ff_vp9_avg\len\()_rvv, zve32x
         csrwi           vxrm, 0
@@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
 endfunc
 .endm
 
+.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
+.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
+.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
+
+.macro epel_filter name, type, regtype
+        lla             \regtype\()2, ff_vp9_subpel_filters_\name
+
+.ifc \type,v
+        slli            \regtype\()0, a6, 4
+.else
+        slli            \regtype\()0, a5, 4
+.endif
+        add             \regtype\()0, \regtype\()0, \regtype\()2
+
+        lh              \regtype\()1, 2(\regtype\()0)
+        lh              \regtype\()2, 4(\regtype\()0)
+        lh              \regtype\()3, 6(\regtype\()0)
+        lh              \regtype\()4, 8(\regtype\()0)
+        lh              \regtype\()5, 10(\regtype\()0)
+        lh              \regtype\()6, 12(\regtype\()0)
+
+.ifc \regtype,t
+        lh              a7, 14(\regtype\()0)
+.else
+        lh              s7, 14(\regtype\()0)
+.endif
+        lh              \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst, len, op, name, type, from_mem, regtype
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)
+.ifc \type,v
+        add             a5, a3, a2
+        sub             a2, a2, a3
+        vle8.v          v24, (a5)
+        vle8.v          v20, (a2)
+        sh1add          a2, a3, a5
+        add             a5, a5, a3
+        vle8.v          v26, (a5)
+        vle8.v          v28, (a2)
+        add             a2, a2, a3
+        vle8.v          v30, (a2)
+.else
+        addi            a5, a2, 1
+        addi            a2, a2, -1
+        vle8.v          v24, (a5)
+        vle8.v          v20, (a2)
+        addi            a5, a5, 2
+        addi            a2, a2, 3
+        vle8.v          v28, (a5)
+        vle8.v          v26, (a2)
+        addi            a2, a5, 1
+        vle8.v          v30, (a2)
+.endif
+
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.else
+        vwmaccsu.vx     v16, s7, v30
+.endif
+
+.ifc \type,v
+        sh1add          a5, a3, a3
+        sub             a2, a2, a5
+        sub             a2, a2, a5
+        sub             a5, a2, a3
+        vle8.v          v28, (a2)
+        vle8.v          v26, (a5)
+        sh1add          a2, a3, a2
+.else
+        addi            a5, a2, -7
+        addi            a2, a2, -6
+        vle8.v          v26, (a5)
+        vle8.v          v28, (a2)
+        addi            a2, a2, 2
+.endif
+
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.else
+        vwmaccsu.vx     v16, s7, v14
+.endif
+
+.endif
+        li              a5, 64
+        vwadd.wx        v16, v16, a5
+        vsetvlstatic16  \len
+
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+        vsetvlstatic8   \len, zero, 32, m2
+
+        vnclipu.wi      \dst, v24, 0
+.ifc \op,avg
+        vle8.v          v24, (a0)
+        vaaddu.vv       \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
+        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
+        add             a2, a2, a3
+.endm
+
+.macro epel len, op, name, type, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+        epel_filter     \name, \type, t
+.if \vlen < 256
+        vsetvlstatic8   \len, a5, 32, m2
+.else
+        vsetvlstatic8   \len, a5, 64, m2
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+.if \len == 64 && \vlen < 256
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+        addi            a0, a0, -32
+        addi            a2, a2, -32
+.endif
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
+        .irp op, put, avg
+                .irp name, regular, sharp, smooth
+                        .irp type, h, v
+                                epel \len, \op, \name, \type, 128
+                                epel \len, \op, \name, \type, 256
+                        .endr
+                .endr
+        .endr
 .endr
 
 bilin_h_v  put, h, a5
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
 void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                    const uint8_t *a);
 
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);             \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);
@@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                         const uint8_t *src, ptrdiff_t srcstride,   \
                         int h, int mx, int my);
 
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
 
 VP9_BILINEAR_RISCV_RVV_FUNC(64);
 VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index b3700dfb08..3669070fca 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 # endif
 
 #if HAVE_RVV
-    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+    int vlenb = ff_get_rv_vlenb();
+    if (vlenb >= 16) {
 
 #define init_fpel(idx1, sz)                                           \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
@@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
+
+    init_subpel2(0, 1, 0, h, put, 128);
+    init_subpel2(1, 1, 0, h, avg, 128);
+
+    if (flags & AV_CPU_FLAG_RVB_ADDR) {
+        init_subpel2(0, 0, 1, v, put, 128);
+        init_subpel2(1, 0, 1, v, avg, 128);
+    }
+
+    }
+    if (vlenb >= 32) {
+        init_subpel2(0, 1, 0, h, put, 256);
+        init_subpel2(1, 1, 0, h, avg, 256);
+
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            init_subpel2(0, 0, 1, v, put, 256);
+            init_subpel2(1, 0, 1, v, avg, 256);
+        }
+    }
     }
 #endif
 #endif
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-06-15 11:52   ` flow gg
  2024-07-13  9:02   ` Rémi Denis-Courmont
  1 sibling, 0 replies; 15+ messages in thread
From: flow gg @ 2024-06-15 11:52 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

> You can directly LLA filters + 16 * 8 * 2 and save one add. Same below.
> You can also use .equ to alias the filter addresses, and avoid if's.

> That's a lot of address dependencies, which is going to hurt performance.
> It might help to just spill more S registers if needed.

> This can be done in 3 instructions, even without mul. Of course you'll
> again need a spare register.

Okay, updated them

> Use a macro parameter for the stride register.

Doing this would remove one if-else statement in this patch, but it would
require adding several if-else statements in the next patch. I think we can
leave it unchanged.

<uk7b@foxmail.com> 于2024年6月15日周六 19:51写道：

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
>                                                      C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
> vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
> vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
> vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
> vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
> vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
> vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
> vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
> vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
> vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
> vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
> vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
> vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 200 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
>  libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
>  3 files changed, 285 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..5e81301aa5 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
>  endfunc
>  .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype
> +        lla             \regtype\()2, ff_vp9_subpel_filters_\name
> +
> +.ifc \type,v
> +        slli            \regtype\()0, a6, 4
> +.else
> +        slli            \regtype\()0, a5, 4
> +.endif
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +
> +        lh              \regtype\()1, 2(\regtype\()0)
> +        lh              \regtype\()2, 4(\regtype\()0)
> +        lh              \regtype\()3, 6(\regtype\()0)
> +        lh              \regtype\()4, 8(\regtype\()0)
> +        lh              \regtype\()5, 10(\regtype\()0)
> +        lh              \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> +        lh              a7, 14(\regtype\()0)
> +.else
> +        lh              s7, 14(\regtype\()0)
> +.endif
> +        lh              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        add             a5, a3, a2
> +        sub             a2, a2, a3
> +        vle8.v          v24, (a5)
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a5
> +        add             a5, a5, a3
> +        vle8.v          v26, (a5)
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a5, a2, 1
> +        addi            a2, a2, -1
> +        vle8.v          v24, (a5)
> +        vle8.v          v20, (a2)
> +        addi            a5, a5, 2
> +        addi            a2, a2, 3
> +        vle8.v          v28, (a5)
> +        vle8.v          v26, (a2)
> +        addi            a2, a5, 1
> +        vle8.v          v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        sh1add          a5, a3, a3
> +        sub             a2, a2, a5
> +        sub             a2, a2, a5
> +        sub             a5, a2, a3
> +        vle8.v          v28, (a2)
> +        vle8.v          v26, (a5)
> +        sh1add          a2, a3, a2
> +.else
> +        addi            a5, a2, -7
> +        addi            a2, a2, -6
> +        vle8.v          v26, (a5)
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 2
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        li              a5, 64
> +        vwadd.wx        v16, v16, a5
> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
> +        .irp op, put, avg
> +                .irp name, regular, sharp, smooth
> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
> +        .endr
>  .endr
>
>  bilin_h_v  put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const uint8_t *l,
>  void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                     const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>      \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)
>     \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>    \
> +                                         ptrdiff_t dststride,
>     \
>                                           const uint8_t *src,
>    \
>                                           ptrdiff_t srcstride,
>     \
>                                           int h, int mx, int my);
>    \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>    \
> +                                         ptrdiff_t dststride,
>     \
>                                           const uint8_t *src,
>    \
>                                           ptrdiff_t srcstride,
>     \
>                                           int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride,     \
>                          const uint8_t *src, ptrdiff_t srcstride,   \
>                          int h, int mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..3669070fca 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  # endif
>
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    int vlenb = ff_get_rv_vlenb();
> +    if (vlenb >= 16) {
>
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (vlenb >= 32) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
> +    }
>      }
>  #endif
>  #endif
> --
> 2.45.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
  2024-06-15 11:52   ` flow gg
@ 2024-07-13  9:02   ` Rémi Denis-Courmont
  2024-07-23  8:51     ` uk7b
  2024-07-23  8:56     ` flow gg
  1 sibling, 2 replies; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-13  9:02 UTC (permalink / raw)
  To: ffmpeg-devel

Le lauantaina 15. kesäkuuta 2024, 14.50.33 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>

OK, so I realise that this review is very late, but...

TBH it is very hard to review this due to the large extent of code 
conditionals. This should be avoidable at least partly. You can name macros for 
each filter and then expand those macros instead of using if's.

Besides in my experience, it is more readable to leave the loads/stores to the 
outer function or macros and factor only the calculations, whenever you need 
to apply the same maths vertically and/or horizontally. This also sometimes 
enables actually using shared code, e.g., the H.263 loop filter or the VC-1 
ITX.

Lastly this seems to both add new optimisations *and* add specialisations for 
256-bit vectors, which really should be separate patches, but maybe I just 
don't understand the code. In any case, that would not really match with the 
patch description.


>                                                      C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
> vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
> vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
> vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
> vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
> vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
> vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
> vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
> vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
> vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
> vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
> vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
> vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 200 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
>  libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
>  3 files changed, 285 insertions(+), 25 deletions(-)
> 
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..5e81301aa5 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
> 
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
>  endfunc
>  .endm
> 
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype
> +        lla             \regtype\()2, ff_vp9_subpel_filters_\name
> +
> +.ifc \type,v
> +        slli            \regtype\()0, a6, 4
> +.else
> +        slli            \regtype\()0, a5, 4
> +.endif
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +
> +        lh              \regtype\()1, 2(\regtype\()0)
> +        lh              \regtype\()2, 4(\regtype\()0)
> +        lh              \regtype\()3, 6(\regtype\()0)
> +        lh              \regtype\()4, 8(\regtype\()0)
> +        lh              \regtype\()5, 10(\regtype\()0)
> +        lh              \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> +        lh              a7, 14(\regtype\()0)
> +.else
> +        lh              s7, 14(\regtype\()0)
> +.endif
> +        lh              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        add             a5, a3, a2
> +        sub             a2, a2, a3
> +        vle8.v          v24, (a5)
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a5
> +        add             a5, a5, a3
> +        vle8.v          v26, (a5)
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a5, a2, 1
> +        addi            a2, a2, -1
> +        vle8.v          v24, (a5)
> +        vle8.v          v20, (a2)
> +        addi            a5, a5, 2
> +        addi            a2, a2, 3
> +        vle8.v          v28, (a5)
> +        vle8.v          v26, (a2)
> +        addi            a2, a5, 1
> +        vle8.v          v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        sh1add          a5, a3, a3
> +        sub             a2, a2, a5
> +        sub             a2, a2, a5
> +        sub             a5, a2, a3
> +        vle8.v          v28, (a2)
> +        vle8.v          v26, (a5)
> +        sh1add          a2, a3, a2
> +.else
> +        addi            a5, a2, -7
> +        addi            a2, a2, -6
> +        vle8.v          v26, (a5)
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 2
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        li              a5, 64
> +        vwadd.wx        v16, v16, a5
> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
> +        .irp op, put, avg
> +                .irp name, regular, sharp, smooth
> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
> +        .endr
>  .endr
> 
>  bilin_h_v  put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>  void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                     const uint8_t *a);
> 
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
> +                                         ptrdiff_t dststride,                \
>                                           const uint8_t *src,                 \
>                                           ptrdiff_t srcstride,                \
>                                           int h, int mx, int my);             \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
> +                                         ptrdiff_t dststride,                \
>                                           const uint8_t *src,                 \
>                                           ptrdiff_t srcstride,                \
>                                           int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
>                          const uint8_t *src, ptrdiff_t srcstride,   \
>                          int h, int mx, int my);
> 
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> 
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..3669070fca 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>  # endif
> 
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    int vlenb = ff_get_rv_vlenb();
> +    if (vlenb >= 16) {
> 
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
> 
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (vlenb >= 32) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
> +    }
>      }
>  #endif
>  #endif


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
  2024-07-13  9:02   ` Rémi Denis-Courmont
@ 2024-07-23  8:51     ` uk7b
  2024-07-29 15:20       ` Rémi Denis-Courmont
  2024-07-23  8:56     ` flow gg
  1 sibling, 1 reply; 15+ messages in thread
From: uk7b @ 2024-07-23  8:51 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
---
 libavcodec/riscv/vp9_mc_rvv.S  | 193 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
 libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
 3 files changed, 278 insertions(+), 25 deletions(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 5241562531..6a4be7b9bd 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@
 .endif
 .endm
 
+.macro vsetvlstatic16 len
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
 .macro copy_avg len
 func ff_vp9_avg\len\()_rvv, zve32x
         csrwi           vxrm, 0
@@ -181,8 +193,189 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
 endfunc
 .endm
 
+.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
+.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
+.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
+
+.macro epel_filter name, type, regtype, arg
+        lla             \regtype\()2, ff_vp9_subpel_filters_\name
+.ifc \type,v
+        slli            \regtype\()0, a6, 4
+.else
+        slli            \regtype\()0, a5, 4
+.endif
+        add             \regtype\()0, \regtype\()0, \regtype\()2
+        lh              \regtype\()1, 2(\regtype\()0)
+        lh              \regtype\()2, 4(\regtype\()0)
+        lh              \regtype\()3, 6(\regtype\()0)
+        lh              \regtype\()4, 8(\regtype\()0)
+        lh              \regtype\()5, 10(\regtype\()0)
+        lh              \regtype\()6, 12(\regtype\()0)
+        lh              \arg, 14(\regtype\()0)
+        lh              \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst, len, op, name, type, from_mem, regtype
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)
+.ifc \type,v
+        add             a5, a3, a2
+        sub             a2, a2, a3
+        vle8.v          v24, (a5)
+        vle8.v          v20, (a2)
+        sh1add          a2, a3, a5
+        add             a5, a5, a3
+        vle8.v          v26, (a5)
+        vle8.v          v28, (a2)
+        add             a2, a2, a3
+        vle8.v          v30, (a2)
+.else
+        addi            a5, a2, 1
+        addi            a2, a2, -1
+        vle8.v          v24, (a5)
+        vle8.v          v20, (a2)
+        addi            a5, a5, 2
+        addi            a2, a2, 3
+        vle8.v          v28, (a5)
+        vle8.v          v26, (a2)
+        addi            a2, a5, 1
+        vle8.v          v30, (a2)
+.endif
+
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.else
+        vwmaccsu.vx     v16, s7, v30
+.endif
+
+.ifc \type,v
+        sh1add          a5, a3, a3
+        sub             a2, a2, a5
+        sub             a2, a2, a5
+        sub             a5, a2, a3
+        vle8.v          v28, (a2)
+        vle8.v          v26, (a5)
+        sh1add          a2, a3, a2
+.else
+        addi            a5, a2, -7
+        addi            a2, a2, -6
+        vle8.v          v26, (a5)
+        vle8.v          v28, (a2)
+        addi            a2, a2, 2
+.endif
+
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.else
+        vwmaccsu.vx     v16, s7, v14
+.endif
+
+.endif
+        li              a5, 64
+        vwadd.wx        v16, v16, a5
+        vsetvlstatic16  \len
+
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+        vsetvlstatic8   \len, zero, 32, m2
+
+        vnclipu.wi      \dst, v24, 0
+.ifc \op,avg
+        vle8.v          v24, (a0)
+        vaaddu.vv       \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
+        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
+        add             a2, a2, a3
+.endm
+
+.macro epel len, op, name, type, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+        epel_filter     \name, \type, t, a7
+.if \vlen < 256
+        vsetvlstatic8   \len, a5, 32, m2
+.else
+        vsetvlstatic8   \len, a5, 64, m2
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+.if \len == 64 && \vlen < 256
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+        addi            a0, a0, -32
+        addi            a2, a2, -32
+.endif
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
+        .irp op, put, avg
+                .irp name, regular, sharp, smooth
+                        .irp type, h, v
+                                epel \len, \op, \name, \type, 128
+                                epel \len, \op, \name, \type, 256
+                        .endr
+                .endr
+        .endr
 .endr
 
 bilin_h_v  put, h, a5
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
 void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                    const uint8_t *a);
 
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);             \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);
@@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                         const uint8_t *src, ptrdiff_t srcstride,   \
                         int h, int mx, int my);
 
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
 
 VP9_BILINEAR_RISCV_RVV_FUNC(64);
 VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index b3700dfb08..3669070fca 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 # endif
 
 #if HAVE_RVV
-    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+    int vlenb = ff_get_rv_vlenb();
+    if (vlenb >= 16) {
 
 #define init_fpel(idx1, sz)                                           \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
@@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
+
+    init_subpel2(0, 1, 0, h, put, 128);
+    init_subpel2(1, 1, 0, h, avg, 128);
+
+    if (flags & AV_CPU_FLAG_RVB_ADDR) {
+        init_subpel2(0, 0, 1, v, put, 128);
+        init_subpel2(1, 0, 1, v, avg, 128);
+    }
+
+    }
+    if (vlenb >= 32) {
+        init_subpel2(0, 1, 0, h, put, 256);
+        init_subpel2(1, 1, 0, h, avg, 256);
+
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            init_subpel2(0, 0, 1, v, put, 256);
+            init_subpel2(1, 0, 1, v, avg, 256);
+        }
+    }
     }
 #endif
 #endif
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
  2024-07-23  8:51     ` uk7b
@ 2024-07-29 15:20       ` Rémi Denis-Courmont
  2024-07-31 10:36         ` flow gg
  0 siblings, 1 reply; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-29 15:20 UTC (permalink / raw)
  To: ffmpeg-devel

Le tiistaina 23. heinäkuuta 2024, 11.51.48 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
>                                                      C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
> vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
> vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
> vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
> vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
> vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
> vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
> vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
> vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
> vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
> vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
> vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
> vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 193 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
>  libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
>  3 files changed, 278 insertions(+), 25 deletions(-)
> 
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..6a4be7b9bd 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
> 
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -181,8 +193,189 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
>  endfunc
>  .endm
> 
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype, arg
> +        lla             \regtype\()2, ff_vp9_subpel_filters_\name
> +.ifc \type,v
> +        slli            \regtype\()0, a6, 4
> +.else
> +        slli            \regtype\()0, a5, 4
> +.endif
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +        lh              \regtype\()1, 2(\regtype\()0)
> +        lh              \regtype\()2, 4(\regtype\()0)
> +        lh              \regtype\()3, 6(\regtype\()0)
> +        lh              \regtype\()4, 8(\regtype\()0)
> +        lh              \regtype\()5, 10(\regtype\()0)
> +        lh              \regtype\()6, 12(\regtype\()0)
> +        lh              \arg, 14(\regtype\()0)
> +        lh              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        add             a5, a3, a2
> +        sub             a2, a2, a3
> +        vle8.v          v24, (a5)
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a5
> +        add             a5, a5, a3
> +        vle8.v          v26, (a5)
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a5, a2, 1
> +        addi            a2, a2, -1
> +        vle8.v          v24, (a5)
> +        vle8.v          v20, (a2)
> +        addi            a5, a5, 2
> +        addi            a2, a2, 3
> +        vle8.v          v28, (a5)
> +        vle8.v          v26, (a2)
> +        addi            a2, a5, 1
> +        vle8.v          v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        sh1add          a5, a3, a3
> +        sub             a2, a2, a5
> +        sub             a2, a2, a5
> +        sub             a5, a2, a3
> +        vle8.v          v28, (a2)
> +        vle8.v          v26, (a5)
> +        sh1add          a2, a3, a2
> +.else
> +        addi            a5, a2, -7
> +        addi            a2, a2, -6
> +        vle8.v          v26, (a5)
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 2
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        li              a5, 64
> +        vwadd.wx        v16, v16, a5

Use rounding.

> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t, a7
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
> +        .irp op, put, avg
> +                .irp name, regular, sharp, smooth
> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
> +        .endr
>  .endr
> 
>  bilin_h_v  put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> uint8_t *l, const uint8_t *a);
> 
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                      
>   \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> min_vlen)              \ +void
> ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \ +
>                                        ptrdiff_t dststride,                
> \ const uint8_t *src,                  \ ptrdiff_t srcstride,              
>   \ int h, int mx, int my);              \ \ -void
> ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,       
>   \ +                                        ptrdiff_t dststride,          
>       \ const uint8_t *src,                  \ ptrdiff_t srcstride,        
>         \ int h, int mx, int my);              \ \ -void
> ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,      
>   \ +                                         ptrdiff_t dststride,         
>       \ const uint8_t *src,                 \ ptrdiff_t srcstride,         
>       \ int h, int mx, int my);             \ \ -void
> ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,       
>   \ +                                        ptrdiff_t dststride,          
>       \ const uint8_t *src,                  \ ptrdiff_t srcstride,        
>         \ int h, int mx, int my);              \ \ -void
> ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,       
>   \ +                                        ptrdiff_t dststride,          
>       \ const uint8_t *src,                  \ ptrdiff_t srcstride,        
>         \ int h, int mx, int my);              \ \ -void
> ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,      
>   \ +                                         ptrdiff_t dststride,         
>       \ const uint8_t *src,                 \ ptrdiff_t srcstride,         
>       \ int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride,     \ const uint8_t *src, ptrdiff_t srcstride,   \ int h, int
> mx, int my);
> 
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> 
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..3669070fca 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) # endif
> 
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128))
> { +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    int vlenb = ff_get_rv_vlenb();
> +    if (vlenb >= 16) {
> 
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> ff_avg_vp9_bilin_4hv_rvv;
> 
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (vlenb >= 32) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
> +    }
>      }
>  #endif
>  #endif


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
  2024-07-29 15:20       ` Rémi Denis-Courmont
@ 2024-07-31 10:36         ` flow gg
  2024-07-31 19:10           ` Rémi Denis-Courmont
  0 siblings, 1 reply; 15+ messages in thread
From: flow gg @ 2024-07-31 10:36 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

I'm a bit confused, because the calculation here widens to 32 bits and then
narrows back to 8 bits. It seems that the vmax and vnclipu instructions can't
be removed by using the rounding-related instructions?

Rémi Denis-Courmont <remi@remlab.net> 于2024年7月29日周一 23:21写道：

> Le tiistaina 23. heinäkuuta 2024, 11.51.48 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> >                                                      C908   X60
> > vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
> > vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
> > vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
> > vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> > vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
> > vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
> > vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
> > vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
> > vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
> > vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
> > vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
> > vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> > vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
> > vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
> > vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> > vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
> > vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
> > vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
> > vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 193 +++++++++++++++++++++++++++++++++
> >  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
> >  libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
> >  3 files changed, 278 insertions(+), 25 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 5241562531..6a4be7b9bd 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -36,6 +36,18 @@
> >  .endif
> >  .endm
> >
> > +.macro vsetvlstatic16 len
> > +.ifc \len,4
> > +        vsetvli         zero, zero, e16, mf2, ta, ma
> > +.elseif \len == 8
> > +        vsetvli         zero, zero, e16, m1, ta, ma
> > +.elseif \len == 16
> > +        vsetvli         zero, zero, e16, m2, ta, ma
> > +.else
> > +        vsetvli         zero, zero, e16, m4, ta, ma
> > +.endif
> > +.endm
> > +
> >  .macro copy_avg len
> >  func ff_vp9_avg\len\()_rvv, zve32x
> >          csrwi           vxrm, 0
> > @@ -181,8 +193,189 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> > +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> > +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> > +
> > +.macro epel_filter name, type, regtype, arg
> > +        lla             \regtype\()2, ff_vp9_subpel_filters_\name
> > +.ifc \type,v
> > +        slli            \regtype\()0, a6, 4
> > +.else
> > +        slli            \regtype\()0, a5, 4
> > +.endif
> > +        add             \regtype\()0, \regtype\()0, \regtype\()2
> > +        lh              \regtype\()1, 2(\regtype\()0)
> > +        lh              \regtype\()2, 4(\regtype\()0)
> > +        lh              \regtype\()3, 6(\regtype\()0)
> > +        lh              \regtype\()4, 8(\regtype\()0)
> > +        lh              \regtype\()5, 10(\regtype\()0)
> > +        lh              \regtype\()6, 12(\regtype\()0)
> > +        lh              \arg, 14(\regtype\()0)
> > +        lh              \regtype\()0, 0(\regtype\()0)
> > +.endm
> > +
> > +.macro epel_load dst, len, op, name, type, from_mem, regtype
> > +.ifc \from_mem, 1
> > +        vle8.v          v22, (a2)
> > +.ifc \type,v
> > +        add             a5, a3, a2
> > +        sub             a2, a2, a3
> > +        vle8.v          v24, (a5)
> > +        vle8.v          v20, (a2)
> > +        sh1add          a2, a3, a5
> > +        add             a5, a5, a3
> > +        vle8.v          v26, (a5)
> > +        vle8.v          v28, (a2)
> > +        add             a2, a2, a3
> > +        vle8.v          v30, (a2)
> > +.else
> > +        addi            a5, a2, 1
> > +        addi            a2, a2, -1
> > +        vle8.v          v24, (a5)
> > +        vle8.v          v20, (a2)
> > +        addi            a5, a5, 2
> > +        addi            a2, a2, 3
> > +        vle8.v          v28, (a5)
> > +        vle8.v          v26, (a2)
> > +        addi            a2, a5, 1
> > +        vle8.v          v30, (a2)
> > +.endif
> > +
> > +.ifc \name,smooth
> > +        vwmulu.vx       v16, v24, \regtype\()4
> > +        vwmaccu.vx      v16, \regtype\()2, v20
> > +        vwmaccu.vx      v16, \regtype\()5, v26
> > +        vwmaccsu.vx     v16, \regtype\()6, v28
> > +.else
> > +        vwmulu.vx       v16, v28, \regtype\()6
> > +        vwmaccsu.vx     v16, \regtype\()2, v20
> > +        vwmaccsu.vx     v16, \regtype\()5, v26
> > +.endif
> > +
> > +.ifc \regtype,t
> > +        vwmaccsu.vx     v16, a7, v30
> > +.else
> > +        vwmaccsu.vx     v16, s7, v30
> > +.endif
> > +
> > +.ifc \type,v
> > +        sh1add          a5, a3, a3
> > +        sub             a2, a2, a5
> > +        sub             a2, a2, a5
> > +        sub             a5, a2, a3
> > +        vle8.v          v28, (a2)
> > +        vle8.v          v26, (a5)
> > +        sh1add          a2, a3, a2
> > +.else
> > +        addi            a5, a2, -7
> > +        addi            a2, a2, -6
> > +        vle8.v          v26, (a5)
> > +        vle8.v          v28, (a2)
> > +        addi            a2, a2, 2
> > +.endif
> > +
> > +.ifc \name,smooth
> > +        vwmaccsu.vx     v16, \regtype\()1, v28
> > +.else
> > +        vwmaccu.vx      v16, \regtype\()1, v28
> > +        vwmulu.vx       v28, v24, \regtype\()4
> > +.endif
> > +        vwmaccsu.vx     v16, \regtype\()0, v26
> > +        vwmulu.vx       v20, v22, \regtype\()3
> > +.else
> > +.ifc \name,smooth
> > +        vwmulu.vx       v16, v8, \regtype\()4
> > +        vwmaccu.vx      v16, \regtype\()2, v4
> > +        vwmaccu.vx      v16, \regtype\()5, v10
> > +        vwmaccsu.vx     v16, \regtype\()6, v12
> > +        vwmaccsu.vx     v16, \regtype\()1, v2
> > +.else
> > +        vwmulu.vx       v16, v2, \regtype\()1
> > +        vwmaccu.vx      v16, \regtype\()6, v12
> > +        vwmaccsu.vx     v16, \regtype\()5, v10
> > +        vwmaccsu.vx     v16, \regtype\()2, v4
> > +        vwmulu.vx       v28, v8, \regtype\()4
> > +.endif
> > +        vwmaccsu.vx     v16, \regtype\()0, v0
> > +        vwmulu.vx       v20, v6, \regtype\()3
> > +
> > +.ifc \regtype,t
> > +        vwmaccsu.vx     v16, a7, v14
> > +.else
> > +        vwmaccsu.vx     v16, s7, v14
> > +.endif
> > +
> > +.endif
> > +        li              a5, 64
> > +        vwadd.wx        v16, v16, a5
>
> Use rounding.
>
> > +        vsetvlstatic16  \len
> > +
> > +.ifc \name,smooth
> > +        vwadd.vv        v24, v16, v20
> > +.else
> > +        vwadd.vv        v24, v16, v28
> > +        vwadd.wv        v24, v24, v20
> > +.endif
> > +        vnsra.wi        v24, v24, 7
> > +        vmax.vx         v24, v24, zero
> > +        vsetvlstatic8   \len, zero, 32, m2
> > +
> > +        vnclipu.wi      \dst, v24, 0
> > +.ifc \op,avg
> > +        vle8.v          v24, (a0)
> > +        vaaddu.vv       \dst, \dst, v24
> > +.endif
> > +
> > +.endm
> > +
> > +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> > +        epel_load       \dst, \len, \op, \name, \type, \from_mem,
> \regtype
> > +        add             a2, a2, a3
> > +.endm
> > +
> > +.macro epel len, op, name, type, vlen
> > +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> > +        epel_filter     \name, \type, t, a7
> > +.if \vlen < 256
> > +        vsetvlstatic8   \len, a5, 32, m2
> > +.else
> > +        vsetvlstatic8   \len, a5, 64, m2
> > +.endif
> > +.ifc \op,avg
> > +        csrwi           vxrm, 0
> > +.endif
> > +
> > +1:
> > +        addi            a4, a4, -1
> > +        epel_load       v30, \len, \op, \name, \type, 1, t
> > +        vse8.v          v30, (a0)
> > +.if \len == 64 && \vlen < 256
> > +        addi            a0, a0, 32
> > +        addi            a2, a2, 32
> > +        epel_load       v30, \len, \op, \name, \type, 1, t
> > +        vse8.v          v30, (a0)
> > +        addi            a0, a0, -32
> > +        addi            a2, a2, -32
> > +.endif
> > +        add             a2, a2, a3
> > +        add             a0, a0, a1
> > +        bnez            a4, 1b
> > +
> > +        ret
> > +endfunc
> > +.endm
> > +
> >  .irp len, 64, 32, 16, 8, 4
> >          copy_avg \len
> > +        .irp op, put, avg
> > +                .irp name, regular, sharp, smooth
> > +                        .irp type, h, v
> > +                                epel \len, \op, \name, \type, 128
> > +                                epel \len, \op, \name, \type, 256
> > +                        .endr
> > +                .endr
> > +        .endr
> >  .endr
> >
> >  bilin_h_v  put, h, a5
> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> > index 8fb326dae0..5fd64a1b8c 100644
> > --- a/libavcodec/riscv/vp9dsp.h
> > +++ b/libavcodec/riscv/vp9dsp.h
> > @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const
> > uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> > uint8_t *l, const uint8_t *a);
> >
> > -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>
> >   \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> > dststride,   \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> > min_vlen)              \ +void
> > ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \ +
> >                                        ptrdiff_t dststride,
>
> > \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >   \ int h, int mx, int my);              \ \ -void
> > ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> > +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                         ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                 \ ptrdiff_t srcstride,
>
> >       \ int h, int mx, int my);             \ \ -void
> > ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                         ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                 \ ptrdiff_t srcstride,
>
> >       \ int h, int mx, int my);
> > @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> > dststride,     \ const uint8_t *src, ptrdiff_t srcstride,   \ int h, int
> > mx, int my);
> >
> > -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> >
> >  VP9_BILINEAR_RISCV_RVV_FUNC(64);
> >  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index b3700dfb08..3669070fca 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) # endif
> >
> >  #if HAVE_RVV
> > -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128))
> > { +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> > +    int vlenb = ff_get_rv_vlenb();
> > +    if (vlenb >= 16) {
> >
> >  #define init_fpel(idx1, sz)                                           \
> >      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] =
> ff_vp9_avg##sz##_rvv;  \
> > @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_4hv_rvv;
> >
> >  #undef init_fpel
> > +
> > +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> > +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> > +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> > +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> > +
> > +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> > +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> > +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> > +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> > +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> > +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> > +
> > +    init_subpel2(0, 1, 0, h, put, 128);
> > +    init_subpel2(1, 1, 0, h, avg, 128);
> > +
> > +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > +        init_subpel2(0, 0, 1, v, put, 128);
> > +        init_subpel2(1, 0, 1, v, avg, 128);
> > +    }
> > +
> > +    }
> > +    if (vlenb >= 32) {
> > +        init_subpel2(0, 1, 0, h, put, 256);
> > +        init_subpel2(1, 1, 0, h, avg, 256);
> > +
> > +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > +            init_subpel2(0, 0, 1, v, put, 256);
> > +            init_subpel2(1, 0, 1, v, avg, 256);
> > +        }
> > +    }
> >      }
> >  #endif
> >  #endif
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
  2024-07-31 10:36         ` flow gg
@ 2024-07-31 19:10           ` Rémi Denis-Courmont
  0 siblings, 0 replies; 15+ messages in thread
From: Rémi Denis-Courmont @ 2024-07-31 19:10 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le keskiviikkona 31. heinäkuuta 2024, 13.36.00 EEST flow gg a écrit :
> I'm a bit confused, because the calculation here widens to 32 bits and then
> narrows back to 8 bits. It seems that the vmax and vnclipu instructions can't
> be removed by using the rounding-related instructions?

You seem to be adding 64 and then dividing by 128. That is rounding to nearest
(ties rounded up), which the rounding fixed-point instructions can do directly.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
  2024-07-13  9:02   ` Rémi Denis-Courmont
  2024-07-23  8:51     ` uk7b
@ 2024-07-23  8:56     ` flow gg
  1 sibling, 0 replies; 15+ messages in thread
From: flow gg @ 2024-07-23  8:56 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

> TBH it is very hard to review this due to the large extents of code
> conditionals. This should be avoidable at least partly. You can name macros
> for each filter and then expand those macros instead of using if's.

Do you mean that, before the addition of .equ ff_vp9_subpel_filters_xxx,
epel_filter had too many `if` statements?
Now the filter has only two `if` statements. Anyway, I have updated it and
removed one more `if`.

> Besides in my experience, it is more readable to leave the loads/stores to
> the outer function or macros and factor only the calculations, whenever you
> need to apply the same maths vertically and/or horizontally. This also
> sometimes enables actually using shared code, e.g., the H.263 loop filter or
> the VC-1 ITX.

There is an issue here: there are not enough vector registers, so vector
registers need to be reused.
If we used the H.263 method, it would require two more jumps.
Additionally, there are not enough scalar registers either, so more stack
space would be needed.
I want to implement this as a single macro for lengths of 4, 8, 16, 32, and 64
first.
In a subsequent patch, I will split it into one macro for 4, 8, and 16, and
another macro for 32 and 64.
This way, code can be shared better, and some other adjustments (such as using
vlseg) can be made.

> Lastly this seems to both add new optimisations *and* add specialisations
> for 256-bit vectors, which really should be separate patches, but maybe I
> just don't understand the code. In any case, that would not really match
> with the patch description.

I think the purpose of this patch is to implement the RVV functions for both
128-bit and 256-bit vectors, so couldn't adding the corresponding 128-bit and
256-bit function declarations to vp9dsp.h also be part of this patch?

Rémi Denis-Courmont <remi@remlab.net> 于2024年7月13日周六 17:02写道：

> Le lauantaina 15. kesäkuuta 2024, 14.50.33 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> OK, so I realise that this review is very late, but...
>
> TBH it is very hard to review this due to the large extents of code
> conditionals. This should be avoidable at least partly. You can name macros
> for each filter and then expand those macros instead of using if's.
>
> Besides in my experience, it is more readable to leave the loads/stores to
> the
> outer function or macros and factor only the calculations, whenever you
> need
> to apply the same maths vertically and/or horizontally. This also
> sometimes
> enables actually using shared code, e.g., the H.263 loop filter or the
> VC-1
> ITX.
>
> Lastly this seems to both add new optimisations *and* add specialisations
> for
> 256-bit vectors, which really should be separate patches, but maybe I just
> don't understand the code. In any case, that would not really match with
> the
> patch description.
>
>
> >                                                      C908   X60
> > vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
> > vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
> > vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
> > vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> > vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
> > vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
> > vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
> > vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
> > vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
> > vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
> > vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
> > vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> > vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
> > vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
> > vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> > vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
> > vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
> > vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
> > vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 200 +++++++++++++++++++++++++++++++++
> >  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
> >  libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
> >  3 files changed, 285 insertions(+), 25 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 5241562531..5e81301aa5 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -36,6 +36,18 @@
> >  .endif
> >  .endm
> >
> > +.macro vsetvlstatic16 len
> > +.ifc \len,4
> > +        vsetvli         zero, zero, e16, mf2, ta, ma
> > +.elseif \len == 8
> > +        vsetvli         zero, zero, e16, m1, ta, ma
> > +.elseif \len == 16
> > +        vsetvli         zero, zero, e16, m2, ta, ma
> > +.else
> > +        vsetvli         zero, zero, e16, m4, ta, ma
> > +.endif
> > +.endm
> > +
> >  .macro copy_avg len
> >  func ff_vp9_avg\len\()_rvv, zve32x
> >          csrwi           vxrm, 0
> > @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> > +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> > +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> > +
> > +.macro epel_filter name, type, regtype
> > +        lla             \regtype\()2, ff_vp9_subpel_filters_\name
> > +
> > +.ifc \type,v
> > +        slli            \regtype\()0, a6, 4
> > +.else
> > +        slli            \regtype\()0, a5, 4
> > +.endif
> > +        add             \regtype\()0, \regtype\()0, \regtype\()2
> > +
> > +        lh              \regtype\()1, 2(\regtype\()0)
> > +        lh              \regtype\()2, 4(\regtype\()0)
> > +        lh              \regtype\()3, 6(\regtype\()0)
> > +        lh              \regtype\()4, 8(\regtype\()0)
> > +        lh              \regtype\()5, 10(\regtype\()0)
> > +        lh              \regtype\()6, 12(\regtype\()0)
> > +
> > +.ifc \regtype,t
> > +        lh              a7, 14(\regtype\()0)
> > +.else
> > +        lh              s7, 14(\regtype\()0)
> > +.endif
> > +        lh              \regtype\()0, 0(\regtype\()0)
> > +.endm
> > +
> > +.macro epel_load dst, len, op, name, type, from_mem, regtype
> > +.ifc \from_mem, 1
> > +        vle8.v          v22, (a2)
> > +.ifc \type,v
> > +        add             a5, a3, a2
> > +        sub             a2, a2, a3
> > +        vle8.v          v24, (a5)
> > +        vle8.v          v20, (a2)
> > +        sh1add          a2, a3, a5
> > +        add             a5, a5, a3
> > +        vle8.v          v26, (a5)
> > +        vle8.v          v28, (a2)
> > +        add             a2, a2, a3
> > +        vle8.v          v30, (a2)
> > +.else
> > +        addi            a5, a2, 1
> > +        addi            a2, a2, -1
> > +        vle8.v          v24, (a5)
> > +        vle8.v          v20, (a2)
> > +        addi            a5, a5, 2
> > +        addi            a2, a2, 3
> > +        vle8.v          v28, (a5)
> > +        vle8.v          v26, (a2)
> > +        addi            a2, a5, 1
> > +        vle8.v          v30, (a2)
> > +.endif
> > +
> > +.ifc \name,smooth
> > +        vwmulu.vx       v16, v24, \regtype\()4
> > +        vwmaccu.vx      v16, \regtype\()2, v20
> > +        vwmaccu.vx      v16, \regtype\()5, v26
> > +        vwmaccsu.vx     v16, \regtype\()6, v28
> > +.else
> > +        vwmulu.vx       v16, v28, \regtype\()6
> > +        vwmaccsu.vx     v16, \regtype\()2, v20
> > +        vwmaccsu.vx     v16, \regtype\()5, v26
> > +.endif
> > +
> > +.ifc \regtype,t
> > +        vwmaccsu.vx     v16, a7, v30
> > +.else
> > +        vwmaccsu.vx     v16, s7, v30
> > +.endif
> > +
> > +.ifc \type,v
> > +        sh1add          a5, a3, a3
> > +        sub             a2, a2, a5
> > +        sub             a2, a2, a5
> > +        sub             a5, a2, a3
> > +        vle8.v          v28, (a2)
> > +        vle8.v          v26, (a5)
> > +        sh1add          a2, a3, a2
> > +.else
> > +        addi            a5, a2, -7
> > +        addi            a2, a2, -6
> > +        vle8.v          v26, (a5)
> > +        vle8.v          v28, (a2)
> > +        addi            a2, a2, 2
> > +.endif
> > +
> > +.ifc \name,smooth
> > +        vwmaccsu.vx     v16, \regtype\()1, v28
> > +.else
> > +        vwmaccu.vx      v16, \regtype\()1, v28
> > +        vwmulu.vx       v28, v24, \regtype\()4
> > +.endif
> > +        vwmaccsu.vx     v16, \regtype\()0, v26
> > +        vwmulu.vx       v20, v22, \regtype\()3
> > +.else
> > +.ifc \name,smooth
> > +        vwmulu.vx       v16, v8, \regtype\()4
> > +        vwmaccu.vx      v16, \regtype\()2, v4
> > +        vwmaccu.vx      v16, \regtype\()5, v10
> > +        vwmaccsu.vx     v16, \regtype\()6, v12
> > +        vwmaccsu.vx     v16, \regtype\()1, v2
> > +.else
> > +        vwmulu.vx       v16, v2, \regtype\()1
> > +        vwmaccu.vx      v16, \regtype\()6, v12
> > +        vwmaccsu.vx     v16, \regtype\()5, v10
> > +        vwmaccsu.vx     v16, \regtype\()2, v4
> > +        vwmulu.vx       v28, v8, \regtype\()4
> > +.endif
> > +        vwmaccsu.vx     v16, \regtype\()0, v0
> > +        vwmulu.vx       v20, v6, \regtype\()3
> > +
> > +.ifc \regtype,t
> > +        vwmaccsu.vx     v16, a7, v14
> > +.else
> > +        vwmaccsu.vx     v16, s7, v14
> > +.endif
> > +
> > +.endif
> > +        li              a5, 64
> > +        vwadd.wx        v16, v16, a5
> > +        vsetvlstatic16  \len
> > +
> > +.ifc \name,smooth
> > +        vwadd.vv        v24, v16, v20
> > +.else
> > +        vwadd.vv        v24, v16, v28
> > +        vwadd.wv        v24, v24, v20
> > +.endif
> > +        vnsra.wi        v24, v24, 7
> > +        vmax.vx         v24, v24, zero
> > +        vsetvlstatic8   \len, zero, 32, m2
> > +
> > +        vnclipu.wi      \dst, v24, 0
> > +.ifc \op,avg
> > +        vle8.v          v24, (a0)
> > +        vaaddu.vv       \dst, \dst, v24
> > +.endif
> > +
> > +.endm
> > +
> > +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> > +        epel_load       \dst, \len, \op, \name, \type, \from_mem,
> \regtype
> > +        add             a2, a2, a3
> > +.endm
> > +
> > +.macro epel len, op, name, type, vlen
> > +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> > +        epel_filter     \name, \type, t
> > +.if \vlen < 256
> > +        vsetvlstatic8   \len, a5, 32, m2
> > +.else
> > +        vsetvlstatic8   \len, a5, 64, m2
> > +.endif
> > +.ifc \op,avg
> > +        csrwi           vxrm, 0
> > +.endif
> > +
> > +1:
> > +        addi            a4, a4, -1
> > +        epel_load       v30, \len, \op, \name, \type, 1, t
> > +        vse8.v          v30, (a0)
> > +.if \len == 64 && \vlen < 256
> > +        addi            a0, a0, 32
> > +        addi            a2, a2, 32
> > +        epel_load       v30, \len, \op, \name, \type, 1, t
> > +        vse8.v          v30, (a0)
> > +        addi            a0, a0, -32
> > +        addi            a2, a2, -32
> > +.endif
> > +        add             a2, a2, a3
> > +        add             a0, a0, a1
> > +        bnez            a4, 1b
> > +
> > +        ret
> > +endfunc
> > +.endm
> > +
> >  .irp len, 64, 32, 16, 8, 4
> >          copy_avg \len
> > +        .irp op, put, avg
> > +                .irp name, regular, sharp, smooth
> > +                        .irp type, h, v
> > +                                epel \len, \op, \name, \type, 128
> > +                                epel \len, \op, \name, \type, 256
> > +                        .endr
> > +                .endr
> > +        .endr
> >  .endr
> >
> >  bilin_h_v  put, h, a5
> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> > index 8fb326dae0..5fd64a1b8c 100644
> > --- a/libavcodec/riscv/vp9dsp.h
> > +++ b/libavcodec/riscv/vp9dsp.h
> > @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const
> > uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> > uint8_t *l, const uint8_t *a);
> >
> > -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>
> >   \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> > dststride,   \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> > min_vlen)              \ +void
> > ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \ +
> >                                        ptrdiff_t dststride,
>
> > \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >   \ int h, int mx, int my);              \ \ -void
> > ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> > +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                         ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                 \ ptrdiff_t srcstride,
>
> >       \ int h, int mx, int my);             \ \ -void
> > ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                         ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                 \ ptrdiff_t srcstride,
>
> >       \ int h, int mx, int my);
> > @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> > dststride,     \ const uint8_t *src, ptrdiff_t srcstride,   \ int h, int
> > mx, int my);
> >
> > -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> >
> >  VP9_BILINEAR_RISCV_RVV_FUNC(64);
> >  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> > diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> > index b3700dfb08..3669070fca 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
> >  # endif
> >
> >  #if HAVE_RVV
> > -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
> > +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> > +    int vlenb = ff_get_rv_vlenb();
> > +    if (vlenb >= 16) {
> >
> >  #define init_fpel(idx1, sz)                                           \
> >      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> > @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
> >      dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
> >
> >  #undef init_fpel
> > +
> > +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> > +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> > +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> > +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> > +
> > +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> > +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> > +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> > +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> > +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> > +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> > +
> > +    init_subpel2(0, 1, 0, h, put, 128);
> > +    init_subpel2(1, 1, 0, h, avg, 128);
> > +
> > +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > +        init_subpel2(0, 0, 1, v, put, 128);
> > +        init_subpel2(1, 0, 1, v, avg, 128);
> > +    }
> > +
> > +    }
> > +    if (vlenb >= 32) {
> > +        init_subpel2(0, 1, 0, h, put, 256);
> > +        init_subpel2(1, 1, 0, h, avg, 256);
> > +
> > +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > +            init_subpel2(0, 0, 1, v, put, 256);
> > +            init_subpel2(1, 0, 1, v, avg, 256);
> > +        }
> > +    }
> >      }
> >  #endif
> >  #endif
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv
       [not found] <20240615115034.3891490-1-uk7b@foxmail.com>
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
@ 2024-06-15 11:50 ` uk7b
  2024-07-23  8:58   ` uk7b
  2 siblings, 1 reply; 15+ messages in thread
From: uk7b @ 2024-06-15 11:50 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4hv_8bpp_c                     :   32.0   28.0
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32               :   15.0   13.2
vp9_avg_8tap_smooth_8hv_8bpp_c                     :   98.0   86.2
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32               :   23.7   21.2
vp9_avg_8tap_smooth_16hv_8bpp_c                    :  355.7  297.0
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32              :   47.0   41.5
vp9_avg_8tap_smooth_32hv_8bpp_c                    : 1272.7 1099.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32              :  134.7  119.7
vp9_avg_8tap_smooth_64hv_8bpp_c                    : 4937.0 4224.2
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32              :  528.5  228.5
vp9_put_8tap_smooth_4hv_8bpp_c                     :   30.2   26.7
vp9_put_8tap_smooth_4hv_8bpp_rvv_i32               :   30.5   12.5
vp9_put_8tap_smooth_8hv_8bpp_c                     :   91.5   81.2
vp9_put_8tap_smooth_8hv_8bpp_rvv_i32               :   22.7   20.2
vp9_put_8tap_smooth_16hv_8bpp_c                    :  313.2  277.5
vp9_put_8tap_smooth_16hv_8bpp_rvv_i32              :   45.2   40.2
vp9_put_8tap_smooth_32hv_8bpp_c                    : 1166.7 1022.2
vp9_put_8tap_smooth_32hv_8bpp_rvv_i32              :  131.7  117.2
vp9_put_8tap_smooth_64hv_8bpp_c                    : 4560.5 3961.7
vp9_put_8tap_smooth_64hv_8bpp_rvv_i32              :  517.0  223.2
---
 libavcodec/riscv/vp9_mc_rvv.S  | 75 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c |  8 ++++
 2 files changed, 83 insertions(+)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 5e81301aa5..474c9035ae 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -373,6 +373,77 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
 endfunc
 .endm
 
+#if __riscv_xlen == 64
+.macro epel_hv_once len, name, op
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        .irp n,0,2,4,6,8,10,12,14
+        epel_load_inc   v\n, \len, put, \name, h, 1, t
+        .endr
+        addi            a4, a4, -1
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, v, 0, s
+        vse8.v          v30, (a0)
+        vmv.v.v         v0, v2
+        vmv.v.v         v2, v4
+        vmv.v.v         v4, v6
+        vmv.v.v         v6, v8
+        vmv.v.v         v8, v10
+        vmv.v.v         v10, v12
+        vmv.v.v         v12, v14
+        epel_load       v14, \len, put, \name, h, 1, t
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+        epel_load       v30, \len, \op, \name, v, 0, s
+        vse8.v          v30, (a0)
+.endm
+
+.macro epel_hv op, name, len, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
+        addi            sp, sp, -64
+        .irp n,0,1,2,3,4,5,6,7
+        sd              s\n, \n\()<<3(sp)
+        .endr
+.if \len == 64 && \vlen < 256
+        addi            sp, sp, -48
+        .irp n,0,1,2,3,4,5
+        sd              a\n, \n\()<<3(sp)
+        .endr
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        epel_filter     \name, h, t
+        epel_filter     \name, v, s
+.if \vlen < 256
+        vsetvlstatic8   \len, a6, 32, m2
+.else
+        vsetvlstatic8   \len, a6, 64, m2
+.endif
+        epel_hv_once    \len, \name, \op
+.if \len == 64 && \vlen < 256
+        .irp n,0,1,2,3,4,5
+        ld              a\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 48
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_filter     \name, h, t
+        epel_hv_once    \len, \name, \op
+.endif
+        .irp n,0,1,2,3,4,5,6,7
+        ld              s\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 64
+
+        ret
+endfunc
+.endm
+#endif
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
         .irp op, put, avg
@@ -381,6 +452,10 @@ endfunc
                                 epel \len, \op, \name, \type, 128
                                 epel \len, \op, \name, \type, 256
                         .endr
+                        #if __riscv_xlen == 64
+                        epel_hv \op, \name, \len, 128
+                        epel_hv \op, \name, \len, 256
+                        #endif
                 .endr
         .endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 3669070fca..7b090c9889 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -119,6 +119,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     if (flags & AV_CPU_FLAG_RVB_ADDR) {
         init_subpel2(0, 0, 1, v, put, 128);
         init_subpel2(1, 0, 1, v, avg, 128);
+# if __riscv_xlen == 64
+        init_subpel2(0, 1, 1, hv, put, 128);
+        init_subpel2(1, 1, 1, hv, avg, 128);
+# endif
     }
 
     }
@@ -129,6 +133,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
         if (flags & AV_CPU_FLAG_RVB_ADDR) {
             init_subpel2(0, 0, 1, v, put, 256);
             init_subpel2(1, 0, 1, v, avg, 256);
+# if __riscv_xlen == 64
+            init_subpel2(0, 1, 1, hv, put, 256);
+            init_subpel2(1, 1, 1, hv, avg, 256);
+# endif
         }
     }
     }
-- 
2.45.2


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv
  2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
@ 2024-07-23  8:58   ` uk7b
  2024-07-23  9:03     ` flow gg
  0 siblings, 1 reply; 15+ messages in thread
From: uk7b @ 2024-07-23  8:58 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4hv_8bpp_c                     :   32.0   28.0
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32               :   15.0   13.2
vp9_avg_8tap_smooth_8hv_8bpp_c                     :   98.0   86.2
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32               :   23.7   21.2
vp9_avg_8tap_smooth_16hv_8bpp_c                    :  355.7  297.0
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32              :   47.0   41.5
vp9_avg_8tap_smooth_32hv_8bpp_c                    : 1272.7 1099.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32              :  134.7  119.7
vp9_avg_8tap_smooth_64hv_8bpp_c                    : 4937.0 4224.2
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32              :  528.5  228.5
vp9_put_8tap_smooth_4hv_8bpp_c                     :   30.2   26.7
vp9_put_8tap_smooth_4hv_8bpp_rvv_i32               :   30.5   12.5
vp9_put_8tap_smooth_8hv_8bpp_c                     :   91.5   81.2
vp9_put_8tap_smooth_8hv_8bpp_rvv_i32               :   22.7   20.2
vp9_put_8tap_smooth_16hv_8bpp_c                    :  313.2  277.5
vp9_put_8tap_smooth_16hv_8bpp_rvv_i32              :   45.2   40.2
vp9_put_8tap_smooth_32hv_8bpp_c                    : 1166.7 1022.2
vp9_put_8tap_smooth_32hv_8bpp_rvv_i32              :  131.7  117.2
vp9_put_8tap_smooth_64hv_8bpp_c                    : 4560.5 3961.7
vp9_put_8tap_smooth_64hv_8bpp_rvv_i32              :  517.0  223.2
---
 libavcodec/riscv/vp9_mc_rvv.S  | 75 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c |  8 ++++
 2 files changed, 83 insertions(+)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 6a4be7b9bd..26754ac6f8 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -366,6 +366,77 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
 endfunc
 .endm
 
+#if __riscv_xlen == 64
+.macro epel_hv_once len, name, op
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        sub             a2, a2, a3
+        .irp n,0,2,4,6,8,10,12,14
+        epel_load_inc   v\n, \len, put, \name, h, 1, t
+        .endr
+        addi            a4, a4, -1
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, v, 0, s
+        vse8.v          v30, (a0)
+        vmv.v.v         v0, v2
+        vmv.v.v         v2, v4
+        vmv.v.v         v4, v6
+        vmv.v.v         v6, v8
+        vmv.v.v         v8, v10
+        vmv.v.v         v10, v12
+        vmv.v.v         v12, v14
+        epel_load       v14, \len, put, \name, h, 1, t
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+        epel_load       v30, \len, \op, \name, v, 0, s
+        vse8.v          v30, (a0)
+.endm
+
+.macro epel_hv op, name, len, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
+        addi            sp, sp, -64
+        .irp n,0,1,2,3,4,5,6,7
+        sd              s\n, \n\()<<3(sp)
+        .endr
+.if \len == 64 && \vlen < 256
+        addi            sp, sp, -48
+        .irp n,0,1,2,3,4,5
+        sd              a\n, \n\()<<3(sp)
+        .endr
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+        epel_filter     \name, h, t, a7
+        epel_filter     \name, v, s, s7
+.if \vlen < 256
+        vsetvlstatic8   \len, a6, 32, m2
+.else
+        vsetvlstatic8   \len, a6, 64, m2
+.endif
+        epel_hv_once    \len, \name, \op
+.if \len == 64 && \vlen < 256
+        .irp n,0,1,2,3,4,5
+        ld              a\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 48
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_filter     \name, h, t, a7
+        epel_hv_once    \len, \name, \op
+.endif
+        .irp n,0,1,2,3,4,5,6,7
+        ld              s\n, \n\()<<3(sp)
+        .endr
+        addi            sp, sp, 64
+
+        ret
+endfunc
+.endm
+#endif
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
         .irp op, put, avg
@@ -374,6 +445,10 @@ endfunc
                                 epel \len, \op, \name, \type, 128
                                 epel \len, \op, \name, \type, 256
                         .endr
+                        #if __riscv_xlen == 64
+                        epel_hv \op, \name, \len, 128
+                        epel_hv \op, \name, \len, 256
+                        #endif
                 .endr
         .endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 3669070fca..7b090c9889 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -119,6 +119,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     if (flags & AV_CPU_FLAG_RVB_ADDR) {
         init_subpel2(0, 0, 1, v, put, 128);
         init_subpel2(1, 0, 1, v, avg, 128);
+# if __riscv_xlen == 64
+        init_subpel2(0, 1, 1, hv, put, 128);
+        init_subpel2(1, 1, 1, hv, avg, 128);
+# endif
     }
 
     }
@@ -129,6 +133,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
         if (flags & AV_CPU_FLAG_RVB_ADDR) {
             init_subpel2(0, 0, 1, v, put, 256);
             init_subpel2(1, 0, 1, v, avg, 256);
+# if __riscv_xlen == 64
+            init_subpel2(0, 1, 1, hv, put, 256);
+            init_subpel2(1, 1, 1, hv, avg, 256);
+# endif
         }
     }
     }
-- 
2.45.2


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv
  2024-07-23  8:58   ` uk7b
@ 2024-07-23  9:03     ` flow gg
  0 siblings, 0 replies; 15+ messages in thread
From: flow gg @ 2024-07-23  9:03 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Updated this patch to match the changes made in the updated 3/4.

<uk7b@foxmail.com> 于2024年7月23日周二 16:59写道：

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
>                                                      C908   X60
> vp9_avg_8tap_smooth_4hv_8bpp_c                     :   32.0   28.0
> vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32               :   15.0   13.2
> vp9_avg_8tap_smooth_8hv_8bpp_c                     :   98.0   86.2
> vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32               :   23.7   21.2
> vp9_avg_8tap_smooth_16hv_8bpp_c                    :  355.7  297.0
> vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32              :   47.0   41.5
> vp9_avg_8tap_smooth_32hv_8bpp_c                    : 1272.7 1099.7
> vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32              :  134.7  119.7
> vp9_avg_8tap_smooth_64hv_8bpp_c                    : 4937.0 4224.2
> vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32              :  528.5  228.5
> vp9_put_8tap_smooth_4hv_8bpp_c                     :   30.2   26.7
> vp9_put_8tap_smooth_4hv_8bpp_rvv_i32               :   30.5   12.5
> vp9_put_8tap_smooth_8hv_8bpp_c                     :   91.5   81.2
> vp9_put_8tap_smooth_8hv_8bpp_rvv_i32               :   22.7   20.2
> vp9_put_8tap_smooth_16hv_8bpp_c                    :  313.2  277.5
> vp9_put_8tap_smooth_16hv_8bpp_rvv_i32              :   45.2   40.2
> vp9_put_8tap_smooth_32hv_8bpp_c                    : 1166.7 1022.2
> vp9_put_8tap_smooth_32hv_8bpp_rvv_i32              :  131.7  117.2
> vp9_put_8tap_smooth_64hv_8bpp_c                    : 4560.5 3961.7
> vp9_put_8tap_smooth_64hv_8bpp_rvv_i32              :  517.0  223.2
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 75 ++++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp_init.c |  8 ++++
>  2 files changed, 83 insertions(+)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 6a4be7b9bd..26754ac6f8 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -366,6 +366,77 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
>  endfunc
>  .endm
>
> +#if __riscv_xlen == 64
> +.macro epel_hv_once len, name, op
> +        sub             a2, a2, a3
> +        sub             a2, a2, a3
> +        sub             a2, a2, a3
> +        .irp n,0,2,4,6,8,10,12,14
> +        epel_load_inc   v\n, \len, put, \name, h, 1, t
> +        .endr
> +        addi            a4, a4, -1
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, v, 0, s
> +        vse8.v          v30, (a0)
> +        vmv.v.v         v0, v2
> +        vmv.v.v         v2, v4
> +        vmv.v.v         v4, v6
> +        vmv.v.v         v6, v8
> +        vmv.v.v         v8, v10
> +        vmv.v.v         v10, v12
> +        vmv.v.v         v12, v14
> +        epel_load       v14, \len, put, \name, h, 1, t
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +        epel_load       v30, \len, \op, \name, v, 0, s
> +        vse8.v          v30, (a0)
> +.endm
> +
> +.macro epel_hv op, name, len, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
> +        addi            sp, sp, -64
> +        .irp n,0,1,2,3,4,5,6,7
> +        sd              s\n, \n\()<<3(sp)
> +        .endr
> +.if \len == 64 && \vlen < 256
> +        addi            sp, sp, -48
> +        .irp n,0,1,2,3,4,5
> +        sd              a\n, \n\()<<3(sp)
> +        .endr
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +        epel_filter     \name, h, t, a7
> +        epel_filter     \name, v, s, s7
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a6, 32, m2
> +.else
> +        vsetvlstatic8   \len, a6, 64, m2
> +.endif
> +        epel_hv_once    \len, \name, \op
> +.if \len == 64 && \vlen < 256
> +        .irp n,0,1,2,3,4,5
> +        ld              a\n, \n\()<<3(sp)
> +        .endr
> +        addi            sp, sp, 48
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_filter     \name, h, t, a7
> +        epel_hv_once    \len, \name, \op
> +.endif
> +        .irp n,0,1,2,3,4,5,6,7
> +        ld              s\n, \n\()<<3(sp)
> +        .endr
> +        addi            sp, sp, 64
> +
> +        ret
> +endfunc
> +.endm
> +#endif
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
>          .irp op, put, avg
> @@ -374,6 +445,10 @@ endfunc
>                                  epel \len, \op, \name, \type, 128
>                                  epel \len, \op, \name, \type, 256
>                          .endr
> +                        #if __riscv_xlen == 64
> +                        epel_hv \op, \name, \len, 128
> +                        epel_hv \op, \name, \len, 256
> +                        #endif
>                  .endr
>          .endr
>  .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 3669070fca..7b090c9889 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -119,6 +119,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>      if (flags & AV_CPU_FLAG_RVB_ADDR) {
>          init_subpel2(0, 0, 1, v, put, 128);
>          init_subpel2(1, 0, 1, v, avg, 128);
> +# if __riscv_xlen == 64
> +        init_subpel2(0, 1, 1, hv, put, 128);
> +        init_subpel2(1, 1, 1, hv, avg, 128);
> +# endif
>      }
>
>      }
> @@ -129,6 +133,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>          if (flags & AV_CPU_FLAG_RVB_ADDR) {
>              init_subpel2(0, 0, 1, v, put, 256);
>              init_subpel2(1, 0, 1, v, avg, 256);
> +# if __riscv_xlen == 64
> +            init_subpel2(0, 1, 1, hv, put, 256);
> +            init_subpel2(1, 1, 1, hv, avg, 256);
> +# endif
>          }
>      }
>      }
> --
> 2.45.2
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2024-07-31 19:10 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20240615115034.3891490-1-uk7b@foxmail.com>
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-06-15 11:52   ` flow gg
2024-06-24 20:07   ` Rémi Denis-Courmont
2024-06-30 11:39     ` flow gg
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v uk7b
2024-06-15 11:52   ` flow gg
2024-07-13  9:02   ` Rémi Denis-Courmont
2024-07-23  8:51     ` uk7b
2024-07-29 15:20       ` Rémi Denis-Courmont
2024-07-31 10:36         ` flow gg
2024-07-31 19:10           ` Rémi Denis-Courmont
2024-07-23  8:56     ` flow gg
2024-06-15 11:50 ` [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv uk7b
2024-07-23  8:58   ` uk7b
2024-07-23  9:03     ` flow gg

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git