Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
@ 2024-05-04 10:01 uk7b
  2024-05-04 10:08 ` flow gg
  2024-05-04 17:53 ` Rémi Denis-Courmont
  0 siblings, 2 replies; 16+ messages in thread
From: uk7b @ 2024-05-04 10:01 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
---
 libavcodec/riscv/vc1dsp_init.c |  8 +++++
 libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..610c43a1a3 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
 void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
 
 av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
 {
@@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
     if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
+        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
         if (flags & AV_CPU_FLAG_RVV_I64) {
             dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
             dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
+            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
         }
     }
 #endif
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..48244f91aa 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vsse32.v      v0, (a0), a1
         ret
 endfunc
+
+func ff_put_pixels16x16_rvv, zve32x
+        vsetivli      zero, 16, e8, m1, ta, ma
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vle8.v        v\n, (a1)
+        add           a1, a1, a2
+        .endr
+        vle8.v        v31, (a1)
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vse8.v        v\n, (a0)
+        add           a0, a0, a2
+        .endr
+        vse8.v        v31, (a0)
+
+        ret
+endfunc
+
+func ff_put_pixels8x8_rvv, zve64x
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v8, (a1), a2
+        vsse64.v      v8, (a0), a2
+
+        ret
+endfunc
+
+func ff_avg_pixels16x16_rvv, zve32x
+        csrwi         vxrm, 0
+        vsetivli      zero, 16, e8, m1, ta, ma
+        li            t0, 128
+
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vle8.v        v\n, (a1)
+        add           a1, a1, a2
+        .endr
+        vle8.v        v31, (a1)
+        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+        vle8.v        v\n, (a0)
+        add           a0, a0, a2
+        .endr
+        vle8.v        v15, (a0)
+        vsetvli       zero, t0, e8, m8, ta, ma
+        vaaddu.vv     v0, v0, v16
+        vaaddu.vv     v8, v8, v24
+        vsetivli      zero, 16, e8, m1, ta, ma
+        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+        vse8.v        v\n, (a0)
+        sub           a0, a0, a2
+        .endr
+        vse8.v        v0, (a0)
+
+        ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x
+        csrwi         vxrm, 0
+        li            t0, 64
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v16, (a1), a2
+        vlse64.v      v8, (a0), a2
+        vsetvli       zero, t0, e8, m4, ta, ma
+        vaaddu.vv     v16, v16, v8
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsse64.v      v16, (a0), a2
+
+        ret
+endfunc
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-04 10:01 [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels uk7b
@ 2024-05-04 10:08 ` flow gg
  2024-05-04 17:53 ` Rémi Denis-Courmont
  1 sibling, 0 replies; 16+ messages in thread
From: flow gg @ 2024-05-04 10:08 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Hi, it's me. I accidentally repeated it but it seems to be correct.

<uk7b@foxmail.com> 于2024年5月4日周六 18:01写道:

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> ---
>  libavcodec/riscv/vc1dsp_init.c |  8 +++++
>  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+)
>
> diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> index e47b644f80..610c43a1a3 100644
> --- a/libavcodec/riscv/vc1dsp_init.c
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> ptrdiff_t stride, int16_t *block
>  void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
>  void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
>  void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
> +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
>
>  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  {
> @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>      if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
>          dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
>          dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> +        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> +        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
>          if (flags & AV_CPU_FLAG_RVV_I64) {
>              dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
>              dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> +            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> +            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
>          }
>      }
>  #endif
> diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> index 4a00945ead..48244f91aa 100644
> --- a/libavcodec/riscv/vc1dsp_rvv.S
> +++ b/libavcodec/riscv/vc1dsp_rvv.S
> @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
>          vsse32.v      v0, (a0), a1
>          ret
>  endfunc
> +
> +func ff_put_pixels16x16_rvv, zve32x
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vle8.v        v\n, (a1)
> +        add           a1, a1, a2
> +        .endr
> +        vle8.v        v31, (a1)
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vse8.v        v\n, (a0)
> +        add           a0, a0, a2
> +        .endr
> +        vse8.v        v31, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_put_pixels8x8_rvv, zve64x
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vlse64.v      v8, (a1), a2
> +        vsse64.v      v8, (a0), a2
> +
> +        ret
> +endfunc
> +
> +func ff_avg_pixels16x16_rvv, zve32x
> +        csrwi         vxrm, 0
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        li            t0, 128
> +
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vle8.v        v\n, (a1)
> +        add           a1, a1, a2
> +        .endr
> +        vle8.v        v31, (a1)
> +        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
> +        vle8.v        v\n, (a0)
> +        add           a0, a0, a2
> +        .endr
> +        vle8.v        v15, (a0)
> +        vsetvli       zero, t0, e8, m8, ta, ma
> +        vaaddu.vv     v0, v0, v16
> +        vaaddu.vv     v8, v8, v24
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
> +        vse8.v        v\n, (a0)
> +        sub           a0, a0, a2
> +        .endr
> +        vse8.v        v0, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_avg_pixels8x8_rvv, zve64x
> +        csrwi         vxrm, 0
> +        li            t0, 64
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vlse64.v      v16, (a1), a2
> +        vlse64.v      v8, (a0), a2
> +        vsetvli       zero, t0, e8, m4, ta, ma
> +        vaaddu.vv     v16, v16, v8
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vsse64.v      v16, (a0), a2
> +
> +        ret
> +endfunc
> --
> 2.45.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-04 10:01 [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels uk7b
  2024-05-04 10:08 ` flow gg
@ 2024-05-04 17:53 ` Rémi Denis-Courmont
  2024-05-05  9:15   ` uk7b
  2024-05-05  9:18   ` flow gg
  1 sibling, 2 replies; 16+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-04 17:53 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

Le lauantaina 4. toukokuuta 2024, 13.01.05 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> ---
>  libavcodec/riscv/vc1dsp_init.c |  8 +++++
>  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+)
> 
> diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
> index e47b644f80..610c43a1a3 100644
> --- a/libavcodec/riscv/vc1dsp_init.c
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t
> stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest,
> ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t
> *dest, ptrdiff_t stride, int16_t *block); void
> ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src,
> ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst,
> const uint8_t *src, ptrdiff_t line_size, int rnd); +void
> ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t
> *src, ptrdiff_t line_size, int rnd);
> 
>  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  {
> @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>      if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
>          dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
>          dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> +        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> +        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
>          if (flags & AV_CPU_FLAG_RVV_I64) {
>              dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
>              dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> +            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> +            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
>          }
>      }
>  #endif
> diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> index 4a00945ead..48244f91aa 100644
> --- a/libavcodec/riscv/vc1dsp_rvv.S
> +++ b/libavcodec/riscv/vc1dsp_rvv.S
> @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
>          vsse32.v      v0, (a0), a1
>          ret
>  endfunc
> +
> +func ff_put_pixels16x16_rvv, zve32x
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vle8.v        v\n, (a1)
> +        add           a1, a1, a2
> +        .endr
> +        vle8.v        v31, (a1)

Is it not faster to compute the address ahead of time, e.g.:

add t1, a2, a1
vle8.v vN, (a1)
sh1add a1, a2, a1
vle8.v vN+1, (t1)

...and so on? Even on a reordering core, you can't eliminate stall on data 
dependency if there is nothing else to be done.

(Ditto below and in other patches.)

> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vse8.v        v\n, (a0)
> +        add           a0, a0, a2
> +        .endr
> +        vse8.v        v31, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_put_pixels8x8_rvv, zve64x
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vlse64.v      v8, (a1), a2
> +        vsse64.v      v8, (a0), a2

Copying 64-bit quantities should not need RVV at all. Maybe the C version 
needs to be improved instead, but if that is not possible, then an RVI version 
may be more portable and work just as well.

> +
> +        ret
> +endfunc
> +
> +func ff_avg_pixels16x16_rvv, zve32x
> +        csrwi         vxrm, 0
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        li            t0, 128
> +
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vle8.v        v\n, (a1)
> +        add           a1, a1, a2
> +        .endr
> +        vle8.v        v31, (a1)
> +        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
> +        vle8.v        v\n, (a0)
> +        add           a0, a0, a2
> +        .endr
> +        vle8.v        v15, (a0)
> +        vsetvli       zero, t0, e8, m8, ta, ma
> +        vaaddu.vv     v0, v0, v16
> +        vaaddu.vv     v8, v8, v24
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
> +        vse8.v        v\n, (a0)
> +        sub           a0, a0, a2
> +        .endr
> +        vse8.v        v0, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_avg_pixels8x8_rvv, zve64x
> +        csrwi         vxrm, 0
> +        li            t0, 64
> +        vsetivli      zero, 8, e8, mf2, ta, ma

Does MF2 actually improve perfs over M1 here?

> +        vlse64.v      v16, (a1), a2
> +        vlse64.v      v8, (a0), a2
> +        vsetvli       zero, t0, e8, m4, ta, ma
> +        vaaddu.vv     v16, v16, v8
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vsse64.v      v16, (a0), a2
> +
> +        ret
> +endfunc


-- 
レミ・デニ-クールモン
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-04 17:53 ` Rémi Denis-Courmont
@ 2024-05-05  9:15   ` uk7b
  2024-05-05  9:18   ` flow gg
  1 sibling, 0 replies; 16+ messages in thread
From: uk7b @ 2024-05-05  9:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 875.2
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 141.7
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 226.5
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 529.5
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 79.5
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 144.5
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi: 26.7
---
 libavcodec/riscv/Makefile      |  1 +
 libavcodec/riscv/vc1dsp_init.c | 16 +++++++++-
 libavcodec/riscv/vc1dsp_rvi.S  | 37 ++++++++++++++++++++++
 libavcodec/riscv/vc1dsp_rvv.S  | 57 ++++++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/vc1dsp_rvi.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 43b5c21cf4..cd5cc21cfd 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
 OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
+RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
 RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
 RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..bddc4b65eb 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
 void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
 
 av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
 {
-#if HAVE_RVV
+#if HAVE_RV
     int flags = av_get_cpu_flags();
 
+# if __riscv_xlen >= 64
+    if (flags & AV_CPU_FLAG_RVI) {
+        dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi;
+    }
+# endif
+#if HAVE_RVV
     if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
+        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
         if (flags & AV_CPU_FLAG_RVV_I64) {
             dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
             dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
         }
     }
 #endif
+#endif
 }
diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S
new file mode 100644
index 0000000000..bee8e52f2e
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_rvi.S
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+#if __riscv_xlen >= 64
+func ff_put_pixels8x8_rvi
+.rept 7
+        ld t0, (a1)
+        sd t0, (a0)
+        add a1, a1, a2
+        add a0, a0, a2
+.endr
+        ld t0, (a1)
+        sd t0, (a0)
+
+        ret
+endfunc
+#endif
+
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..08c93476de 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,60 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vsse32.v      v0, (a0), a1
         ret
 endfunc
+
+.macro mspel_op op pos n1 n2
+        add           t1, \pos, a2
+        v\op\()e8.v   v\n1, (\pos)
+        sh1add        \pos, a2, \pos
+        v\op\()e8.v   v\n2, (t1)
+.endm
+
+.macro mspel_op_all op pos a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
+        mspel_op      \op \pos \a1 \a2
+        mspel_op      \op \pos \a3 \a4
+        mspel_op      \op \pos \a5 \a6
+        mspel_op      \op \pos \a7 \a8
+        mspel_op      \op \pos \a9 \a10
+        mspel_op      \op \pos \a11 \a12
+        mspel_op      \op \pos \a13 \a14
+        mspel_op      \op \pos \a15 \a16
+.endm
+
+func ff_put_pixels16x16_rvv, zve32x
+        vsetivli      zero, 16, e8, m1, ta, ma
+        mspel_op_all  l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+        mspel_op_all  s a0 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+
+        ret
+endfunc
+
+func ff_avg_pixels16x16_rvv, zve32x
+        csrwi         vxrm, 0
+        vsetivli      zero, 16, e8, m1, ta, ma
+        li            t0, 128
+        mspel_op_all  l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+        mspel_op_all  l a0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+        vsetvli       zero, t0, e8, m8, ta, ma
+        sub           a0, a0, a2
+        vaaddu.vv     v0, v0, v16
+        neg           a2, a2
+        vaaddu.vv     v8, v8, v24
+        vsetivli      zero, 16, e8, m1, ta, ma
+        mspel_op_all  s a0 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+
+        ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x
+        csrwi         vxrm, 0
+        li            t0, 64
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v16, (a1), a2
+        vlse64.v      v8, (a0), a2
+        vsetvli       zero, t0, e8, m4, ta, ma
+        vaaddu.vv     v16, v16, v8
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsse64.v      v16, (a0), a2
+
+        ret
+endfunc
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-04 17:53 ` Rémi Denis-Courmont
  2024-05-05  9:15   ` uk7b
@ 2024-05-05  9:18   ` flow gg
  2024-05-05 19:26     ` Rémi Denis-Courmont
  1 sibling, 1 reply; 16+ messages in thread
From: flow gg @ 2024-05-05  9:18 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

> Is it not faster to compute the address ahead of time, e.g.:
> Ditto below and in other patches.

Yes, updated here, and I will check the other patches.

> Copying 64-bit quantities should not need RVV at all. Maybe the C version
> needs to be improved instead, but if that is not possible, then an RVI
> version may be more portable and work just as well.

The logic of the C version is shared with other places, so it might be
difficult to modify. I've updated the patch to use RVI instead.

> Does MF2 actually improve perfs over M1 here?

The difference here seems very small, but when both mf2 and m1 are correct,
the test results have only shown mf2 to be better, so I want to use mf2.

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月5日周日 01:53写道:

> Le lauantaina 4. toukokuuta 2024, 13.01.05 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> > ---
> >  libavcodec/riscv/vc1dsp_init.c |  8 +++++
> >  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
> >  2 files changed, 74 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> > index e47b644f80..610c43a1a3 100644
> > --- a/libavcodec/riscv/vc1dsp_init.c
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> ptrdiff_t
> > stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest,
> > ptrdiff_t stride, int16_t *block); void
> ff_vc1_inv_trans_8x4_dc_rvv(uint8_t
> > *dest, ptrdiff_t stride, int16_t *block); void
> > ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> > *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src,
> > ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst,
> > const uint8_t *src, ptrdiff_t line_size, int rnd); +void
> > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const
> uint8_t
> > *src, ptrdiff_t line_size, int rnd);
> >
> >  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >  {
> > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >      if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> >          dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> >          dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> > +        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> > +        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
> >          if (flags & AV_CPU_FLAG_RVV_I64) {
> >              dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> >              dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> > +            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> > +            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
> >          }
> >      }
> >  #endif
> > diff --git a/libavcodec/riscv/vc1dsp_rvv.S
> b/libavcodec/riscv/vc1dsp_rvv.S
> > index 4a00945ead..48244f91aa 100644
> > --- a/libavcodec/riscv/vc1dsp_rvv.S
> > +++ b/libavcodec/riscv/vc1dsp_rvv.S
> > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
> >          vsse32.v      v0, (a0), a1
> >          ret
> >  endfunc
> > +
> > +func ff_put_pixels16x16_rvv, zve32x
> > +        vsetivli      zero, 16, e8, m1, ta, ma
> > +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
> 30
> > +        vle8.v        v\n, (a1)
> > +        add           a1, a1, a2
> > +        .endr
> > +        vle8.v        v31, (a1)
>
> Is it not faster to compute the address ahead of time, e.g.:
>
> add t1, a2, a1
> vle8.v vN, (a1)
> sh1add a1, a2, a1
> vle8.v vN+1, (t1)
>
> ...and so on? Even on a reordering core, you can't eliminate stall on data
> dependency if there is nothing else to be done.
>
> (Ditto below and in other patches.)
>
> > +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
> 30
> > +        vse8.v        v\n, (a0)
> > +        add           a0, a0, a2
> > +        .endr
> > +        vse8.v        v31, (a0)
> > +
> > +        ret
> > +endfunc
> > +
> > +func ff_put_pixels8x8_rvv, zve64x
> > +        vsetivli      zero, 8, e8, mf2, ta, ma
> > +        vlse64.v      v8, (a1), a2
> > +        vsse64.v      v8, (a0), a2
>
> Copying 64-bit quantities should not need RVV at all. Maybe the C version
> needs to be improved instead, but if that is not possible, then an RVI
> version
> may be more portable and work just as well.
>
> > +
> > +        ret
> > +endfunc
> > +
> > +func ff_avg_pixels16x16_rvv, zve32x
> > +        csrwi         vxrm, 0
> > +        vsetivli      zero, 16, e8, m1, ta, ma
> > +        li            t0, 128
> > +
> > +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
> 30
> > +        vle8.v        v\n, (a1)
> > +        add           a1, a1, a2
> > +        .endr
> > +        vle8.v        v31, (a1)
> > +        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
> > +        vle8.v        v\n, (a0)
> > +        add           a0, a0, a2
> > +        .endr
> > +        vle8.v        v15, (a0)
> > +        vsetvli       zero, t0, e8, m8, ta, ma
> > +        vaaddu.vv     v0, v0, v16
> > +        vaaddu.vv     v8, v8, v24
> > +        vsetivli      zero, 16, e8, m1, ta, ma
> > +        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
> > +        vse8.v        v\n, (a0)
> > +        sub           a0, a0, a2
> > +        .endr
> > +        vse8.v        v0, (a0)
> > +
> > +        ret
> > +endfunc
> > +
> > +func ff_avg_pixels8x8_rvv, zve64x
> > +        csrwi         vxrm, 0
> > +        li            t0, 64
> > +        vsetivli      zero, 8, e8, mf2, ta, ma
>
> Does MF2 actually improve perfs over M1 here?
>
> > +        vlse64.v      v16, (a1), a2
> > +        vlse64.v      v8, (a0), a2
> > +        vsetvli       zero, t0, e8, m4, ta, ma
> > +        vaaddu.vv     v16, v16, v8
> > +        vsetivli      zero, 8, e8, mf2, ta, ma
> > +        vsse64.v      v16, (a0), a2
> > +
> > +        ret
> > +endfunc
>
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-05  9:18   ` flow gg
@ 2024-05-05 19:26     ` Rémi Denis-Courmont
  2024-05-10  8:21       ` uk7b
  2024-05-10  8:22       ` flow gg
  0 siblings, 2 replies; 16+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-05 19:26 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le sunnuntaina 5. toukokuuta 2024, 12.18.56 EEST flow gg a écrit :
> > Does MF2 actually improve perfs over M1 here?
> 
> The difference here seems very small, but when both mf2 and m1 are correct,
> the test results have only shown mf2 to be better, so I want to use mf2.

I can live with that. But this is a slippery slope because large vector sizes 
would involve even smaller fractions. Then we would need to compute the value 
which might negate the performance gains from fractional multipliers.

The fastest approach that I can think of is a symbolic LA (which expands to 
1xAUIPC + 1xLA) to load a precomputed VTYPE value from a static variable. 
Furthermore, this requires VSETVL, which precludes immediate constant VL.
Indeed, the VSETIVL instruction does not exist.

AFAIU, BananaPi F3 has 256-bit vectors already now.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-05 19:26     ` Rémi Denis-Courmont
@ 2024-05-10  8:21       ` uk7b
  2024-05-12 11:48         ` Rémi Denis-Courmont
  2024-05-10  8:22       ` flow gg
  1 sibling, 1 reply; 16+ messages in thread
From: uk7b @ 2024-05-10  8:21 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                      C908 X60
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c            :  14.7 13.2
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32      :   2.5  2.2
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c            :   3.7  3.5
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64      :   1.0  1.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c            :   9.0  8.0
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi          :   1.0  1.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c            :   2.5  2.2
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi          :   0.5  0.5
---
 libavcodec/riscv/Makefile      |  1 +
 libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++-
 libavcodec/riscv/vc1dsp_rvi.S  | 48 ++++++++++++++++++++++++++++++++++
 libavcodec/riscv/vc1dsp_rvv.S  | 48 ++++++++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/vc1dsp_rvi.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 43b5c21cf4..cd5cc21cfd 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
 OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
+RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
 RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
 RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..555aa5aea7 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
 void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
 
 av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
 {
-#if HAVE_RVV
+#if HAVE_RV
     int flags = av_get_cpu_flags();
 
+# if __riscv_xlen >= 64
+    if (flags & AV_CPU_FLAG_RVI) {
+        dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi;
+        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi;
+    }
+# endif
+#if HAVE_RVV
     if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
         if (flags & AV_CPU_FLAG_RVV_I64) {
             dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
             dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
         }
     }
 #endif
+#endif
 }
diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S
new file mode 100644
index 0000000000..1d5660316f
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_rvi.S
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+#if __riscv_xlen >= 64
+func ff_put_pixels8x8_rvi
+.rept 8
+        ld t0, (a1)
+        sd t0, (a0)
+        add a1, a1, a2
+        add a0, a0, a2
+.endr
+
+        ret
+endfunc
+
+func ff_put_pixels16x16_rvi
+.rept 16
+        ld t0, (a1)
+        ld t1, 8(a1)
+        sd t0, (a0)
+        sd t1, 8(a0)
+        add a1, a1, a2
+        add a0, a0, a2
+.endr
+
+        ret
+endfunc
+#endif
+
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..7c2b47f66c 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,51 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vsse32.v      v0, (a0), a1
         ret
 endfunc
+
+.macro mspel_op op pos n1 n2
+        add           t1, \pos, a2
+        v\op\()e8.v   v\n1, (\pos)
+        sh1add        \pos, a2, \pos
+        v\op\()e8.v   v\n2, (t1)
+.endm
+
+.macro mspel_op_all op pos a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
+        mspel_op      \op \pos \a1 \a2
+        mspel_op      \op \pos \a3 \a4
+        mspel_op      \op \pos \a5 \a6
+        mspel_op      \op \pos \a7 \a8
+        mspel_op      \op \pos \a9 \a10
+        mspel_op      \op \pos \a11 \a12
+        mspel_op      \op \pos \a13 \a14
+        mspel_op      \op \pos \a15 \a16
+.endm
+
+func ff_avg_pixels16x16_rvv, zve32x
+        csrwi         vxrm, 0
+        vsetivli      zero, 16, e8, m1, ta, ma
+        mspel_op_all  l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+        mspel_op_all  l a0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+        vsetvli       t0, zero, e8, m8, ta, ma
+        sub           a0, a0, a2
+        vaaddu.vv     v0, v0, v16
+        neg           a2, a2
+        vaaddu.vv     v8, v8, v24
+        vsetivli      zero, 16, e8, m1, ta, ma
+        mspel_op_all  s a0 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+
+        ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x
+        csrwi         vxrm, 0
+        li            t0, 64
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v16, (a1), a2
+        vlse64.v      v8, (a0), a2
+        vsetvli       zero, t0, e8, m4, ta, ma
+        vaaddu.vv     v16, v16, v8
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsse64.v      v16, (a0), a2
+
+        ret
+endfunc
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-05 19:26     ` Rémi Denis-Courmont
  2024-05-10  8:21       ` uk7b
@ 2024-05-10  8:22       ` flow gg
  2024-05-10 15:34         ` Rémi Denis-Courmont
  1 sibling, 1 reply; 16+ messages in thread
From: flow gg @ 2024-05-10  8:22 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Hi, I got BananaPi F3, made some fixes, updated in reply

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月6日周一 03:26写道:

> Le sunnuntaina 5. toukokuuta 2024, 12.18.56 EEST flow gg a écrit :
> > > Does MF2 actually improve perfs over M1 here?
> >
> > The difference here seems very small, but when both mf2 and m1 are
> correct,
> > the test results have only shown mf2 to be better, so I want to use mf2.
>
> I can live with that. But this is a slippery slope because large vector
> sizes
> would involve even smaller fractions. Then we would need to compute the
> value
> which might negate the performance gains from fractional multipliers.
>
> The fastest approach that I can think of is a symbolic LA (which expands
> to
> 1xAUIPC + 1xLA) to load a precomputed VTYPE value from a static variable.
> Furthermore, this requires VSETVL, which precludes immediate constant VL
> Indeed, the VSETIVL instruction does not exist.
>
> AFAIU, BananaPi F3 has 256-bit vectors already now.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-10  8:22       ` flow gg
@ 2024-05-10 15:34         ` Rémi Denis-Courmont
  2024-05-11 10:02           ` flow gg
  0 siblings, 1 reply; 16+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-10 15:34 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le perjantaina 10. toukokuuta 2024, 11.22.53 EEST flow gg a écrit :
> Hi, I got BananaPi F3, made some fixes, updated in reply

So... Does it benefit from halving the logical multiplier to process fixed-size
blocks as compared to C908, or can we stick to the same code regardless of
vector sizes?

Also beware that X60 cores have in-order pipelines, so data dependencies will
probably hurt more than on C908.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-10 15:34         ` Rémi Denis-Courmont
@ 2024-05-11 10:02           ` flow gg
  2024-05-11 10:24             ` Rémi Denis-Courmont
  0 siblings, 1 reply; 16+ messages in thread
From: flow gg @ 2024-05-11 10:02 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in
vc1,
or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8,
results in a 10-20% performance decrease on both k230 and banana_f3.

I think we should just continue using it as is...

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月10日周五 23:34写道:

> Le perjantaina 10. toukokuuta 2024, 11.22.53 EEST flow gg a écrit :
> > Hi, I got BananaPi F3, made some fixes, updated in reply
>
> So... Does it benefit from halving the logical multiplier to process
> fixed-sized
> block as compared to C908, or can we stick to the same code regardless of
> vector sizes?
>
> Also beware that K60 cores have in-order pipelines, so data dependencies
> will
> probably hurt more than on C908.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-11 10:02           ` flow gg
@ 2024-05-11 10:24             ` Rémi Denis-Courmont
  2024-05-11 10:47               ` flow gg
  0 siblings, 1 reply; 16+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-11 10:24 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le lauantaina 11. toukokuuta 2024, 13.02.02 EEST flow gg a écrit :
> The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in
> vc1,
> or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8,
> results in a 10-20% performance decrease on both k230 and banana_f3.

The questions remain, how changing from MF2 to MF4 affects performance on 
Zvl256b, and if it does, how to deal with that without breaking support for 
Zvl128b.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-11 10:24             ` Rémi Denis-Courmont
@ 2024-05-11 10:47               ` flow gg
  0 siblings, 0 replies; 16+ messages in thread
From: flow gg @ 2024-05-11 10:47 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On banana_f3, further reducing the value of mf resulted in another
performance improvement. I think in the end we might need to use different
functions depending on vlen in init.

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月11日周六 18:24写道:

> Le lauantaina 11. toukokuuta 2024, 13.02.02 EEST flow gg a écrit :
> > The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in
> > vc1,
> > or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8,
> > results in a 10-20% performance decrease on both k230 and banana_f3.
>
> The questions remain, how changing from MF2 to MF4 affects performance on
> Zvl256b, and if it does, how to deal with that without breaking support
> for
> Zvl128b.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-10  8:21       ` uk7b
@ 2024-05-12 11:48         ` Rémi Denis-Courmont
  2024-05-12 12:43           ` uk7b
  2024-05-12 12:43           ` flow gg
  0 siblings, 2 replies; 16+ messages in thread
From: Rémi Denis-Courmont @ 2024-05-12 11:48 UTC (permalink / raw)
  To: ffmpeg-devel

Le perjantaina 10. toukokuuta 2024, 11.21.14 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
>                                                       C908 X60
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c            :  14.7 13.2
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32      :   2.5  2.2
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c            :   3.7  3.5
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64      :   1.0  1.2
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c            :   9.0  8.0
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi          :   1.0  1.0
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c            :   2.5  2.2
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi          :   0.5  0.5
> ---
>  libavcodec/riscv/Makefile      |  1 +
>  libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++-
>  libavcodec/riscv/vc1dsp_rvi.S  | 48 ++++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vc1dsp_rvv.S  | 48 ++++++++++++++++++++++++++++++++++
>  4 files changed, 112 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/vc1dsp_rvi.S
> 
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 43b5c21cf4..cd5cc21cfd 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o
>  OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
>  RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
>  OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
> +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
>  RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
>  OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
>  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
> diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
> index e47b644f80..555aa5aea7 100644
> --- a/libavcodec/riscv/vc1dsp_init.c
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t
> *dest, ptrdiff_t stride, int16_t *block); void
> ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t *block); +void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t
> *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t
> *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void
> ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t
> *src, ptrdiff_t line_size, int rnd);
> 
>  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  {
> -#if HAVE_RVV
> +#if HAVE_RV
>      int flags = av_get_cpu_flags();
> 
> +# if __riscv_xlen >= 64
> +    if (flags & AV_CPU_FLAG_RVI) {
> +        dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi;
> +        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi;
> +    }
> +# endif
> +#if HAVE_RVV
>      if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
>          dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
>          dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> +        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
>          if (flags & AV_CPU_FLAG_RVV_I64) {
>              dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
>              dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> +            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
>          }
>      }
>  #endif
> +#endif
>  }
> diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S
> new file mode 100644
> index 0000000000..1d5660316f
> --- /dev/null
> +++ b/libavcodec/riscv/vc1dsp_rvi.S
> @@ -0,0 +1,48 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS). + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +#if __riscv_xlen >= 64
> +func ff_put_pixels8x8_rvi
> +.rept 8
> +        ld t0, (a1)
> +        sd t0, (a0)
> +        add a1, a1, a2
> +        add a0, a0, a2
> +.endr
> +
> +        ret
> +endfunc

Are you sure that these accesses are aligned? Same below

> +
> +func ff_put_pixels16x16_rvi
> +.rept 16
> +        ld t0, (a1)
> +        ld t1, 8(a1)
> +        sd t0, (a0)
> +        sd t1, 8(a0)
> +        add a1, a1, a2
> +        add a0, a0, a2
> +.endr
> +
> +        ret
> +endfunc
> +#endif
> +

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-12 11:48         ` Rémi Denis-Courmont
@ 2024-05-12 12:43           ` uk7b
  2024-05-12 12:43           ` flow gg
  1 sibling, 0 replies; 16+ messages in thread
From: uk7b @ 2024-05-12 12:43 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                      C908 X60
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c            :  14.7 13.2
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32      :   2.5  2.2
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c            :   3.7  3.5
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64      :   1.0  1.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c            :   9.0  8.0
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi          :   1.0  1.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c            :   2.5  2.2
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi          :   0.5  0.5
---
 libavcodec/riscv/Makefile      |  1 +
 libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++-
 libavcodec/riscv/vc1dsp_rvi.S  | 47 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vc1dsp_rvv.S  | 48 ++++++++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/vc1dsp_rvi.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index f2348e259e..6f4869145a 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -58,6 +58,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
 OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
+RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
 RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
 RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..2b7071d6ff 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
 void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
 
 av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
 {
-#if HAVE_RVV
+#if HAVE_RV
     int flags = av_get_cpu_flags();
 
+# if __riscv_xlen >= 64
+    if (flags & AV_CPU_FLAG_RVI & AV_CPU_FLAG_RV_MISALIGNED) {
+        dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi;
+        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi;
+    }
+# endif
+#if HAVE_RVV
     if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
         if (flags & AV_CPU_FLAG_RVV_I64) {
             dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
             dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
         }
     }
 #endif
+#endif
 }
diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S
new file mode 100644
index 0000000000..d4a1b5bf49
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_rvi.S
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+#if __riscv_xlen >= 64
+func ff_put_pixels8x8_rvi
+.rept 8
+        ld t0, (a1)
+        sd t0, (a0)
+        add a1, a1, a2
+        add a0, a0, a2
+.endr
+
+        ret
+endfunc
+
+func ff_put_pixels16x16_rvi
+.rept 16
+        ld t0, (a1)
+        ld t1, 8(a1)
+        sd t0, (a0)
+        sd t1, 8(a0)
+        add a1, a1, a2
+        add a0, a0, a2
+.endr
+
+        ret
+endfunc
+#endif
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..7c2b47f66c 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,51 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vsse32.v      v0, (a0), a1
         ret
 endfunc
+
+.macro mspel_op op pos n1 n2
+        add           t1, \pos, a2
+        v\op\()e8.v   v\n1, (\pos)
+        sh1add        \pos, a2, \pos
+        v\op\()e8.v   v\n2, (t1)
+.endm
+
+.macro mspel_op_all op pos a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
+        mspel_op      \op \pos \a1 \a2
+        mspel_op      \op \pos \a3 \a4
+        mspel_op      \op \pos \a5 \a6
+        mspel_op      \op \pos \a7 \a8
+        mspel_op      \op \pos \a9 \a10
+        mspel_op      \op \pos \a11 \a12
+        mspel_op      \op \pos \a13 \a14
+        mspel_op      \op \pos \a15 \a16
+.endm
+
+func ff_avg_pixels16x16_rvv, zve32x
+        csrwi         vxrm, 0
+        vsetivli      zero, 16, e8, m1, ta, ma
+        mspel_op_all  l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+        mspel_op_all  l a0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+        vsetvli       t0, zero, e8, m8, ta, ma
+        sub           a0, a0, a2
+        vaaddu.vv     v0, v0, v16
+        neg           a2, a2
+        vaaddu.vv     v8, v8, v24
+        vsetivli      zero, 16, e8, m1, ta, ma
+        mspel_op_all  s a0 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+
+        ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x
+        csrwi         vxrm, 0
+        li            t0, 64
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v16, (a1), a2
+        vlse64.v      v8, (a0), a2
+        vsetvli       zero, t0, e8, m4, ta, ma
+        vaaddu.vv     v16, v16, v8
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsse64.v      v16, (a0), a2
+
+        ret
+endfunc
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-12 11:48         ` Rémi Denis-Courmont
  2024-05-12 12:43           ` uk7b
@ 2024-05-12 12:43           ` flow gg
  2024-05-12 12:57             ` uk7b
  1 sibling, 1 reply; 16+ messages in thread
From: flow gg @ 2024-05-12 12:43 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

It seems that alignment can't be guaranteed... I've updated the patch to use AV_CPU_FLAG_RV_MISALIGNED

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月12日周日 19:48写道:

> Le perjantaina 10. toukokuuta 2024, 11.21.14 EEST uk7b@foxmail.com a
> écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> >                                                       C908 X60
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c            :  14.7 13.2
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32      :   2.5  2.2
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c            :   3.7  3.5
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64      :   1.0  1.2
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c            :   9.0  8.0
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi          :   1.0  1.0
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c            :   2.5  2.2
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi          :   0.5  0.5
> > ---
> >  libavcodec/riscv/Makefile      |  1 +
> >  libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++-
> >  libavcodec/riscv/vc1dsp_rvi.S  | 48 ++++++++++++++++++++++++++++++++++
> >  libavcodec/riscv/vc1dsp_rvv.S  | 48 ++++++++++++++++++++++++++++++++++
> >  4 files changed, 112 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vc1dsp_rvi.S
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 43b5c21cf4..cd5cc21cfd 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o
> >  OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
> >  RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
> >  OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
> > +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
> >  RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
> >  OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
> >  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
> > diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> > index e47b644f80..555aa5aea7 100644
> > --- a/libavcodec/riscv/vc1dsp_init.c
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> > ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t
> > *dest, ptrdiff_t stride, int16_t *block); void
> > ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> > *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t
> stride,
> > int16_t *block); +void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t
> > *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t
> > *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void
> > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const
> uint8_t
> > *src, ptrdiff_t line_size, int rnd);
> >
> >  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >  {
> > -#if HAVE_RVV
> > +#if HAVE_RV
> >      int flags = av_get_cpu_flags();
> >
> > +# if __riscv_xlen >= 64
> > +    if (flags & AV_CPU_FLAG_RVI) {
> > +        dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi;
> > +        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi;
> > +    }
> > +# endif
> > +#if HAVE_RVV
> >      if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> >          dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> >          dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> > +        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
> >          if (flags & AV_CPU_FLAG_RVV_I64) {
> >              dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> >              dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> > +            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
> >          }
> >      }
> >  #endif
> > +#endif
> >  }
> > diff --git a/libavcodec/riscv/vc1dsp_rvi.S
> b/libavcodec/riscv/vc1dsp_rvi.S
> > new file mode 100644
> > index 0000000000..1d5660316f
> > --- /dev/null
> > +++ b/libavcodec/riscv/vc1dsp_rvi.S
> > @@ -0,0 +1,48 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +#if __riscv_xlen >= 64
> > +func ff_put_pixels8x8_rvi
> > +.rept 8
> > +        ld t0, (a1)
> > +        sd t0, (a0)
> > +        add a1, a1, a2
> > +        add a0, a0, a2
> > +.endr
> > +
> > +        ret
> > +endfunc
>
> Are you sure that these accesses are aligned? Same below
>
> > +
> > +func ff_put_pixels16x16_rvi
> > +.rept 16
> > +        ld t0, (a1)
> > +        ld t1, 8(a1)
> > +        sd t0, (a0)
> > +        sd t1, 8(a0)
> > +        add a1, a1, a2
> > +        add a0, a0, a2
> > +.endr
> > +
> > +        ret
> > +endfunc
> > +#endif
> > +
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels
  2024-05-12 12:43           ` flow gg
@ 2024-05-12 12:57             ` uk7b
  0 siblings, 0 replies; 16+ messages in thread
From: uk7b @ 2024-05-12 12:57 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: sunyuechi

From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                      C908 X60
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c            :  14.7 13.2
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32      :   2.5  2.2
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c            :   3.7  3.5
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64      :   1.0  1.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c            :   9.0  8.0
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi          :   1.0  1.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c            :   2.5  2.2
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi          :   0.5  0.5
---
 libavcodec/riscv/Makefile      |  1 +
 libavcodec/riscv/vc1dsp_init.c | 16 +++++++++++-
 libavcodec/riscv/vc1dsp_rvi.S  | 47 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vc1dsp_rvv.S  | 48 ++++++++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/vc1dsp_rvi.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index f2348e259e..6f4869145a 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -58,6 +58,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
 OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
+RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
 RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
 RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..2628369155 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
 void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
 
 av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
 {
-#if HAVE_RVV
+#if HAVE_RV
     int flags = av_get_cpu_flags();
 
+# if __riscv_xlen >= 64
+    if (flags & AV_CPU_FLAG_RV_MISALIGNED) {
+        dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi;
+        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi;
+    }
+# endif
+#if HAVE_RVV
     if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
         if (flags & AV_CPU_FLAG_RVV_I64) {
             dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
             dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
         }
     }
 #endif
+#endif
 }
diff --git a/libavcodec/riscv/vc1dsp_rvi.S b/libavcodec/riscv/vc1dsp_rvi.S
new file mode 100644
index 0000000000..d4a1b5bf49
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_rvi.S
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+#if __riscv_xlen >= 64
+func ff_put_pixels8x8_rvi
+.rept 8
+        ld t0, (a1)
+        sd t0, (a0)
+        add a1, a1, a2
+        add a0, a0, a2
+.endr
+
+        ret
+endfunc
+
+func ff_put_pixels16x16_rvi
+.rept 16
+        ld t0, (a1)
+        ld t1, 8(a1)
+        sd t0, (a0)
+        sd t1, 8(a0)
+        add a1, a1, a2
+        add a0, a0, a2
+.endr
+
+        ret
+endfunc
+#endif
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..7c2b47f66c 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,51 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vsse32.v      v0, (a0), a1
         ret
 endfunc
+
+.macro mspel_op op pos n1 n2
+        add           t1, \pos, a2
+        v\op\()e8.v   v\n1, (\pos)
+        sh1add        \pos, a2, \pos
+        v\op\()e8.v   v\n2, (t1)
+.endm
+
+.macro mspel_op_all op pos a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16
+        mspel_op      \op \pos \a1 \a2
+        mspel_op      \op \pos \a3 \a4
+        mspel_op      \op \pos \a5 \a6
+        mspel_op      \op \pos \a7 \a8
+        mspel_op      \op \pos \a9 \a10
+        mspel_op      \op \pos \a11 \a12
+        mspel_op      \op \pos \a13 \a14
+        mspel_op      \op \pos \a15 \a16
+.endm
+
+func ff_avg_pixels16x16_rvv, zve32x
+        csrwi         vxrm, 0
+        vsetivli      zero, 16, e8, m1, ta, ma
+        mspel_op_all  l a1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+        mspel_op_all  l a0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+        vsetvli       t0, zero, e8, m8, ta, ma
+        sub           a0, a0, a2
+        vaaddu.vv     v0, v0, v16
+        neg           a2, a2
+        vaaddu.vv     v8, v8, v24
+        vsetivli      zero, 16, e8, m1, ta, ma
+        mspel_op_all  s a0 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+
+        ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x
+        csrwi         vxrm, 0
+        li            t0, 64
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v16, (a1), a2
+        vlse64.v      v8, (a0), a2
+        vsetvli       zero, t0, e8, m4, ta, ma
+        vaaddu.vv     v16, v16, v8
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsse64.v      v16, (a0), a2
+
+        ret
+endfunc
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2024-05-12 12:58 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-04 10:01 [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels uk7b
2024-05-04 10:08 ` flow gg
2024-05-04 17:53 ` Rémi Denis-Courmont
2024-05-05  9:15   ` uk7b
2024-05-05  9:18   ` flow gg
2024-05-05 19:26     ` Rémi Denis-Courmont
2024-05-10  8:21       ` uk7b
2024-05-12 11:48         ` Rémi Denis-Courmont
2024-05-12 12:43           ` uk7b
2024-05-12 12:43           ` flow gg
2024-05-12 12:57             ` uk7b
2024-05-10  8:22       ` flow gg
2024-05-10 15:34         ` Rémi Denis-Courmont
2024-05-11 10:02           ` flow gg
2024-05-11 10:24             ` Rémi Denis-Courmont
2024-05-11 10:47               ` flow gg

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git