Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v

From: flow gg <hlefthleft@gmail.com>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v
Date: Sat, 25 May 2024 18:38:39 +0800
Message-ID: <CAEa-L+tKWWS8=VBjRY89Jjf5N8cvq3HYxGwW0BskJkg_zvnjFQ@mail.gmail.com> (raw)
In-Reply-To: <3483316.QJadu78ljV@basile.remlab.net>

> Is there a reason that you cannot use the tables from C code?

As with VP8, this is to adjust the positive and negative values and to
prevent a low-probability overflow during the calculations.

> AFAICT, regular and sharp are identical, except for the base address of the
> filter table, so it should be possible to share the byte code

Initially, they used the same code, but after testing hundreds of times,
there were always a few failures...

Because the data in the tables is different, using the same code for regular,
sharp, and smooth always produces a small amount of overflow. Each of them
needs a different combination of signed and unsigned calculations.

> A French philosopher famously said that Perfect is the enemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.

Got it, I will try to update.

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月25日周六 18:17写道：

> Le tiistaina 21. toukokuuta 2024, 20.13.17 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> >                                                      C908   X60
> > vp9_avg_8tap_smooth_4h_8bpp_c                      :   13.0   11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    5.0    4.2
> > vp9_avg_8tap_smooth_4v_8bpp_c                      :   13.7   12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    5.0    4.2
> > vp9_avg_8tap_smooth_8h_8bpp_c                      :   49.5   42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.2    8.5
> > vp9_avg_8tap_smooth_8v_8bpp_c                      :   66.5   45.0
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> > vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.7  166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.2   18.7
> > vp9_avg_8tap_smooth_16v_8bpp_c                     :  192.2  175.7
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.5   19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.7
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   83.5   60.0
> > vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.5  689.2
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.2   60.0
> > vp9_avg_8tap_smooth_64h_8bpp_c                     : 3115.5 2647.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  283.5  119.2
> > vp9_avg_8tap_smooth_64v_8bpp_c                     : 3082.2 2729.0
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  305.2  119.0
> > vp9_put_8tap_smooth_4h_8bpp_c                      :   11.2    9.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    4.0
> > vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.7
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.2    4.0
> > vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> > vp9_put_8tap_smooth_8v_8bpp_c                      :   44.2   38.7
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.5    7.7
> > vp9_put_8tap_smooth_16h_8bpp_c                     :  165.7  147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   19.5   17.5
> > vp9_put_8tap_smooth_16v_8bpp_c                     :  169.0  149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> > vp9_put_8tap_smooth_32h_8bpp_c                     :  659.7  586.7
> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   64.2   57.2
> > vp9_put_8tap_smooth_32v_8bpp_c                     :  680.5  591.2
> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.2   57.2
> > vp9_put_8tap_smooth_64h_8bpp_c                     : 2681.5 2339.0
> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  255.5  114.2
> > vp9_put_8tap_smooth_64v_8bpp_c                     : 2709.7 2348.7
> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 243 +++++++++++++++++++++++++++++++++
> >  libavcodec/riscv/vp9dsp.h      |  72 ++++++----
> >  libavcodec/riscv/vp9dsp_init.c |  38 +++++-
> >  3 files changed, 328 insertions(+), 25 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 739380d9a9..adba4afb90 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -36,6 +36,18 @@
> >  .endif
> >  .endm
> >
> > +.macro vsetvlstatic16 len
> > +.ifc \len,4
> > +        vsetvli         zero, zero, e16, mf2, ta, ma
> > +.elseif \len == 8
> > +        vsetvli         zero, zero, e16, m1, ta, ma
> > +.elseif \len == 16
> > +        vsetvli         zero, zero, e16, m2, ta, ma
> > +.else
> > +        vsetvli         zero, zero, e16, m4, ta, ma
> > +.endif
> > +.endm
> > +
> >  .macro copy_avg len
> >  func ff_vp9_avg\len\()_rvv, zve32x
> >          csrwi           vxrm, 0
> > @@ -92,10 +104,241 @@ func ff_\op\()_vp9_bilin_\len\()\type\()_rvv,
> zve32x
> >  endfunc
> >  .endm
> >
> > +const subpel_filters_regular
> > +        .byte  0,  0,   0, 128,   0,   0,  0,  0
> > +        .byte  0,  1,  -5, 126,   8,  -3,  1,  0
> > +        .byte -1,  3, -10, 122,  18,  -6,  2,  0
> > +        .byte -1,  4, -13, 118,  27,  -9,  3, -1
> > +        .byte -1,  4, -16, 112,  37, -11,  4, -1
> > +        .byte -1,  5, -18, 105,  48, -14,  4, -1
> > +        .byte -1,  5, -19,  97,  58, -16,  5, -1
> > +        .byte -1,  6, -19,  88,  68, -18,  5, -1
> > +        .byte -1,  6, -19,  78,  78, -19,  6, -1
> > +        .byte -1,  5, -18,  68,  88, -19,  6, -1
> > +        .byte -1,  5, -16,  58,  97, -19,  5, -1
> > +        .byte -1,  4, -14,  48, 105, -18,  5, -1
> > +        .byte -1,  4, -11,  37, 112, -16,  4, -1
> > +        .byte -1,  3,  -9,  27, 118, -13,  4, -1
> > +        .byte  0,  2,  -6,  18, 122, -10,  3, -1
> > +        .byte  0,  1,  -3,   8, 126,  -5,  1,  0
> > +subpel_filters_sharp:
> > +        .byte  0,  0,   0, 128,   0,   0,  0,  0
> > +        .byte -1,  3,  -7, 127,   8,  -3,  1,  0
> > +        .byte -2,  5, -13, 125,  17,  -6,  3, -1
> > +        .byte -3,  7, -17, 121,  27, -10,  5, -2
> > +        .byte -4,  9, -20, 115,  37, -13,  6, -2
> > +        .byte -4, 10, -23, 108,  48, -16,  8, -3
> > +        .byte -4, 10, -24, 100,  59, -19,  9, -3
> > +        .byte -4, 11, -24,  90,  70, -21, 10, -4
> > +        .byte -4, 11, -23,  80,  80, -23, 11, -4
> > +        .byte -4, 10, -21,  70,  90, -24, 11, -4
> > +        .byte -3,  9, -19,  59, 100, -24, 10, -4
> > +        .byte -3,  8, -16,  48, 108, -23, 10, -4
> > +        .byte -2,  6, -13,  37, 115, -20,  9, -4
> > +        .byte -2,  5, -10,  27, 121, -17,  7, -3
> > +        .byte -1,  3,  -6,  17, 125, -13,  5, -2
> > +        .byte  0,  1,  -3,   8, 127,  -7,  3, -1
> > +subpel_filters_smooth:
> > +        .byte  0,  0,   0, 128,   0,   0,  0,  0
> > +        .byte -3, -1,  32,  64,  38,   1, -3,  0
> > +        .byte -2, -2,  29,  63,  41,   2, -3,  0
> > +        .byte -2, -2,  26,  63,  43,   4, -4,  0
> > +        .byte -2, -3,  24,  62,  46,   5, -4,  0
> > +        .byte -2, -3,  21,  60,  49,   7, -4,  0
> > +        .byte -1, -4,  18,  59,  51,   9, -4,  0
> > +        .byte -1, -4,  16,  57,  53,  12, -4, -1
> > +        .byte -1, -4,  14,  55,  55,  14, -4, -1
> > +        .byte -1, -4,  12,  53,  57,  16, -4, -1
> > +        .byte  0, -4,   9,  51,  59,  18, -4, -1
> > +        .byte  0, -4,   7,  49,  60,  21, -3, -2
> > +        .byte  0, -4,   5,  46,  62,  24, -3, -2
> > +        .byte  0, -4,   4,  43,  63,  26, -2, -2
> > +        .byte  0, -3,   2,  41,  63,  29, -2, -2
> > +        .byte  0, -3,   1,  38,  64,  32, -1, -3
> > +endconst
>
> Is there a reason that you cannot use the tables from C code?
>
> > +
> > +.macro epel_filter name type regtype
> > +        lla             \regtype\()2, subpel_filters_\name
>
> It should be possible to spare one ADDI by using just AUIPC here, and
> folding
> the immediate offset into the LB's below (see also H.263 loop filter).
>
> > +        li              \regtype\()1, 8
> > +.ifc \type,v
> > +        mul             \regtype\()0, a6, \regtype\()1
> > +.else
> > +        mul             \regtype\()0, a5, \regtype\()1
>
> slli 3 ?
>
> > +.endif
> > +        add             \regtype\()0, \regtype\()0, \regtype\()2
> > +        .irp n,1,2,3,4,5,6
> > +        lb              \regtype\n, \n(\regtype\()0)
> > +        .endr
> > +.ifc \regtype,t
> > +        lb              a7, 7(\regtype\()0)
> > +.else
> > +        lb              s7, 7(\regtype\()0)
> > +.endif
> > +        lb              \regtype\()0, 0(\regtype\()0)
> > +.endm
> > +
> > +.macro epel_load dst len op name type from_mem regtype
> > +        li              a5, 64
> > +.ifc \from_mem, 1
> > +        vle8.v          v22, (a2)
> > +.ifc \type,v
> > +        sub             a2, a2, a3
> > +        vle8.v          v20, (a2)
> > +        sh1add          a2, a3, a2
> > +        vle8.v          v24, (a2)
> > +        add             a2, a2, a3
> > +        vle8.v          v26, (a2)
> > +        add             a2, a2, a3
> > +        vle8.v          v28, (a2)
> > +        add             a2, a2, a3
> > +        vle8.v          v30, (a2)
> > +.else
> > +        addi            a2, a2, -1
> > +        vle8.v          v20, (a2)
> > +        addi            a2, a2, 2
> > +        vle8.v          v24, (a2)
> > +        addi            a2, a2, 1
> > +        vle8.v          v26, (a2)
> > +        addi            a2, a2, 1
> > +        vle8.v          v28, (a2)
> > +        addi            a2, a2, 1
> > +        vle8.v          v30, (a2)
> > +.endif
> > +
> > +.ifc \name,smooth
> > +        vwmulu.vx       v16, v24, \regtype\()4
> > +        vwmaccu.vx      v16, \regtype\()2, v20
> > +        vwmaccu.vx      v16, \regtype\()5, v26
> > +        vwmaccsu.vx     v16, \regtype\()6, v28
> > +.else
> > +        vwmulu.vx       v16, v28, \regtype\()6
> > +        vwmaccsu.vx     v16, \regtype\()2, v20
> > +        vwmaccsu.vx     v16, \regtype\()5, v26
> > +.endif
> > +
> > +.ifc \regtype,t
> > +        vwmaccsu.vx     v16, a7, v30
> > +.else
> > +        vwmaccsu.vx     v16, s7, v30
> > +.endif
> > +
> > +.ifc \type,v
> > +        .rept 6
> > +        sub             a2, a2, a3
> > +        .endr
> > +        vle8.v          v28, (a2)
> > +        sub             a2, a2, a3
> > +        vle8.v          v26, (a2)
> > +        sh1add          a2, a3, a2
> > +        add             a2, a2, a3
> > +.else
> > +        addi            a2, a2, -6
> > +        vle8.v          v28, (a2)
> > +        addi            a2, a2, -1
> > +        vle8.v          v26, (a2)
> > +        addi            a2, a2, 3
> > +.endif
> > +
> > +.ifc \name,smooth
> > +        vwmaccsu.vx     v16, \regtype\()1, v28
> > +.else
> > +        vwmaccu.vx      v16, \regtype\()1, v28
> > +        vwmulu.vx       v28, v24, \regtype\()4
> > +.endif
> > +        vwmaccsu.vx     v16, \regtype\()0, v26
> > +        vwmulu.vx       v20, v22, \regtype\()3
> > +.else
> > +.ifc \name,smooth
> > +        vwmulu.vx       v16, v8, \regtype\()4
> > +        vwmaccu.vx      v16, \regtype\()2, v4
> > +        vwmaccu.vx      v16, \regtype\()5, v10
> > +        vwmaccsu.vx     v16, \regtype\()6, v12
> > +        vwmaccsu.vx     v16, \regtype\()1, v2
> > +.else
> > +        vwmulu.vx       v16, v2, \regtype\()1
> > +        vwmaccu.vx      v16, \regtype\()6, v12
> > +        vwmaccsu.vx     v16, \regtype\()5, v10
> > +        vwmaccsu.vx     v16, \regtype\()2, v4
> > +        vwmulu.vx       v28, v8, \regtype\()4
> > +.endif
> > +        vwmaccsu.vx     v16, \regtype\()0, v0
> > +        vwmulu.vx       v20, v6, \regtype\()3
> > +
> > +.ifc \regtype,t
> > +        vwmaccsu.vx     v16, a7, v14
> > +.else
> > +        vwmaccsu.vx     v16, s7, v14
> > +.endif
> > +
> > +.endif
> > +        vwadd.wx        v16, v16, a5
> > +        vsetvlstatic16  \len
> > +
> > +.ifc \name,smooth
> > +        vwadd.vv        v24, v16, v20
> > +.else
> > +        vwadd.vv        v24, v16, v28
> > +        vwadd.wv        v24, v24, v20
> > +.endif
> > +        vnsra.wi        v24, v24, 7
> > +        vmax.vx         v24, v24, zero
> > +        vsetvlstatic8   \len, zero, 32, m2
> > +
> > +        vnclipu.wi      \dst, v24, 0
> > +.ifc \op,avg
> > +        vle8.v          v24, (a0)
> > +        vaaddu.vv       \dst, \dst, v24
> > +.endif
> > +
> > +.endm
> > +
> > +.macro epel_load_inc dst len op name type from_mem regtype
> > +        epel_load       \dst, \len, \op, \name, \type, \from_mem,
> \regtype
> > +        add             a2, a2, a3
> > +.endm
> > +
> > +.macro epel len op name type vlen
> > +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> > +        epel_filter     \name, \type, t
> > +.if \vlen < 256
> > +        vsetvlstatic8   \len, a5, 32, m2
> > +.else
> > +        vsetvlstatic8   \len, a5, 64, m2
> > +.endif
> > +.ifc \op,avg
> > +        csrwi           vxrm, 0
> > +.endif
> > +
> > +1:
> > +        addi            a4, a4, -1
> > +        epel_load       v30, \len, \op, \name, \type, 1, t
> > +        vse8.v          v30, (a0)
> > +.if \len == 64 && \vlen < 256
> > +        addi            a0, a0, 32
> > +        addi            a2, a2, 32
> > +        epel_load       v30, \len, \op, \name, \type, 1, t
> > +        vse8.v          v30, (a0)
> > +        addi            a0, a0, -32
> > +        addi            a2, a2, -32
> > +.endif
> > +        add             a2, a2, a3
> > +        add             a0, a0, a1
> > +        bnez            a4, 1b
> > +
> > +        ret
> > +endfunc
> > +.endm
> > +
> >  .irp len, 64, 32, 16, 8, 4
> >          copy_avg \len
> >          .irp op, put, avg
> >                  bilin_h_v \len, \op, h, a5
> >                  bilin_h_v \len, \op, v, a6
> > +                .irp name, regular, sharp, smooth
>
> AFAICT, regular and sharp are identical, except for the base address of
> the
> filter table, so it should be possible to share the byte code. Similarly,
> it
> should be possible to share most of the horizontal and vertical code
> (maybe
> also for bilinear. not just EPel) with separate load/store then inner
> procedures. The H.263 loop filter already does that though with almost no
> overhead, though
> H.263 is obviously simpler than VP9.
>
> A French philosopher famously said that Perfect is the enemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.
>
> > +                        .irp type, h, v
> > +                                epel \len, \op, \name, \type, 128
> > +                                epel \len, \op, \name, \type, 256
> > +                        .endr
> > +                .endr
> >          .endr
> >  .endr
> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> > index 8fb326dae0..5fd64a1b8c 100644
> > --- a/libavcodec/riscv/vp9dsp.h
> > +++ b/libavcodec/riscv/vp9dsp.h
> > @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const
> > uint8_t *l, void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const
> > uint8_t *l, const uint8_t *a);
> >
> > -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>
> >   \ -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> > dststride,   \ +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx,
> > min_vlen)              \ +void
> > ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
> \ +
> >                                        ptrdiff_t dststride,
>
> > \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >   \ int h, int mx, int my);              \ \ -void
> > ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> > +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                         ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                 \ ptrdiff_t srcstride,
>
> >       \ int h, int mx, int my);             \ \ -void
> > ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                        ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                  \ ptrdiff_t srcstride,
>
> >         \ int h, int mx, int my);              \ \ -void
> > ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> > +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>
> >   \ +                                         ptrdiff_t dststride,
>
> >       \ const uint8_t *src,                 \ ptrdiff_t srcstride,
>
> >       \ int h, int mx, int my);
> > @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> > dststride,     \ const uint8_t *src, ptrdiff_t srcstride,   \ int h, int
> > mx, int my);
> >
> > -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> > -
> > -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> > -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> > +
> > +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> > +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> >
> >  VP9_BILINEAR_RISCV_RVV_FUNC(64);
> >  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 9606d8545f..314a1e5808 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) # endif
> >
> >  #if HAVE_RVV
> > -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128))
> > { +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> > +    if (ff_rv_vlen_least(128)) {
> >
> >  #define init_fpel(idx1, sz)                                           \
> >      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] =
> ff_vp9_avg##sz##_rvv;  \
> > @@ -85,7 +86,42 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][1][1][0] =
> > ff_avg_vp9_bilin_4h_rvv;
> >
> >  #undef init_fpel
> > +
> > +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> > +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> > +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> > +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> > +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> > +
> > +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> > +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> > +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> > +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> > +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> > +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> > +
> > +    init_subpel2(0, 1, 0, h, put, 128);
> > +    init_subpel2(1, 1, 0, h, avg, 128);
> > +
> > +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > +        init_subpel2(0, 0, 1, v, put, 128);
> > +        init_subpel2(1, 0, 1, v, avg, 128);
> > +    }
> > +
> > +    }
> > +    if (ff_rv_vlen_least(256)) {
> > +        init_subpel2(0, 1, 0, h, put, 256);
> > +        init_subpel2(1, 1, 0, h, avg, 256);
> > +
> > +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> > +            init_subpel2(0, 0, 1, v, put, 256);
> > +            init_subpel2(1, 0, 1, v, avg, 256);
> > +        }
> >      }
> > +    }
> > +
> >  #endif
> >  #endif
> >  }
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".