Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v

From: flow gg <hlefthleft@gmail.com>
To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v
Date: Sat, 15 Jun 2024 19:52:41 +0800
Message-ID: <CAEa-L+vmVSvnw+hyj0vcL5Ps9ejCE1u=DsexLfoUSCY45-6Pqg@mail.gmail.com> (raw)
In-Reply-To: <tencent_5EE6DCB0E4C3587842548B2C7887988A3509@qq.com>

> You can directly LLA filters + 16 * 8 * 2 and save one add. Same below.
You can
> also use .equ to alias the filter addresses, and avoid if's.

> That's a lot of address dependencies, which is going to hurt performance.
It
> might help to just spill more S registers if needed.

> This can be done in 3 instructions, even without mul. Of course you'll
again
> need a spare register.

Okay, updated them

> Use a macro parameter for the stride register.

Doing this will reduce one if-else statement in this patch, but in the next
patch, it will lead to adding multiple if-else statements. I think we can
leave it unchanged.

<uk7b@foxmail.com> 于2024年6月15日周六 19:51写道：

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
>                                                      C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c                      :   12.7   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    4.7    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c                      :   29.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    4.7    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c                      :   48.7   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c                      :   49.7   45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.0  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.7   19.5
> vp9_avg_8tap_smooth_16v_8bpp_c                     :  191.2  175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.2   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   68.2   60.5
> vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.0  685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.0   59.5
> vp9_avg_8tap_smooth_64h_8bpp_c                     : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  270.7  120.7
> vp9_avg_8tap_smooth_64v_8bpp_c                     : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  266.5  119.0
> vp9_put_8tap_smooth_4h_8bpp_c                      :   11.0    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    3.7
> vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.0    3.7
> vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c                      :   43.5   38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.7    7.7
> vp9_put_8tap_smooth_16h_8bpp_c                     :  181.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   20.0   18.0
> vp9_put_8tap_smooth_16v_8bpp_c                     :  168.5  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c                     :  675.0  586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   65.2   58.0
> vp9_put_8tap_smooth_32v_8bpp_c                     :  664.7  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.0   57.0
> vp9_put_8tap_smooth_64h_8bpp_c                     : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  259.7  115.7
> vp9_put_8tap_smooth_64v_8bpp_c                     : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 200 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
>  libavcodec/riscv/vp9dsp_init.c |  38 ++++++-
>  3 files changed, 285 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..5e81301aa5 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
>  endfunc
>  .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype
> +        lla             \regtype\()2, ff_vp9_subpel_filters_\name
> +
> +.ifc \type,v
> +        slli            \regtype\()0, a6, 4
> +.else
> +        slli            \regtype\()0, a5, 4
> +.endif
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +
> +        lh              \regtype\()1, 2(\regtype\()0)
> +        lh              \regtype\()2, 4(\regtype\()0)
> +        lh              \regtype\()3, 6(\regtype\()0)
> +        lh              \regtype\()4, 8(\regtype\()0)
> +        lh              \regtype\()5, 10(\regtype\()0)
> +        lh              \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> +        lh              a7, 14(\regtype\()0)
> +.else
> +        lh              s7, 14(\regtype\()0)
> +.endif
> +        lh              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        add             a5, a3, a2
> +        sub             a2, a2, a3
> +        vle8.v          v24, (a5)
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a5
> +        add             a5, a5, a3
> +        vle8.v          v26, (a5)
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a5, a2, 1
> +        addi            a2, a2, -1
> +        vle8.v          v24, (a5)
> +        vle8.v          v20, (a2)
> +        addi            a5, a5, 2
> +        addi            a2, a2, 3
> +        vle8.v          v28, (a5)
> +        vle8.v          v26, (a2)
> +        addi            a2, a5, 1
> +        vle8.v          v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        sh1add          a5, a3, a3
> +        sub             a2, a2, a5
> +        sub             a2, a2, a5
> +        sub             a5, a2, a3
> +        vle8.v          v28, (a2)
> +        vle8.v          v26, (a5)
> +        sh1add          a2, a3, a2
> +.else
> +        addi            a5, a2, -7
> +        addi            a2, a2, -6
> +        vle8.v          v26, (a5)
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 2
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        li              a5, 64
> +        vwadd.wx        v16, v16, a5
> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
> +        .irp op, put, avg
> +                .irp name, regular, sharp, smooth
> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
> +        .endr
>  .endr
>
>  bilin_h_v  put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const uint8_t *l,
>  void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                     const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>      \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)
>     \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>    \
> +                                         ptrdiff_t dststride,
>     \
>                                           const uint8_t *src,
>    \
>                                           ptrdiff_t srcstride,
>     \
>                                           int h, int mx, int my);
>    \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>    \
> +                                         ptrdiff_t dststride,
>     \
>                                           const uint8_t *src,
>    \
>                                           ptrdiff_t srcstride,
>     \
>                                           int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride,     \
>                          const uint8_t *src, ptrdiff_t srcstride,   \
>                          int h, int mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..3669070fca 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,9 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  # endif
>
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    int vlenb = ff_get_rv_vlenb();
> +    if (vlenb >= 16) {
>
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -95,6 +97,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (vlenb >= 32) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
> +    }
>      }
>  #endif
>  #endif
> --
> 2.45.2
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".