* [FFmpeg-devel] [PATCH 3/5] lavc/h264qpel: Add vectorized implementation of luma MC for RISC-V
From: Arnie Chang @ 2023-05-09 9:50 UTC
To: ffmpeg-devel; +Cc: Arnie Chang
Optimize luma motion compensation using RISC-V vector intrinsics.
The performance is evaluated using 720p videos.
Combining the vectorization of chroma and luma MC, the FPS is 1.49x that of the scalar code,
while applying only the chroma MC results in a 1.13x speedup.
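For reference, the kernels implement the standard H.264 6-tap quarter-pel lowpass filter.
A minimal scalar sketch of the horizontal filter for one output pixel (illustration only,
assuming the usual (1, -5, 20, 20, -5, 1) taps with round-to-nearest on the final shift;
h_lowpass_pixel is just an illustrative name, not part of this patch):

#include <stdint.h>

/*
 * Scalar reference for one output pixel of the horizontal lowpass.
 * The RVV helpers in this patch form the same grouped sums with
 * widening adds and vmacc, clamp negatives with vmax, and narrow
 * back to 8 bits with vnclipu (shift by 5).
 */
static uint8_t h_lowpass_pixel(const uint8_t *src)
{
    int v = (src[-2] + src[3]) - 5 * (src[-1] + src[2]) + 20 * (src[0] + src[1]);
    v = (v + 16) >> 5;                       /* round and scale */
    return v < 0 ? 0 : (v > 255 ? 255 : v);  /* clip to the 8-bit range */
}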
Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
libavcodec/h264qpel.c | 2 +
libavcodec/h264qpel.h | 1 +
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/h264_lowpass.h | 249 +++++
libavcodec/riscv/h264_mc_luma.c | 412 ++++++++
libavcodec/riscv/h264_mc_luma.h | 101 ++
libavcodec/riscv/h264_mc_luma_avg16.h | 1183 +++++++++++++++++++++++
libavcodec/riscv/h264_mc_luma_avg8.h | 773 +++++++++++++++
libavcodec/riscv/h264_mc_luma_put16.h | 963 ++++++++++++++++++
libavcodec/riscv/h264_mc_luma_put8.h | 648 +++++++++++++
libavcodec/riscv/h264_qpel_init_riscv.c | 107 ++
libavcodec/riscv/h264_utility.h | 75 ++
12 files changed, 4516 insertions(+)
create mode 100644 libavcodec/riscv/h264_lowpass.h
create mode 100644 libavcodec/riscv/h264_mc_luma.c
create mode 100644 libavcodec/riscv/h264_mc_luma.h
create mode 100644 libavcodec/riscv/h264_mc_luma_avg16.h
create mode 100644 libavcodec/riscv/h264_mc_luma_avg8.h
create mode 100644 libavcodec/riscv/h264_mc_luma_put16.h
create mode 100644 libavcodec/riscv/h264_mc_luma_put8.h
create mode 100644 libavcodec/riscv/h264_qpel_init_riscv.c
create mode 100644 libavcodec/riscv/h264_utility.h
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index 65fef03304..4293fa2a7b 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -108,5 +108,7 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
ff_h264qpel_init_mips(c, bit_depth);
#elif ARCH_LOONGARCH64
ff_h264qpel_init_loongarch(c, bit_depth);
+#elif ARCH_RISCV
+ ff_h264qpel_init_riscv(c, bit_depth);
#endif
}
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index 0259e8de23..f8425ea116 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -37,5 +37,6 @@ void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth);
void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth);
+void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth);
#endif /* AVCODEC_H264QPEL_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 08b76c93cb..088efa3b1e 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -22,3 +22,5 @@ RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
+OBJS-$(CONFIG_H264QPEL) += riscv/h264_qpel_init_riscv.o
+RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264_mc_luma.o
diff --git a/libavcodec/riscv/h264_lowpass.h b/libavcodec/riscv/h264_lowpass.h
new file mode 100644
index 0000000000..f416f7429f
--- /dev/null
+++ b/libavcodec/riscv/h264_lowpass.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_LOWPASS_H
+#define AVCODEC_RISCV_H264_LOWPASS_H
+#include <riscv_vector.h>
+
+
+__attribute__((always_inline)) static void v_lowpass_u8m1(vuint8m1_t *p_dst0, vuint8m1_t *p_dst1, vuint8m1_t row0, vuint8m1_t row1,
+ vuint8m1_t row2, vuint8m1_t row3, vuint8m1_t row4, vuint8m1_t row5,
+ vuint8m1_t row6, int vl)
+{
+ vuint16m2_t dst0 = __riscv_vwaddu_vv_u16m2(row0, row5, vl);
+ vuint16m2_t add00 = __riscv_vwaddu_vv_u16m2(row1, row4, vl);
+ vuint16m2_t add01 = __riscv_vwaddu_vv_u16m2(row2, row3, vl);
+
+ vuint16m2_t dst1 = __riscv_vwaddu_vv_u16m2(row1, row6, vl);
+ vuint16m2_t add10 = __riscv_vwaddu_vv_u16m2(row2, row5, vl);
+ vuint16m2_t add11 = __riscv_vwaddu_vv_u16m2(row3, row4, vl);
+
+ vint16m2_t dst0_s = __riscv_vreinterpret_v_u16m2_i16m2(dst0);
+ vint16m2_t dst1_s = __riscv_vreinterpret_v_u16m2_i16m2(dst1);
+
+ dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add01), vl);
+ dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add00), vl);
+ dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add11), vl);
+ dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add10), vl);
+
+ dst0_s = __riscv_vmax_vx_i16m2(dst0_s, 0, vl);
+ dst1_s = __riscv_vmax_vx_i16m2(dst1_s, 0, vl);
+
+ dst0 = __riscv_vreinterpret_v_i16m2_u16m2(dst0_s);
+ dst1 = __riscv_vreinterpret_v_i16m2_u16m2(dst1_s);
+
+ *p_dst0 = __riscv_vnclipu_wx_u8m1(dst0, 5, vl);
+ *p_dst1 = __riscv_vnclipu_wx_u8m1(dst1, 5, vl);
+}
+
+__attribute__((always_inline)) static void v_lowpass_u32m2(vuint32m2_t *p_dst0, vuint32m2_t *p_dst1, vint16m1_t *p_row0, vint16m1_t *p_row1,
+ vint16m1_t *p_row2, vint16m1_t *p_row3, vint16m1_t *p_row4, vint16m1_t *p_row5,
+ vint16m1_t *p_row6, ptrdiff_t stride, int vl)
+{
+ vint32m2_t dst0_s = __riscv_vwadd_vv_i32m2(*p_row0, *p_row5, vl);
+ vint32m2_t add00 = __riscv_vwadd_vv_i32m2(*p_row1, *p_row4, vl);
+ vint32m2_t add01 = __riscv_vwadd_vv_i32m2(*p_row2, *p_row3, vl);
+
+ vint32m2_t dst1_s = __riscv_vwadd_vv_i32m2(*p_row1, *p_row6, vl);
+ vint32m2_t add10 = __riscv_vwadd_vv_i32m2(*p_row2, *p_row5, vl);
+ vint32m2_t add11 = __riscv_vwadd_vv_i32m2(*p_row3, *p_row4, vl);
+
+ dst0_s = __riscv_vmacc_vx_i32m2(dst0_s, 20, add01, vl);
+ dst0_s = __riscv_vmacc_vx_i32m2(dst0_s, -5, add00, vl);
+ dst1_s = __riscv_vmacc_vx_i32m2(dst1_s, 20, add11, vl);
+ dst1_s = __riscv_vmacc_vx_i32m2(dst1_s, -5, add10, vl);
+
+ dst0_s = __riscv_vmax_vx_i32m2(dst0_s, 0, vl);
+ dst1_s = __riscv_vmax_vx_i32m2(dst1_s, 0, vl);
+
+ *p_dst0 = __riscv_vreinterpret_v_i32m2_u32m2(dst0_s);
+ *p_dst1 = __riscv_vreinterpret_v_i32m2_u32m2(dst1_s);
+}
+
+__attribute__((always_inline)) static void h_lowpass_i16m1(vint16m1_t *p_dst0, vint16m1_t *p_dst1, const uint8_t **pp_src, ptrdiff_t stride, int vl)
+{
+ vuint8mf2_t row00 = __riscv_vle8_v_u8mf2(*pp_src - 2, vl);
+ vuint8mf2_t row01 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 1, vl);
+ vuint8mf2_t row02 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 2, vl);
+ vuint8mf2_t row03 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 3, vl);
+ vuint8mf2_t row04 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 4, vl);
+ vuint8mf2_t row05 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 5, vl);
+ *pp_src += stride;
+
+ vuint8mf2_t row10 = __riscv_vle8_v_u8mf2(*pp_src - 2, vl);
+ vuint8mf2_t row11 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 1, vl);
+ vuint8mf2_t row12 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 2, vl);
+ vuint8mf2_t row13 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 3, vl);
+ vuint8mf2_t row14 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 4, vl);
+ vuint8mf2_t row15 = __riscv_vle8_v_u8mf2(*pp_src - 2 + 5, vl);
+ *pp_src += stride;
+
+ vuint16m1_t dst0_u = __riscv_vwaddu_vv_u16m1(row00, row05, vl);
+ vuint16m1_t add00 = __riscv_vwaddu_vv_u16m1(row01, row04, vl);
+ vuint16m1_t add01 = __riscv_vwaddu_vv_u16m1(row02, row03, vl);
+
+ vuint16m1_t dst1_u = __riscv_vwaddu_vv_u16m1(row10, row15, vl);
+ vuint16m1_t add10 = __riscv_vwaddu_vv_u16m1(row11, row14, vl);
+ vuint16m1_t add11 = __riscv_vwaddu_vv_u16m1(row12, row13, vl);
+
+ *p_dst0 = __riscv_vreinterpret_v_u16m1_i16m1(dst0_u);
+ *p_dst1 = __riscv_vreinterpret_v_u16m1_i16m1(dst1_u);
+
+ *p_dst0 = __riscv_vmacc_vx_i16m1(*p_dst0, 20, __riscv_vreinterpret_v_u16m1_i16m1(add01), vl);
+ *p_dst0 = __riscv_vmacc_vx_i16m1(*p_dst0, -5, __riscv_vreinterpret_v_u16m1_i16m1(add00), vl);
+ *p_dst1 = __riscv_vmacc_vx_i16m1(*p_dst1, 20, __riscv_vreinterpret_v_u16m1_i16m1(add11), vl);
+ *p_dst1 = __riscv_vmacc_vx_i16m1(*p_dst1, -5, __riscv_vreinterpret_v_u16m1_i16m1(add10), vl);
+}
+
+__attribute__((always_inline)) static void h_lowpass_u16m2(vuint16m2_t *p_dst0, vuint16m2_t *p_dst1, const uint8_t **pp_src, ptrdiff_t stride, int vl)
+{
+ vuint8m1_t row00 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+ vuint8m1_t row01 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+ vuint8m1_t row02 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+ vuint8m1_t row03 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+ vuint8m1_t row04 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+ vuint8m1_t row05 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+ *pp_src += stride;
+
+ vuint8m1_t row10 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+ vuint8m1_t row11 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+ vuint8m1_t row12 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+ vuint8m1_t row13 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+ vuint8m1_t row14 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+ vuint8m1_t row15 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+ *pp_src += stride;
+
+ *p_dst0 = __riscv_vwaddu_vv_u16m2(row00, row05, vl);
+ vuint16m2_t add00 = __riscv_vwaddu_vv_u16m2(row01, row04, vl);
+ vuint16m2_t add01 = __riscv_vwaddu_vv_u16m2(row02, row03, vl);
+
+ *p_dst1 = __riscv_vwaddu_vv_u16m2(row10, row15, vl);
+ vuint16m2_t add10 = __riscv_vwaddu_vv_u16m2(row11, row14, vl);
+ vuint16m2_t add11 = __riscv_vwaddu_vv_u16m2(row12, row13, vl);
+
+ vint16m2_t dst0_s = __riscv_vreinterpret_v_u16m2_i16m2(*p_dst0);
+ vint16m2_t dst1_s = __riscv_vreinterpret_v_u16m2_i16m2(*p_dst1);
+
+ dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add01), vl);
+ dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add00), vl);
+ dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add11), vl);
+ dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add10), vl);
+
+ dst0_s = __riscv_vmax_vx_i16m2(dst0_s, 0, vl);
+ dst1_s = __riscv_vmax_vx_i16m2(dst1_s, 0, vl);
+
+ *p_dst0 = __riscv_vreinterpret_v_i16m2_u16m2(dst0_s);
+ *p_dst1 = __riscv_vreinterpret_v_i16m2_u16m2(dst1_s);
+}
+
+__attribute__((always_inline)) static void h_lowpass_u8m1_l2src(vuint8m1_t *p_dst0, vuint8m1_t *p_dst1, const uint8_t **pp_src, ptrdiff_t stride, int vl)
+{
+ vuint8m1_t row00 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+ vuint8m1_t row01 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+ vuint8m1_t row02 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+ vuint8m1_t row03 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+ vuint8m1_t row04 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+ vuint8m1_t row05 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+ *pp_src += stride;
+
+ vuint8m1_t row10 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+ vuint8m1_t row11 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+ vuint8m1_t row12 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+ vuint8m1_t row13 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+ vuint8m1_t row14 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+ vuint8m1_t row15 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+ *pp_src += stride;
+
+ vuint16m2_t dst0_u = __riscv_vwaddu_vv_u16m2(row00, row05, vl);
+ vuint16m2_t add00 = __riscv_vwaddu_vv_u16m2(row01, row04, vl);
+ vuint16m2_t add01 = __riscv_vwaddu_vv_u16m2(row02, row03, vl);
+
+ vuint16m2_t dst1_u = __riscv_vwaddu_vv_u16m2(row10, row15, vl);
+ vuint16m2_t add10 = __riscv_vwaddu_vv_u16m2(row11, row14, vl);
+ vuint16m2_t add11 = __riscv_vwaddu_vv_u16m2(row12, row13, vl);
+
+ vint16m2_t dst0_s = __riscv_vreinterpret_v_u16m2_i16m2(dst0_u);
+ vint16m2_t dst1_s = __riscv_vreinterpret_v_u16m2_i16m2(dst1_u);
+
+ dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add01), vl);
+ dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add00), vl);
+ dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add11), vl);
+ dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add10), vl);
+
+ dst0_s = __riscv_vmax_vx_i16m2(dst0_s, 0, vl);
+ dst1_s = __riscv_vmax_vx_i16m2(dst1_s, 0, vl);
+
+ dst0_u = __riscv_vreinterpret_v_i16m2_u16m2(dst0_s);
+ dst1_u = __riscv_vreinterpret_v_i16m2_u16m2(dst1_s);
+
+ *p_dst0 = __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl);
+ *p_dst1 = __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl);
+
+ *p_dst0 = __riscv_vaaddu_vv_u8m1(*p_dst0, row02, vl);
+ *p_dst1 = __riscv_vaaddu_vv_u8m1(*p_dst1, row12, vl);
+}
+
+__attribute__((always_inline)) static void h_lowpass_u8m1_l2src_shift(vuint8m1_t *p_dst0, vuint8m1_t *p_dst1, const uint8_t **pp_src, ptrdiff_t stride, int vl)
+{
+ vuint8m1_t row00 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+ vuint8m1_t row01 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+ vuint8m1_t row02 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+ vuint8m1_t row03 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+ vuint8m1_t row04 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+ vuint8m1_t row05 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+ *pp_src += stride;
+
+ vuint8m1_t row10 = __riscv_vle8_v_u8m1(*pp_src - 2, vl);
+ vuint8m1_t row11 = __riscv_vle8_v_u8m1(*pp_src - 2 + 1, vl);
+ vuint8m1_t row12 = __riscv_vle8_v_u8m1(*pp_src - 2 + 2, vl);
+ vuint8m1_t row13 = __riscv_vle8_v_u8m1(*pp_src - 2 + 3, vl);
+ vuint8m1_t row14 = __riscv_vle8_v_u8m1(*pp_src - 2 + 4, vl);
+ vuint8m1_t row15 = __riscv_vle8_v_u8m1(*pp_src - 2 + 5, vl);
+ *pp_src += stride;
+
+ vuint16m2_t dst0_u = __riscv_vwaddu_vv_u16m2(row00, row05, vl);
+ vuint16m2_t add00 = __riscv_vwaddu_vv_u16m2(row01, row04, vl);
+ vuint16m2_t add01 = __riscv_vwaddu_vv_u16m2(row02, row03, vl);
+
+ vuint16m2_t dst1_u = __riscv_vwaddu_vv_u16m2(row10, row15, vl);
+ vuint16m2_t add10 = __riscv_vwaddu_vv_u16m2(row11, row14, vl);
+ vuint16m2_t add11 = __riscv_vwaddu_vv_u16m2(row12, row13, vl);
+
+ vint16m2_t dst0_s = __riscv_vreinterpret_v_u16m2_i16m2(dst0_u);
+ vint16m2_t dst1_s = __riscv_vreinterpret_v_u16m2_i16m2(dst1_u);
+
+ dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add01), vl);
+ dst0_s = __riscv_vmacc_vx_i16m2(dst0_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add00), vl);
+ dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, 20, __riscv_vreinterpret_v_u16m2_i16m2(add11), vl);
+ dst1_s = __riscv_vmacc_vx_i16m2(dst1_s, -5, __riscv_vreinterpret_v_u16m2_i16m2(add10), vl);
+
+ dst0_s = __riscv_vmax_vx_i16m2(dst0_s, 0, vl);
+ dst1_s = __riscv_vmax_vx_i16m2(dst1_s, 0, vl);
+
+ dst0_u = __riscv_vreinterpret_v_i16m2_u16m2(dst0_s);
+ dst1_u = __riscv_vreinterpret_v_i16m2_u16m2(dst1_s);
+
+ *p_dst0 = __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl);
+ *p_dst1 = __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl);
+
+ *p_dst0 = __riscv_vaaddu_vv_u8m1(*p_dst0, row03, vl);
+ *p_dst1 = __riscv_vaaddu_vv_u8m1(*p_dst1, row13, vl);
+}
+#endif
diff --git a/libavcodec/riscv/h264_mc_luma.c b/libavcodec/riscv/h264_mc_luma.c
new file mode 100644
index 0000000000..4047c0ff4e
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264_mc_luma.h"
+#if HAVE_INTRINSICS_RVV
+#include <riscv_vector.h>
+#include "h264_mc_luma_put16.h"
+#include "h264_mc_luma_avg16.h"
+#include "h264_mc_luma_put8.h"
+#include "h264_mc_luma_avg8.h"
+
+void put_h264_qpel16_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_copy_block16(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel16_v_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel16_v_lowpass(p_dst, p_src, stride, stride);
+}
+
+void put_h264_qpel16_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel16_v_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel16_h_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+ put_h264_qpel16_v_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_v_lowpass(temp, p_src, 16, stride);
+ put_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+ put_h264_qpel16_v_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel16_h_lowpass(p_dst, p_src, stride, stride);
+}
+
+void put_h264_qpel16_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+ put_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel16_hv_lowpass(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+ put_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel16_h_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void put_h264_qpel16_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+ put_h264_qpel16_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_v_lowpass(temp, p_src + 1, 16, stride);
+ put_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void put_h264_qpel16_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+ put_h264_qpel16_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_copy_block16(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel16_v_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel16_v_lowpass(p_dst, p_src, stride, stride);
+}
+
+void avg_h264_qpel16_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel16_v_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel16_h_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+ avg_h264_qpel16_v_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_v_lowpass(temp, p_src, 16, stride);
+ avg_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+ avg_h264_qpel16_v_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel16_h_lowpass(p_dst, p_src, stride, stride);
+}
+
+void avg_h264_qpel16_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+ avg_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel16_hv_lowpass(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+ avg_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel16_h_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel16_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src, 16, stride);
+ avg_h264_qpel16_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_v_lowpass(temp, p_src + 1, 16, stride);
+ avg_h264_qpel16_hv_lowpass_l2(p_dst, p_src, temp, stride, 16);
+}
+
+void avg_h264_qpel16_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[256] = {0};
+ put_h264_qpel16_h_lowpass(temp, p_src + stride, 16, stride);
+ avg_h264_qpel16_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 16);
+}
+
+void put_h264_qpel8_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_copy_block8(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel8_v_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel8_v_lowpass(p_dst, p_src, stride, stride);
+}
+
+void put_h264_qpel8_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel8_v_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel8_h_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+ put_h264_qpel8_v_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_v_lowpass(temp, p_src, 8, stride);
+ put_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel8_h_lowpass(p_dst, p_src, stride, stride);
+}
+
+void put_h264_qpel8_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+ put_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel8_hv_lowpass(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+ put_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ put_h264_qpel8_h_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void put_h264_qpel8_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+ put_h264_qpel8_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_v_lowpass(temp, p_src + 1, 8, stride);
+ put_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void put_h264_qpel8_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_copy_block8(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel8_v_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel8_v_lowpass(p_dst, p_src, stride, stride);
+}
+
+void avg_h264_qpel8_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel8_v_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel8_h_lowpass_l2src(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+ avg_h264_qpel8_v_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_v_lowpass(temp, p_src, 8, stride);
+ avg_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+ avg_h264_qpel8_v_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel8_h_lowpass(p_dst, p_src, stride, stride);
+}
+
+void avg_h264_qpel8_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+ avg_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel8_hv_lowpass(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+ avg_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ avg_h264_qpel8_h_lowpass_l2src_shift(p_dst, p_src, stride);
+}
+
+void avg_h264_qpel8_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src, 8, stride);
+ avg_h264_qpel8_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_v_lowpass(temp, p_src + 1, 8, stride);
+ avg_h264_qpel8_hv_lowpass_l2(p_dst, p_src, temp, stride, 8);
+}
+
+void avg_h264_qpel8_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ uint8_t temp[64] = {0};
+ put_h264_qpel8_h_lowpass(temp, p_src + stride, 8, stride);
+ avg_h264_qpel8_v_lowpass_l2(p_dst, p_src + 1, temp, stride, 8);
+}
+#endif
diff --git a/libavcodec/riscv/h264_mc_luma.h b/libavcodec/riscv/h264_mc_luma.h
new file mode 100644
index 0000000000..78d7c41a5f
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_H
+#define AVCODEC_RISCV_H264_MC_LUMA_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
+void put_h264_qpel16_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel16_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+
+void avg_h264_qpel16_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel16_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+
+void put_h264_qpel8_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void put_h264_qpel8_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+
+void avg_h264_qpel8_mc00_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc01_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc02_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc03_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc10_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc11_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc12_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc13_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc20_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc21_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc22_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc23_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc30_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc31_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc32_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+void avg_h264_qpel8_mc33_8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride);
+#endif
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_luma_avg16.h b/libavcodec/riscv/h264_mc_luma_avg16.h
new file mode 100644
index 0000000000..7f2aacd00d
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma_avg16.h
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_AVG16_H
+#define AVCODEC_RISCV_H264_MC_LUMA_AVG16_H
+#include <riscv_vector.h>
+#include "h264_utility.h"
+#include "h264_lowpass.h"
+
+__attribute__((always_inline)) static void avg_copy_block16(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 16; j += 8)
+ {
+ vuint8m1_t src0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src7 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 2, vl);
+ vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 3, vl);
+ vuint8m1_t dst4 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 4, vl);
+ vuint8m1_t dst5 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 5, vl);
+ vuint8m1_t dst6 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 6, vl);
+ vuint8m1_t dst7 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 7, vl);
+
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, src0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, src1, vl);
+ dst2 = __riscv_vaaddu_vv_u8m1(dst2, src2, vl);
+ dst3 = __riscv_vaaddu_vv_u8m1(dst3, src3, vl);
+ dst4 = __riscv_vaaddu_vv_u8m1(dst4, src4, vl);
+ dst5 = __riscv_vaaddu_vv_u8m1(dst5, src5, vl);
+ dst6 = __riscv_vaaddu_vv_u8m1(dst6, src6, vl);
+ dst7 = __riscv_vaaddu_vv_u8m1(dst7, src7, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst2, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst3, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst4, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst5, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst6, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst7, vl);
+ p_dst_iter += stride;
+ }
+
+ p_src_iter = p_src_begin + vl;
+ p_dst_iter = p_dst_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void avg_h264_qpel16_h_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 16; j += 2)
+ {
+ vuint16m2_t dst0_u, dst1_u;
+ h_lowpass_u16m2(&dst0_u, &dst1_u, &p_src_iter, src_stride, vl);
+
+ vuint8m1_t dst0_nrw = __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl);
+ vuint8m1_t dst1_nrw = __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl);
+
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0_nrw, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1_nrw, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void avg_h264_qpel16_hv_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride << 1);
+
+ vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+ vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+ h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+ vuint32m2_t dst0, dst1;
+ v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+
+ vuint8mf2_t dst0_u8, dst1_u8;
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ vuint8mf2_t avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ vuint8mf2_t avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ vint16m1_t h_row14, h_row15, h_row16, h_row17, h_row18, h_row19, h_row20, h_row21;
+
+ h_lowpass_i16m1(&h_row14, &h_row15, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row16, &h_row17, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row18, &h_row19, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row20, &h_row21, &p_src_iter, stride, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, &h_row19, &h_row20, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void avg_h264_qpel16_hv_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, uint8_t *p_l2_src, ptrdiff_t stride, ptrdiff_t l2_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ const uint8_t *p_l2_src_iter = p_l2_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ const uint8_t *p_l2_src_begin = p_l2_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride << 1);
+
+ vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+ vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+ h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+ vuint32m2_t dst0, dst1;
+ vuint8mf2_t dst0_u8, dst1_u8;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ vint16m1_t h_row14, h_row15, h_row16, h_row17, h_row18, h_row19, h_row20, h_row21;
+ h_lowpass_i16m1(&h_row14, &h_row15, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row16, &h_row17, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row18, &h_row19, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row20, &h_row21, &p_src_iter, stride, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, &h_row19, &h_row20, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ p_l2_src_iter = p_l2_src_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void avg_h264_qpel16_v_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (src_stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 9th, 10th dst
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 11th, 12th dst
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 13th, 14th dst
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 15th, 16th dst
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void avg_h264_qpel16_v_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, const uint8_t *p_l2_src, int stride, int l2_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ const uint8_t *p_l2_src_iter = p_l2_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ const uint8_t *p_l2_src_begin = p_l2_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ vuint8m1_t l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 9th, 10th dst
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 11th, 12th dst
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 13th, 14th dst
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 15th, 16th dst
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ p_l2_src_iter = p_l2_src_begin + vl;
+ len -= vl;
+ }
+}
+
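+// Same vertical filter, but the second average operand is the co-located
+// source row itself rather than a separate prediction buffer.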
+__attribute__((always_inline)) static void avg_h264_qpel16_v_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 9th, 10th dst
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 11th, 12th dst
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 13th, 14th dst
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 15th, 16th dst
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
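+// Horizontal low-pass, two rows per iteration; the h_lowpass_u8m1_l2src helper
+// also folds in the unfiltered source sample, and the result is averaged into
+// the existing destination.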
+__attribute__((always_inline)) static void avg_h264_qpel16_h_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 16; j += 2)
+ {
+ vuint8m1_t dst0, dst1;
+ h_lowpass_u8m1_l2src(&dst0, &dst1, &p_src_iter, stride, vl);
+
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
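+// Shifted variant: the helper averages against the source sample one pixel to
+// the right before the result is averaged into the destination.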
+__attribute__((always_inline)) static void avg_h264_qpel16_h_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 16; j += 2)
+ {
+ vuint8m1_t dst0, dst1;
+ h_lowpass_u8m1_l2src_shift(&dst0, &dst1, &p_src_iter, stride, vl);
+
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
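+// Shifted vertical variant: the filter output is averaged with the source row
+// one line below, then averaged into the existing destination.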
+__attribute__((always_inline)) static void avg_h264_qpel16_v_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 9th, 10th dst
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 11th, 12th dst
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 13th, 14th dst
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 15th, 16th dst
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_luma_avg8.h b/libavcodec/riscv/h264_mc_luma_avg8.h
new file mode 100644
index 0000000000..789bc90c44
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma_avg8.h
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_AVG8_H
+#define AVCODEC_RISCV_H264_MC_LUMA_AVG8_H
+#include <riscv_vector.h>
+#include "h264_utility.h"
+#include "h264_lowpass.h"
+
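+// 8x8 averaging copy: each source row is averaged (vaaddu) with the existing
+// destination row and written back.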
+__attribute__((always_inline)) static void avg_copy_block8(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while(len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ vuint8m1_t src0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src7 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 2, vl);
+ vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 3, vl);
+ vuint8m1_t dst4 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 4, vl);
+ vuint8m1_t dst5 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 5, vl);
+ vuint8m1_t dst6 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 6, vl);
+ vuint8m1_t dst7 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 7, vl);
+
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, src0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, src1, vl);
+ dst2 = __riscv_vaaddu_vv_u8m1(dst2, src2, vl);
+ dst3 = __riscv_vaaddu_vv_u8m1(dst3, src3, vl);
+ dst4 = __riscv_vaaddu_vv_u8m1(dst4, src4, vl);
+ dst5 = __riscv_vaaddu_vv_u8m1(dst5, src5, vl);
+ dst6 = __riscv_vaaddu_vv_u8m1(dst6, src6, vl);
+ dst7 = __riscv_vaaddu_vv_u8m1(dst7, src7, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst2, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst3, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst4, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst5, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst6, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst7, vl);
+ p_dst_iter += stride;
+
+ p_src_iter = p_src_begin + vl;
+ p_dst_iter = p_dst_begin + vl;
+ len -= vl;
+ }
+}
+
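+// 8x8 horizontal 6-tap low-pass computed at 16-bit precision, narrowed with
+// vnclipu (shift 5) and averaged into the existing destination.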
+__attribute__((always_inline)) static void avg_h264_qpel8_h_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while(len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 8; j += 2)
+ {
+ vuint16m2_t dst0_u, dst1_u;
+ h_lowpass_u16m2(&dst0_u, &dst1_u, &p_src_iter, src_stride, vl);
+
+ vuint8m1_t dst0_nrw = __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl);
+ vuint8m1_t dst1_nrw = __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl);
+
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0_nrw, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1_nrw, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
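+// 8x8 2-D low-pass: the horizontal pass keeps 16-bit intermediates, the
+// vertical pass accumulates into 32-bit, and the result is converted back to
+// 8-bit (u32_to_u8) and averaged into the existing destination.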
+__attribute__((always_inline)) static void avg_h264_qpel8_hv_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while(len > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride << 1);
+
+ vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+ vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+ h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+ vuint32m2_t dst0, dst1;
+ v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+
+ vuint8mf2_t dst0_u8, dst1_u8;
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ vuint8mf2_t avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ vuint8mf2_t avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ avg0 = __riscv_vle8_v_u8mf2(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8mf2(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8mf2(avg0, dst0_u8, vl);
+ avg1 = __riscv_vaaddu_vv_u8mf2(avg1, dst1_u8, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
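+// Same 2-D low-pass, with the additional average against the L2 prediction
+// handled by the avg_average_l2 helper.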
+__attribute__((always_inline)) static void avg_h264_qpel8_hv_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, uint8_t *p_l2_src, ptrdiff_t stride, ptrdiff_t l2_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride << 1);
+
+ vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+ vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+ h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+ vuint32m2_t dst0, dst1;
+ vuint8mf2_t dst0_u8, dst1_u8;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ avg_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
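+// 8x8 vertical 6-tap low-pass averaged into the existing destination,
+// two output rows per filter call.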
+__attribute__((always_inline)) static void avg_h264_qpel8_v_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (src_stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += dst_stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + dst_stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
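+// 8x8 vertical low-pass averaged with a second prediction (p_l2_src) and then
+// with the existing destination.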
+__attribute__((always_inline)) static void avg_h264_qpel8_v_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, const uint8_t *p_l2_src, int stride, int l2_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+        int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+ vuint8m1_t l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
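+// As above, but the second average operand is the co-located source row
+// rather than a separate buffer.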
+__attribute__((always_inline)) static void avg_h264_qpel8_v_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
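+// 8x8 horizontal low-pass, two rows per iteration, using the
+// h_lowpass_u8m1_l2src helper (which also folds in the unfiltered source);
+// the result is averaged into the existing destination.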
+__attribute__((always_inline)) static void avg_h264_qpel8_h_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 8; j += 2)
+ {
+ vuint8m1_t dst0, dst1;
+ h_lowpass_u8m1_l2src(&dst0, &dst1, &p_src_iter, stride, vl);
+
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
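+// Shifted variant: the helper averages against the source sample one pixel to
+// the right.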
+__attribute__((always_inline)) static void avg_h264_qpel8_h_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 8; j += 2)
+ {
+ vuint8m1_t dst0, dst1;
+ h_lowpass_u8m1_l2src_shift(&dst0, &dst1, &p_src_iter, stride, vl);
+
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
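+// Shifted vertical variant: the filter output is averaged with the source row
+// one line below before being averaged into the destination.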
+__attribute__((always_inline)) static void avg_h264_qpel8_v_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+ vuint8m1_t avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+ avg0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ avg1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ avg0 = __riscv_vaaddu_vv_u8m1(avg0, dst0, vl);
+ avg1 = __riscv_vaaddu_vv_u8m1(avg1, dst1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_luma_put16.h b/libavcodec/riscv/h264_mc_luma_put16.h
new file mode 100644
index 0000000000..5a03507b0a
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma_put16.h
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_PUT16_H
+#define AVCODEC_RISCV_H264_MC_LUMA_PUT16_H
+#include <riscv_vector.h>
+#include "h264_lowpass.h"
+#include "h264_utility.h"
+
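+// Plain 16x16 block copy, processed in vl-wide column strips.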
+__attribute__((always_inline)) static void put_copy_block16(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 16; j += 8)
+ {
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row7 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_dst_iter, row0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row1, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row2, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row3, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row4, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row5, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row6, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row7, vl);
+ p_dst_iter += stride;
+ }
+
+ p_src_iter = p_src_begin + vl;
+ p_dst_iter = p_dst_begin + vl;
+ len -= vl;
+ }
+}
+
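+// 16x16 horizontal 6-tap low-pass computed at 16-bit precision, narrowed with
+// vnclipu (shift 5) and stored directly (no averaging with the destination).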
+__attribute__((always_inline)) static void put_h264_qpel16_h_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 16; j += 2)
+ {
+ vuint16m2_t dst0_u, dst1_u;
+ h_lowpass_u16m2(&dst0_u, &dst1_u, &p_src_iter, src_stride, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl), vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl), vl);
+ p_dst_iter += dst_stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
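+// 16x16 2-D low-pass: 16-bit horizontal intermediates, 32-bit vertical
+// accumulation, converted back to 8-bit (u32_to_u8) and stored directly.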
+__attribute__((always_inline)) static void put_h264_qpel16_hv_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while(len > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride << 1);
+
+ vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+ vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+ h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+ vuint32m2_t dst0, dst1;
+ v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+
+ vuint8mf2_t dst0_u8, dst1_u8;
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ vint16m1_t h_row14, h_row15, h_row16, h_row17, h_row18, h_row19, h_row20, h_row21;
+
+ h_lowpass_i16m1(&h_row14, &h_row15, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row16, &h_row17, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row18, &h_row19, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row20, &h_row21, &p_src_iter, stride, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, &h_row19, &h_row20, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
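+// 16-wide 2D (horizontal then vertical) 6-tap lowpass, averaged with a second
+// prediction: each h_lowpass_i16m1() call yields two horizontally filtered rows
+// kept at 16-bit precision, v_lowpass_u32m2() filters seven of them vertically
+// into 32-bit sums, u32_to_u8() narrows back to 8 bits, and put_average_l2()
+// rounding-averages the result with rows read from p_l2_src.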
+__attribute__((always_inline)) static void put_h264_qpel16_hv_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, uint8_t *p_l2_src, ptrdiff_t stride, ptrdiff_t l2_stride)
+{
+ const uint8_t *p_src_iter = p_src - (stride << 1);
+ const uint8_t *p_l2_src_iter = p_l2_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+ uint8_t *p_l2_src_begin = p_l2_src_iter;
+
+ vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+ vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+ h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+ vuint32m2_t dst0, dst1;
+ vuint8mf2_t dst0_u8, dst1_u8;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ vint16m1_t h_row14, h_row15, h_row16, h_row17, h_row18, h_row19, h_row20, h_row21;
+ h_lowpass_i16m1(&h_row14, &h_row15, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row16, &h_row17, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row18, &h_row19, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row20, &h_row21, &p_src_iter, stride, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row10, &h_row11, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row12, &h_row13, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row14, &h_row15, &h_row16, &h_row17, &h_row18, &h_row19, &h_row20, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src_iter, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ p_l2_src_iter = p_l2_src_begin + vl;
+ len -= vl;
+ }
+}
+
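+// 16-wide vertical-only 6-tap lowpass: a sliding window of seven source rows is
+// kept in registers and rotated, so each row is loaded once per strip and
+// v_lowpass_u8m1() emits two output rows per call.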
+__attribute__((always_inline)) static void put_h264_qpel16_v_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (src_stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 9th, 10th dst
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 11th, 12th dst
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 13th, 14th dst
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 15th, 16th dst
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
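+// Same vertical lowpass as above, but every pair of filtered rows is
+// rounding-averaged (vaaddu) with two rows read from p_l2_src before storing.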
+__attribute__((always_inline)) static void put_h264_qpel16_v_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, const uint8_t *p_l2_src, int stride, int l2_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ const uint8_t *p_l2_src_iter = p_l2_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ const uint8_t *p_l2_src_begin = p_l2_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ vuint8m1_t l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 9th, 10th dst
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 11th, 12th dst
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 13th, 14th dst
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 15th, 16th dst
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src_iter, vl);
+ p_l2_src_iter += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ p_l2_src_iter = p_l2_src_begin + vl;
+ len -= vl;
+ }
+}
+
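+// Vertical lowpass averaged with the unfiltered source itself: the source rows
+// co-located with the two outputs are taken straight from the rolling window
+// (row2/row3 on the first pass), so no second pass over memory is needed.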
+__attribute__((always_inline)) static void put_h264_qpel16_v_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 9th, 10th dst
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 11th, 12th dst
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 13th, 14th dst
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 15th, 16th dst
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
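+// Horizontal lowpass averaged with the co-located source pixels; the _l2src
+// helper folds the filter and the average with the source into one pass.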
+__attribute__((always_inline)) static void put_h264_qpel16_h_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 16; j += 2)
+ {
+ vuint8m1_t dst0, dst1;
+ h_lowpass_u8m1_l2src(&dst0, &dst1, &p_src_iter, stride, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
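+// As above, but the _shift helper averages against the source offset by one
+// pixel, covering the opposite horizontal quarter-pel position.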
+__attribute__((always_inline)) static void put_h264_qpel16_h_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 16; j += 2)
+ {
+ vuint8m1_t dst0, dst1;
+ h_lowpass_u8m1_l2src_shift(&dst0, &dst1, &p_src_iter, stride, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
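+// Vertical lowpass averaged with the source row one line below the output
+// (the src + stride quarter-pel case): the average uses row3/row4 of the
+// first window instead of row2/row3.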
+__attribute__((always_inline)) static void put_h264_qpel16_v_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 16;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 9th, 10th dst
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row1, row2, row3, row4, row5, row6, row0, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 11th, 12th dst
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row3, row4, row5, row6, row0, row1, row2, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 13th, 14th dst
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row5, row6, row0, row1, row2, row3, row4, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 15th, 16th dst
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_luma_put8.h b/libavcodec/riscv/h264_mc_luma_put8.h
new file mode 100644
index 0000000000..d1cfb90f80
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_luma_put8.h
@@ -0,0 +1,648 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_LUMA_PUT8_H
+#define AVCODEC_RISCV_H264_MC_LUMA_PUT8_H
+#include <riscv_vector.h>
+#include "h264_lowpass.h"
+#include "h264_utility.h"
+
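+// Plain 8x8 block copy, processed in vector strips of vl bytes.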
+__attribute__((always_inline)) static void put_copy_block8(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row7 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ __riscv_vse8_v_u8m1(p_dst_iter, row0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row1, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row2, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row3, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row4, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row5, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row6, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, row7, vl);
+ p_dst_iter += stride;
+
+ p_src_iter = p_src_begin + vl;
+ p_dst_iter = p_dst_begin + vl;
+ len -= vl;
+ }
+}
+
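+// 8-wide horizontal 6-tap lowpass using the H.264 kernel (1, -5, 20, 20, -5, 1):
+// h_lowpass_u16m2() returns two rows of 16-bit sums, and vnclipu with a shift
+// of 5 turns each into (sum + 16) >> 5 saturated to 8 bits (this relies on the
+// fixed-point rounding mode being round-to-nearest-up).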
+__attribute__((always_inline)) static void put_h264_qpel8_h_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 8; j += 2)
+ {
+ vuint16m2_t dst0_u, dst1_u;
+ h_lowpass_u16m2(&dst0_u, &dst1_u, &p_src_iter, src_stride, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0_u, 5, vl), vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1_u, 5, vl), vl);
+ p_dst_iter += dst_stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
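+// 8-wide 2D (horizontal then vertical) 6-tap lowpass: 14 horizontally filtered
+// 16-bit rows cover source rows -2..11, and four vertical passes produce the
+// eight output rows, narrowed via u32_to_u8().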
+__attribute__((always_inline)) static void put_h264_qpel8_hv_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride << 1);
+
+ vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+ vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+ h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+ vuint32m2_t dst0, dst1;
+ v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+
+ vuint8mf2_t dst0_u8, dst1_u8;
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+ p_dst_iter += stride;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst0_u8, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8mf2(p_dst_iter, dst1_u8, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
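+// Like put_h264_qpel8_hv_lowpass(), but each pair of rows is rounding-averaged
+// with the l2 source via put_average_l2(). Note that p_l2_src is advanced in
+// place rather than rewound per strip, so the loop effectively assumes the
+// whole 8-pixel width is handled in a single strip.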
+__attribute__((always_inline)) static void put_h264_qpel8_hv_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, uint8_t *p_l2_src, ptrdiff_t stride, ptrdiff_t l2_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride << 1);
+
+ vint16m1_t h_row0, h_row1, h_row2, h_row3, h_row4, h_row5, h_row6, h_row7;
+ vint16m1_t h_row8, h_row9, h_row10, h_row11, h_row12, h_row13;
+
+ h_lowpass_i16m1(&h_row0, &h_row1, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row2, &h_row3, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row4, &h_row5, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row6, &h_row7, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row8, &h_row9, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row10, &h_row11, &p_src_iter, stride, vl);
+ h_lowpass_i16m1(&h_row12, &h_row13, &p_src_iter, stride, vl);
+
+ vuint32m2_t dst0, dst1;
+ vuint8mf2_t dst0_u8, dst1_u8;
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row0, &h_row1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row2, &h_row3, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row4, &h_row5, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ v_lowpass_u32m2(&dst0, &dst1, &h_row6, &h_row7, &h_row8, &h_row9, &h_row10, &h_row11, &h_row12, stride, vl);
+ u32_to_u8(&dst0_u8, &dst1_u8, dst0, dst1, vl);
+ put_average_l2(&p_dst_iter, &p_l2_src, stride, l2_stride, dst0_u8, dst1_u8, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
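+// 8-wide vertical-only 6-tap lowpass using the same rotating seven-row window
+// as the 16-wide variant, producing two output rows per v_lowpass_u8m1() call.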
+__attribute__((always_inline)) static void put_h264_qpel8_v_lowpass(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t dst_stride, ptrdiff_t src_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (src_stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += dst_stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += src_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += dst_stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
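+// Vertical lowpass rounding-averaged with a second source; as in the hv _l2
+// variant above, p_l2_src is advanced in place and is not rewound per strip.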
+__attribute__((always_inline)) static void put_h264_qpel8_v_lowpass_l2(uint8_t *p_dst, const uint8_t *p_src, const uint8_t *p_l2_src, int stride, int l2_stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+ vuint8m1_t l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ l2_row0 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+ l2_row1 = __riscv_vle8_v_u8m1(p_l2_src, vl);
+ p_l2_src += l2_stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, l2_row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, l2_row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel8_v_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row4, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row5, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row6, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row0, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row1, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row2, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel8_h_lowpass_l2src(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 8; j += 2)
+ {
+ vuint8m1_t dst0, dst1;
+ h_lowpass_u8m1_l2src(&dst0, &dst1, &p_src_iter, stride, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel8_h_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < 8; j += 2)
+ {
+ vuint8m1_t dst0, dst1;
+ h_lowpass_u8m1_l2src_shift(&dst0, &dst1, &p_src_iter, stride, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+ }
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+
+__attribute__((always_inline)) static void put_h264_qpel8_v_lowpass_l2src_shift(uint8_t *p_dst, const uint8_t *p_src, int stride)
+{
+ const uint8_t *p_src_iter = p_src;
+ uint8_t *p_dst_iter = p_dst;
+ int len = 8;
+
+ while (len > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(len);
+ const uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ p_src_iter -= (stride * 2);
+
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t row6 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0, dst1;
+ v_lowpass_u8m1(&dst0, &dst1, row0, row1, row2, row3, row4, row5, row6, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row3, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row4, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 3rd, 4th dst
+ row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row2, row3, row4, row5, row6, row0, row1, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row5, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row6, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 5th, 6th dst
+ row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row4, row5, row6, row0, row1, row2, row3, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row0, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row1, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ // 7th, 8th dst
+ row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ row5 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ v_lowpass_u8m1(&dst0, &dst1, row6, row0, row1, row2, row3, row4, row5, vl);
+ dst0 = __riscv_vaaddu_vv_u8m1(dst0, row2, vl);
+ dst1 = __riscv_vaaddu_vv_u8m1(dst1, row3, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, dst1, vl);
+ p_dst_iter += stride;
+
+ p_dst_iter = p_dst_begin + vl;
+ p_src_iter = p_src_begin + vl;
+ len -= vl;
+ }
+}
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_qpel_init_riscv.c b/libavcodec/riscv/h264_qpel_init_riscv.c
new file mode 100644
index 0000000000..582a4a64dd
--- /dev/null
+++ b/libavcodec/riscv/h264_qpel_init_riscv.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/h264qpel.h"
+#include "h264_mc_luma.h"
+
+av_cold void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth)
+{
+#if HAVE_INTRINSICS_RVV
+ const int high_bit_depth = bit_depth > 8;
+ int cpu_flags = av_get_cpu_flags();
+
+ if (!high_bit_depth)
+ {
+ c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][1] = put_h264_qpel16_mc10_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][2] = put_h264_qpel16_mc20_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][3] = put_h264_qpel16_mc30_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][4] = put_h264_qpel16_mc01_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][5] = put_h264_qpel16_mc11_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][6] = put_h264_qpel16_mc21_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][7] = put_h264_qpel16_mc31_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][8] = put_h264_qpel16_mc02_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][9] = put_h264_qpel16_mc12_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][10] = put_h264_qpel16_mc22_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][11] = put_h264_qpel16_mc32_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][12] = put_h264_qpel16_mc03_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][13] = put_h264_qpel16_mc13_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][14] = put_h264_qpel16_mc23_8_rvv;
+ c->put_h264_qpel_pixels_tab[0][15] = put_h264_qpel16_mc33_8_rvv;
+
+ c->put_h264_qpel_pixels_tab[1][0] = put_h264_qpel8_mc00_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][1] = put_h264_qpel8_mc10_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][2] = put_h264_qpel8_mc20_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][3] = put_h264_qpel8_mc30_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][4] = put_h264_qpel8_mc01_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][5] = put_h264_qpel8_mc11_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][6] = put_h264_qpel8_mc21_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][7] = put_h264_qpel8_mc31_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][8] = put_h264_qpel8_mc02_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][9] = put_h264_qpel8_mc12_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][10] = put_h264_qpel8_mc22_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][11] = put_h264_qpel8_mc32_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][12] = put_h264_qpel8_mc03_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][13] = put_h264_qpel8_mc13_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][14] = put_h264_qpel8_mc23_8_rvv;
+ c->put_h264_qpel_pixels_tab[1][15] = put_h264_qpel8_mc33_8_rvv;
+
+ c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][1] = avg_h264_qpel16_mc10_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][2] = avg_h264_qpel16_mc20_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][3] = avg_h264_qpel16_mc30_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][4] = avg_h264_qpel16_mc01_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][5] = avg_h264_qpel16_mc11_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][6] = avg_h264_qpel16_mc21_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][7] = avg_h264_qpel16_mc31_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][8] = avg_h264_qpel16_mc02_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][9] = avg_h264_qpel16_mc12_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][10] = avg_h264_qpel16_mc22_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][11] = avg_h264_qpel16_mc32_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][12] = avg_h264_qpel16_mc03_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][13] = avg_h264_qpel16_mc13_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][14] = avg_h264_qpel16_mc23_8_rvv;
+ c->avg_h264_qpel_pixels_tab[0][15] = avg_h264_qpel16_mc33_8_rvv;
+
+ c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][1] = avg_h264_qpel8_mc10_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][2] = avg_h264_qpel8_mc20_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][3] = avg_h264_qpel8_mc30_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][4] = avg_h264_qpel8_mc01_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][5] = avg_h264_qpel8_mc11_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][6] = avg_h264_qpel8_mc21_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][7] = avg_h264_qpel8_mc31_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][8] = avg_h264_qpel8_mc02_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][9] = avg_h264_qpel8_mc12_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][10] = avg_h264_qpel8_mc22_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][11] = avg_h264_qpel8_mc32_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][12] = avg_h264_qpel8_mc03_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][13] = avg_h264_qpel8_mc13_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][14] = avg_h264_qpel8_mc23_8_rvv;
+ c->avg_h264_qpel_pixels_tab[1][15] = avg_h264_qpel8_mc33_8_rvv;
+ }
+#endif
+}
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_utility.h b/libavcodec/riscv/h264_utility.h
new file mode 100644
index 0000000000..31029a44ae
--- /dev/null
+++ b/libavcodec/riscv/h264_utility.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_UTILITY_H
+#define AVCODEC_RISCV_H264_UTILITY_H
+#include <riscv_vector.h>
+
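+// Narrow two vectors of 32-bit filter sums to 8 bits. This relies on the vector
+// fixed-point rounding mode (vxrm) being round-to-nearest-up, so the first
+// vnclipu computes (x + 512) >> 10 with unsigned saturation (the H.264 2D
+// half-pel rounding) and the second clips the 16-bit result to 8 bits.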
+__attribute__((always_inline)) static void u32_to_u8(vuint8mf2_t *p_u8_1, vuint8mf2_t *p_u8_2,
+ vuint32m2_t i32_1, vuint32m2_t i32_2, int vl)
+{
+ vuint16m1_t u16_1 = __riscv_vnclipu_wx_u16m1(i32_1, 10, vl);
+ vuint16m1_t u16_2 = __riscv_vnclipu_wx_u16m1(i32_2, 10, vl);
+ *p_u8_1 = __riscv_vnclipu_wx_u8mf2(u16_1, 0, vl);
+ *p_u8_2 = __riscv_vnclipu_wx_u8mf2(u16_2, 0, vl);
+}
+
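+// Load two rows from the l2 prediction, rounding-average them (vaaddu) with the
+// two filtered rows, and store the result, advancing both pointers.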
+__attribute__((always_inline)) static void put_average_l2(uint8_t **pp_dst, uint8_t **pp_l2_src,
+ ptrdiff_t dst_stride, ptrdiff_t l2_stride,
+ vuint8mf2_t src_row0, vuint8mf2_t src_row1, int vl)
+{
+ vuint8mf2_t l2_row0 = __riscv_vle8_v_u8mf2(*pp_l2_src, vl);
+ *pp_l2_src += l2_stride;
+ vuint8mf2_t l2_row1 = __riscv_vle8_v_u8mf2(*pp_l2_src, vl);
+ *pp_l2_src += l2_stride;
+
+ src_row0 = __riscv_vaaddu_vv_u8mf2(src_row0, l2_row0, vl);
+ src_row1 = __riscv_vaaddu_vv_u8mf2(src_row1, l2_row1, vl);
+
+ __riscv_vse8_v_u8mf2(*pp_dst, src_row0, vl);
+ *pp_dst += dst_stride;
+ __riscv_vse8_v_u8mf2(*pp_dst, src_row1, vl);
+ *pp_dst += dst_stride;
+}
+
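+// Same as put_average_l2(), but the result is additionally averaged with the
+// existing destination pixels, as required by the avg_* motion compensation
+// variants.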
+__attribute__((always_inline)) static void avg_average_l2(uint8_t **pp_dst, uint8_t **pp_l2_src,
+ ptrdiff_t dst_stride, ptrdiff_t l2_stride,
+ vuint8mf2_t src_row0, vuint8mf2_t src_row1, int vl)
+{
+ vuint8mf2_t l2_row0 = __riscv_vle8_v_u8mf2(*pp_l2_src, vl);
+ *pp_l2_src += l2_stride;
+ vuint8mf2_t l2_row1 = __riscv_vle8_v_u8mf2(*pp_l2_src, vl);
+ *pp_l2_src += l2_stride;
+
+ vuint8mf2_t dst0 = __riscv_vle8_v_u8mf2(*pp_dst, vl);
+ vuint8mf2_t dst1 = __riscv_vle8_v_u8mf2(*pp_dst + dst_stride, vl);
+
+ src_row0 = __riscv_vaaddu_vv_u8mf2(src_row0, l2_row0, vl);
+ src_row1 = __riscv_vaaddu_vv_u8mf2(src_row1, l2_row1, vl);
+
+ src_row0 = __riscv_vaaddu_vv_u8mf2(src_row0, dst0, vl);
+ src_row1 = __riscv_vaaddu_vv_u8mf2(src_row1, dst1, vl);
+
+ __riscv_vse8_v_u8mf2(*pp_dst, src_row0, vl);
+ *pp_dst += dst_stride;
+ __riscv_vse8_v_u8mf2(*pp_dst, src_row1, vl);
+ *pp_dst += dst_stride;
+}
+#endif
\ No newline at end of file
--
2.17.1
* [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: Add vectorized implementation of DSP functions for RISC-V
2023-05-09 9:50 [FFmpeg-devel] [PATCH 0/5] RISC-V: Improve H264 decoding performance using RVV intrinsic Arnie Chang
` (2 preceding siblings ...)
2023-05-09 9:50 ` [FFmpeg-devel] [PATCH 3/5] lavc/h264qpel: Add vectorized implementation of luma " Arnie Chang
@ 2023-05-09 9:50 ` Arnie Chang
2023-05-09 9:50 ` [FFmpeg-devel] [PATCH 5/5] lavc/h264pred: Add vectorized implementation of intra prediction " Arnie Chang
` (2 subsequent siblings)
6 siblings, 0 replies; 11+ messages in thread
From: Arnie Chang @ 2023-05-09 9:50 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Arnie Chang
Optimize IDCT, in-loop filtering, and weighted prediction using RISC-V vector intrinsics.
The performance is evaluated using 720P videos.
Combined with the previous optimizations (chroma and luma MC), the FPS is 2.08x faster than the scalar version,
while applying only the previous optimizations resulted in a speedup of 1.49x.
Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
libavcodec/h264dsp.c | 2 +
libavcodec/h264dsp.h | 3 +-
libavcodec/riscv/Makefile | 4 +
libavcodec/riscv/h264_dsp_init_riscv.c | 68 +++
libavcodec/riscv/h264_idct.c | 482 ++++++++++++++++++
libavcodec/riscv/h264_idct.h | 46 ++
libavcodec/riscv/h264_inloop.c | 669 +++++++++++++++++++++++++
libavcodec/riscv/h264_inloop.h | 47 ++
libavcodec/riscv/h264_weighted_sum.c | 273 ++++++++++
libavcodec/riscv/h264_weighted_sum.h | 47 ++
10 files changed, 1640 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/riscv/h264_dsp_init_riscv.c
create mode 100644 libavcodec/riscv/h264_idct.c
create mode 100644 libavcodec/riscv/h264_idct.h
create mode 100644 libavcodec/riscv/h264_inloop.c
create mode 100644 libavcodec/riscv/h264_inloop.h
create mode 100644 libavcodec/riscv/h264_weighted_sum.c
create mode 100644 libavcodec/riscv/h264_weighted_sum.h
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index 4d2ee10bab..b6e45c15ef 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -164,5 +164,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
ff_h264dsp_init_mips(c, bit_depth, chroma_format_idc);
#elif ARCH_LOONGARCH
ff_h264dsp_init_loongarch(c, bit_depth, chroma_format_idc);
+#elif ARCH_RISCV
+ ff_h264dsp_init_riscv(c, bit_depth, chroma_format_idc);
#endif
}
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index e0880c4d88..f2f8aa7e60 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -131,5 +131,6 @@ void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc);
void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc);
-
+void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth,
+ const int chroma_format_idc);
#endif /* AVCODEC_H264DSP_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 088efa3b1e..4d54bf35e9 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -24,3 +24,7 @@ OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
OBJS-$(CONFIG_H264QPEL) += riscv/h264_qpel_init_riscv.o
RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264_mc_luma.o
+OBJS-$(CONFIG_H264DSP) += riscv/h264_dsp_init_riscv.o
+RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_weighted_sum.o
+RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_inloop.o
+RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_idct.o
diff --git a/libavcodec/riscv/h264_dsp_init_riscv.c b/libavcodec/riscv/h264_dsp_init_riscv.c
new file mode 100644
index 0000000000..7d41aa98a5
--- /dev/null
+++ b/libavcodec/riscv/h264_dsp_init_riscv.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/h264dsp.h"
+#include "config.h"
+#include "h264_inloop.h"
+#include "h264_weighted_sum.h"
+#include "h264_idct.h"
+
+av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
+{
+#if HAVE_INTRINSICS_RVV
+ if (bit_depth == 8) {
+ c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_8_rvv;
+ c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_8_rvv;
+
+ c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_8_rvv;
+ c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_8_rvv;
+
+ c->h264_v_loop_filter_chroma = h264_v_loop_filter_chroma_8_rvv;
+ c->h264_v_loop_filter_chroma_intra = h264_v_loop_filter_chroma_intra_8_rvv;
+
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = h264_h_loop_filter_chroma_8_rvv;
+ c->h264_h_loop_filter_chroma_intra = h264_h_loop_filter_chroma_intra_8_rvv;
+ c->h264_h_loop_filter_chroma_mbaff_intra = h264_h_loop_filter_chroma_mbaff_intra_8_rvv;
+ }
+
+ c->weight_h264_pixels_tab[0] = weight_h264_pixels_16_8_rvv;
+ c->weight_h264_pixels_tab[1] = weight_h264_pixels_8_8_rvv;
+ c->weight_h264_pixels_tab[2] = weight_h264_pixels_4_8_rvv;
+
+ c->biweight_h264_pixels_tab[0]= biweight_h264_pixels_16_8_rvv;
+ c->biweight_h264_pixels_tab[1]= biweight_h264_pixels_8_8_rvv;
+ c->biweight_h264_pixels_tab[2]= biweight_h264_pixels_4_8_rvv;
+
+ c->h264_idct_add = h264_idct_add_8_rvv;
+ c->h264_idct_dc_add = h264_idct_dc_add_8_rvv;
+ c->h264_idct_add16 = h264_idct_add16_8_rvv;
+ c->h264_idct_add16intra = h264_idct_add16_intra_8_rvv;
+ if (chroma_format_idc <= 1)
+ c->h264_idct_add8 = h264_idct_add8_8_rvv;
+ c->h264_idct8_add = h264_idct8_add_8_rvv;
+ c->h264_idct8_dc_add = h264_idct8_dc_add_8_rvv;
+ c->h264_idct8_add4 = h264_idct8_add4_8_rvv;
+ }
+#endif
+}
diff --git a/libavcodec/riscv/h264_idct.c b/libavcodec/riscv/h264_idct.c
new file mode 100644
index 0000000000..3ef6b74421
--- /dev/null
+++ b/libavcodec/riscv/h264_idct.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264_idct.h"
+#if HAVE_INTRINSICS_RVV
+#include <riscv_vector.h>
+
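+// 4x4 block index -> nnzc cache position, same layout as scan8[] in h264dec.h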
+static const uint8_t scan8[16 * 3 + 3] =
+{
+ 4 + 1 * 8, 5 + 1 * 8, 4 + 2 * 8, 5 + 2 * 8,
+ 6 + 1 * 8, 7 + 1 * 8, 6 + 2 * 8, 7 + 2 * 8,
+ 4 + 3 * 8, 5 + 3 * 8, 4 + 4 * 8, 5 + 4 * 8,
+ 6 + 3 * 8, 7 + 3 * 8, 6 + 4 * 8, 7 + 4 * 8,
+ 4 + 6 * 8, 5 + 6 * 8, 4 + 7 * 8, 5 + 7 * 8,
+ 6 + 6 * 8, 7 + 6 * 8, 6 + 7 * 8, 7 + 7 * 8,
+ 4 + 8 * 8, 5 + 8 * 8, 4 + 9 * 8, 5 + 9 * 8,
+ 6 + 8 * 8, 7 + 8 * 8, 6 + 9 * 8, 7 + 9 * 8,
+ 4 + 11 * 8, 5 + 11 * 8, 4 + 12 * 8, 5 + 12 * 8,
+ 6 + 11 * 8, 7 + 11 * 8, 6 + 12 * 8, 7 + 12 * 8,
+ 4 + 13 * 8, 5 + 13 * 8, 4 + 14 * 8, 5 + 14 * 8,
+ 6 + 13 * 8, 7 + 13 * 8, 6 + 14 * 8, 7 + 14 * 8,
+ 0 + 0 * 8, 0 + 5 * 8, 0 + 10 * 8
+};
+
+void h264_idct_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride)
+{
+ int16_t temp[16];
+ int vl = __riscv_vsetvl_e16m1(4);
+
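+    // bake the rounding bias into the DC term so the final >> 6 rounds to nearest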
+ p_block[0] += 32;
+
+ vint16m1_t row0 = __riscv_vle16_v_i16m1(p_block, vl);
+ vint16m1_t row1 = __riscv_vle16_v_i16m1(p_block + 4, vl);
+ vint16m1_t row2 = __riscv_vle16_v_i16m1(p_block + 8, vl);
+ vint16m1_t row3 = __riscv_vle16_v_i16m1(p_block + 12, vl);
+
+ // 1-D row idct
+ vint16m1_t z0 = __riscv_vadd_vv_i16m1(row0, row2, vl);
+ vint16m1_t z1 = __riscv_vsub_vv_i16m1(row0, row2, vl);
+ vint16m1_t z2 = __riscv_vsra_vx_i16m1(row1, 1, vl);
+ z2 = __riscv_vsub_vv_i16m1(z2, row3, vl);
+ vint16m1_t z3 = __riscv_vsra_vx_i16m1(row3, 1, vl);
+ z3 = __riscv_vadd_vv_i16m1(z3, row1, vl);
+
+ vint16m1_t result0 = __riscv_vadd_vv_i16m1(z0, z3, vl);
+ vint16m1_t result1 = __riscv_vadd_vv_i16m1(z1, z2, vl);
+ vint16m1_t result2 = __riscv_vsub_vv_i16m1(z1, z2, vl);
+ vint16m1_t result3 = __riscv_vsub_vv_i16m1(z0, z3, vl);
+
+ // transpose
+ __riscv_vse16_v_i16m1(&temp[0], result0, vl);
+ __riscv_vse16_v_i16m1(&temp[4], result1, vl);
+ __riscv_vse16_v_i16m1(&temp[8], result2, vl);
+ __riscv_vse16_v_i16m1(&temp[12], result3, vl);
+ __riscv_vlseg4e16_v_i16m1(&row0, &row1, &row2, &row3, &temp[0], vl);
+
+ // 1-D column idct
+ z0 = __riscv_vadd_vv_i16m1(row0, row2, vl);
+ z1 = __riscv_vsub_vv_i16m1(row0, row2, vl);
+ z2 = __riscv_vsra_vx_i16m1(row1, 1, vl);
+ z2 = __riscv_vsub_vv_i16m1(z2, row3, vl);
+ z3 = __riscv_vsra_vx_i16m1(row3, 1, vl);
+ z3 = __riscv_vadd_vv_i16m1(z3, row1, vl);
+
+ result0 = __riscv_vadd_vv_i16m1(z0, z3, vl);
+ result1 = __riscv_vadd_vv_i16m1(z1, z2, vl);
+ result2 = __riscv_vsub_vv_i16m1(z1, z2, vl);
+ result3 = __riscv_vsub_vv_i16m1(z0, z3, vl);
+
+ result0 = __riscv_vsra_vx_i16m1(result0, 6, vl);
+ result1 = __riscv_vsra_vx_i16m1(result1, 6, vl);
+ result2 = __riscv_vsra_vx_i16m1(result2, 6, vl);
+ result3 = __riscv_vsra_vx_i16m1(result3, 6, vl);
+
+ vuint8mf2_t dst0 = __riscv_vle8_v_u8mf2(p_dst, vl);
+ vuint8mf2_t dst1 = __riscv_vle8_v_u8mf2(p_dst + stride, vl);
+ vuint8mf2_t dst2 = __riscv_vle8_v_u8mf2(p_dst + stride * 2, vl);
+ vuint8mf2_t dst3 = __riscv_vle8_v_u8mf2(p_dst + stride * 3, vl);
+
+ vint16m1_t dst0_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst0, vl));
+ vint16m1_t dst1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst1, vl));
+ vint16m1_t dst2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst2, vl));
+ vint16m1_t dst3_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst3, vl));
+
+ result0 = __riscv_vadd_vv_i16m1(result0, dst0_w, vl);
+ result1 = __riscv_vadd_vv_i16m1(result1, dst1_w, vl);
+ result2 = __riscv_vadd_vv_i16m1(result2, dst2_w, vl);
+ result3 = __riscv_vadd_vv_i16m1(result3, dst3_w, vl);
+
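+    // clamp to [0, 255]: vmax drops negative values, vnclipu saturates the high side while narrowing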
+ result0 = __riscv_vmax_vx_i16m1(result0, 0, vl);
+ result1 = __riscv_vmax_vx_i16m1(result1, 0, vl);
+ result2 = __riscv_vmax_vx_i16m1(result2, 0, vl);
+ result3 = __riscv_vmax_vx_i16m1(result3, 0, vl);
+
+ vuint8mf2_t result0_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result0), 0, vl);
+ vuint8mf2_t result1_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result1), 0, vl);
+ vuint8mf2_t result2_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result2), 0, vl);
+ vuint8mf2_t result3_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result3), 0, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst, result0_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride, result1_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride * 2, result2_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride * 3, result3_n, vl);
+
+ memset(p_block, 0, sizeof(int16_t) * 16);
+}
+
+void h264_idct_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride)
+{
+ int vl = __riscv_vsetvl_e16m1(4);
+
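+    // only the DC coefficient is present: add the clipped offset ((block[0] + 32) >> 6) to all pixels
+    // of the 4x4 block with saturating add/sub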
+ int dc = (p_block[0] + 32) >> 6;
+
+ if (dc > 255)
+ dc = 255;
+
+ if (dc < -255)
+ dc = -255;
+
+ p_block[0] = 0;
+
+ vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst, vl);
+ vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst + stride, vl);
+ vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst + stride * 2, vl);
+ vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst + stride * 3, vl);
+
+ if (dc >= 0)
+ {
+ dst0 = __riscv_vsaddu_vx_u8m1(dst0, dc, vl);
+ dst1 = __riscv_vsaddu_vx_u8m1(dst1, dc, vl);
+ dst2 = __riscv_vsaddu_vx_u8m1(dst2, dc, vl);
+ dst3 = __riscv_vsaddu_vx_u8m1(dst3, dc, vl);
+ }
+ else
+ {
+ dst0 = __riscv_vssubu_vx_u8m1(dst0, -dc, vl);
+ dst1 = __riscv_vssubu_vx_u8m1(dst1, -dc, vl);
+ dst2 = __riscv_vssubu_vx_u8m1(dst2, -dc, vl);
+ dst3 = __riscv_vssubu_vx_u8m1(dst3, -dc, vl);
+ }
+
+ __riscv_vse8_v_u8m1(p_dst, dst0, vl);
+ __riscv_vse8_v_u8m1(p_dst + stride, dst1, vl);
+ __riscv_vse8_v_u8m1(p_dst + stride * 2, dst2, vl);
+ __riscv_vse8_v_u8m1(p_dst + stride * 3, dst3, vl);
+}
+
+void h264_idct_add16_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride,
+ const uint8_t nnzc[5 * 8])
+{
+ for(int i = 0; i < 16; i++)
+ {
+ int nnz = nnzc[scan8[i]];
+
+ if(nnz)
+ {
+ if(nnz==1 && p_block[i*16])
+ h264_idct_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride);
+ else
+ h264_idct_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride);
+ }
+ }
+}
+
+void h264_idct_add16_intra_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride,
+ const uint8_t nnzc[5 * 8])
+{
+ for(int i = 0; i < 16; i++)
+ {
+ if(nnzc[scan8[i]])
+ h264_idct_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride);
+ else if(p_block[i*16])
+ h264_idct_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride);
+ }
+}
+
+void h264_idct_add8_8_rvv(uint8_t **p_dst, const int *p_block_offset, int16_t *p_block, int stride,
+ const uint8_t nnzc[15*8])
+{
+ for(int j = 1; j < 3; j++)
+ {
+ for(int i = j * 16; i < j * 16 + 4; i++)
+ {
+ if(nnzc[scan8[i]])
+ h264_idct_add_8_rvv(p_dst[j - 1] + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride);
+ else if(p_block[i * 16])
+ h264_idct_dc_add_8_rvv(p_dst[j - 1] + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride);
+ }
+ }
+}
+
+void h264_idct8_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride)
+{
+ int16_t temp[64];
+ int vl = __riscv_vsetvl_e16m1(8);
+
+ p_block[0] += 32;
+
+ vint16m1_t row0 = __riscv_vle16_v_i16m1(p_block, vl);
+ vint16m1_t row1 = __riscv_vle16_v_i16m1(p_block + 8, vl);
+ vint16m1_t row2 = __riscv_vle16_v_i16m1(p_block + 16, vl);
+ vint16m1_t row3 = __riscv_vle16_v_i16m1(p_block + 24, vl);
+ vint16m1_t row4 = __riscv_vle16_v_i16m1(p_block + 32, vl);
+ vint16m1_t row5 = __riscv_vle16_v_i16m1(p_block + 40, vl);
+ vint16m1_t row6 = __riscv_vle16_v_i16m1(p_block + 48, vl);
+ vint16m1_t row7 = __riscv_vle16_v_i16m1(p_block + 56, vl);
+
+ // 1-D row idct
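+    // even part (rows 0, 2, 4, 6) feeds b0/b2/b4/b6; odd part (rows 1, 3, 5, 7) feeds b1/b3/b5/b7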
+ vint16m1_t a0 = __riscv_vadd_vv_i16m1(row0, row4, vl);
+ vint16m1_t a2 = __riscv_vsub_vv_i16m1(row0, row4, vl);
+ vint16m1_t a4 = __riscv_vsra_vx_i16m1(row2, 1, vl);
+ a4 = __riscv_vsub_vv_i16m1(a4, row6, vl);
+ vint16m1_t a6 = __riscv_vsra_vx_i16m1(row6, 1, vl);
+ a6 = __riscv_vadd_vv_i16m1(row2, a6, vl);
+
+ vint16m1_t b0 = __riscv_vadd_vv_i16m1(a0, a6, vl);
+ vint16m1_t b2 = __riscv_vadd_vv_i16m1(a2, a4, vl);
+ vint16m1_t b4 = __riscv_vsub_vv_i16m1(a2, a4, vl);
+ vint16m1_t b6 = __riscv_vsub_vv_i16m1(a0, a6, vl);
+
+ vint16m1_t a1 = __riscv_vsra_vx_i16m1(row7, 1, vl);
+ a1 = __riscv_vsub_vv_i16m1(row5, a1, vl);
+ a1 = __riscv_vsub_vv_i16m1(a1, row3, vl);
+ a1 = __riscv_vsub_vv_i16m1(a1, row7, vl);
+ vint16m1_t a3 = __riscv_vsra_vx_i16m1(row3, 1, vl);
+ a3 = __riscv_vsub_vv_i16m1(row7, a3, vl);
+ a3 = __riscv_vadd_vv_i16m1(a3, row1, vl);
+ a3 = __riscv_vsub_vv_i16m1(a3, row3, vl);
+ vint16m1_t a5 = __riscv_vsra_vx_i16m1(row5, 1, vl);
+ a5 = __riscv_vsub_vv_i16m1(a5, row1, vl);
+ a5 = __riscv_vadd_vv_i16m1(a5, row7, vl);
+ a5 = __riscv_vadd_vv_i16m1(a5, row5, vl);
+ vint16m1_t a7 = __riscv_vsra_vx_i16m1(row1, 1, vl);
+ a7 = __riscv_vadd_vv_i16m1(a7, row3, vl);
+ a7 = __riscv_vadd_vv_i16m1(a7, row5, vl);
+ a7 = __riscv_vadd_vv_i16m1(a7, row1, vl);
+
+ vint16m1_t b1 = __riscv_vsra_vx_i16m1(a7, 2, vl);
+ b1 = __riscv_vadd_vv_i16m1(b1, a1, vl);
+ vint16m1_t b3 = __riscv_vsra_vx_i16m1(a5, 2, vl);
+ b3 = __riscv_vadd_vv_i16m1(b3, a3, vl);
+ vint16m1_t b5 = __riscv_vsra_vx_i16m1(a3, 2, vl);
+ b5 = __riscv_vsub_vv_i16m1(b5, a5, vl);
+ vint16m1_t b7 = __riscv_vsra_vx_i16m1(a1, 2, vl);
+ b7 = __riscv_vsub_vv_i16m1(a7, b7, vl);
+
+ vint16m1_t result0 = __riscv_vadd_vv_i16m1(b0, b7, vl);
+ vint16m1_t result7 = __riscv_vsub_vv_i16m1(b0, b7, vl);
+ vint16m1_t result1 = __riscv_vadd_vv_i16m1(b2, b5, vl);
+ vint16m1_t result6 = __riscv_vsub_vv_i16m1(b2, b5, vl);
+ vint16m1_t result2 = __riscv_vadd_vv_i16m1(b4, b3, vl);
+ vint16m1_t result5 = __riscv_vsub_vv_i16m1(b4, b3, vl);
+ vint16m1_t result3 = __riscv_vadd_vv_i16m1(b6, b1, vl);
+ vint16m1_t result4 = __riscv_vsub_vv_i16m1(b6, b1, vl);
+
+ // transpose
+ __riscv_vse16_v_i16m1(&temp[0], result0, vl);
+ __riscv_vse16_v_i16m1(&temp[8], result1, vl);
+ __riscv_vse16_v_i16m1(&temp[16], result2, vl);
+ __riscv_vse16_v_i16m1(&temp[24], result3, vl);
+ __riscv_vse16_v_i16m1(&temp[32], result4, vl);
+ __riscv_vse16_v_i16m1(&temp[40], result5, vl);
+ __riscv_vse16_v_i16m1(&temp[48], result6, vl);
+ __riscv_vse16_v_i16m1(&temp[56], result7, vl);
+
+ __riscv_vlseg8e16_v_i16m1(&row0, &row1, &row2, &row3, &row4, &row5, &row6, &row7, &temp[0], vl);
+
+ // 1-D column idct
+ a0 = __riscv_vadd_vv_i16m1(row0, row4, vl);
+ a2 = __riscv_vsub_vv_i16m1(row0, row4, vl);
+ a4 = __riscv_vsra_vx_i16m1(row2, 1, vl);
+ a4 = __riscv_vsub_vv_i16m1(a4, row6, vl);
+ a6 = __riscv_vsra_vx_i16m1(row6, 1, vl);
+ a6 = __riscv_vadd_vv_i16m1(row2, a6, vl);
+
+ b0 = __riscv_vadd_vv_i16m1(a0, a6, vl);
+ b2 = __riscv_vadd_vv_i16m1(a2, a4, vl);
+ b4 = __riscv_vsub_vv_i16m1(a2, a4, vl);
+ b6 = __riscv_vsub_vv_i16m1(a0, a6, vl);
+
+ a1 = __riscv_vsra_vx_i16m1(row7, 1, vl);
+ a1 = __riscv_vsub_vv_i16m1(row5, a1, vl);
+ a1 = __riscv_vsub_vv_i16m1(a1, row3, vl);
+ a1 = __riscv_vsub_vv_i16m1(a1, row7, vl);
+ a3 = __riscv_vsra_vx_i16m1(row3, 1, vl);
+ a3 = __riscv_vsub_vv_i16m1(row7, a3, vl);
+ a3 = __riscv_vadd_vv_i16m1(a3, row1, vl);
+ a3 = __riscv_vsub_vv_i16m1(a3, row3, vl);
+ a5 = __riscv_vsra_vx_i16m1(row5, 1, vl);
+ a5 = __riscv_vsub_vv_i16m1(a5, row1, vl);
+ a5 = __riscv_vadd_vv_i16m1(a5, row7, vl);
+ a5 = __riscv_vadd_vv_i16m1(a5, row5, vl);
+ a7 = __riscv_vsra_vx_i16m1(row1, 1, vl);
+ a7 = __riscv_vadd_vv_i16m1(a7, row3, vl);
+ a7 = __riscv_vadd_vv_i16m1(a7, row5, vl);
+ a7 = __riscv_vadd_vv_i16m1(a7, row1, vl);
+
+ b1 = __riscv_vsra_vx_i16m1(a7, 2, vl);
+ b1 = __riscv_vadd_vv_i16m1(b1, a1, vl);
+ b3 = __riscv_vsra_vx_i16m1(a5, 2, vl);
+ b3 = __riscv_vadd_vv_i16m1(b3, a3, vl);
+ b5 = __riscv_vsra_vx_i16m1(a3, 2, vl);
+ b5 = __riscv_vsub_vv_i16m1(b5, a5, vl);
+ b7 = __riscv_vsra_vx_i16m1(a1, 2, vl);
+ b7 = __riscv_vsub_vv_i16m1(a7, b7, vl);
+
+ result0 = __riscv_vadd_vv_i16m1(b0, b7, vl);
+ result1 = __riscv_vadd_vv_i16m1(b2, b5, vl);
+ result2 = __riscv_vadd_vv_i16m1(b4, b3, vl);
+ result3 = __riscv_vadd_vv_i16m1(b6, b1, vl);
+ result4 = __riscv_vsub_vv_i16m1(b6, b1, vl);
+ result5 = __riscv_vsub_vv_i16m1(b4, b3, vl);
+ result6 = __riscv_vsub_vv_i16m1(b2, b5, vl);
+ result7 = __riscv_vsub_vv_i16m1(b0, b7, vl);
+
+ // normalize and write to destination
+ result0 = __riscv_vsra_vx_i16m1(result0, 6, vl);
+ result1 = __riscv_vsra_vx_i16m1(result1, 6, vl);
+ result2 = __riscv_vsra_vx_i16m1(result2, 6, vl);
+ result3 = __riscv_vsra_vx_i16m1(result3, 6, vl);
+ result4 = __riscv_vsra_vx_i16m1(result4, 6, vl);
+ result5 = __riscv_vsra_vx_i16m1(result5, 6, vl);
+ result6 = __riscv_vsra_vx_i16m1(result6, 6, vl);
+ result7 = __riscv_vsra_vx_i16m1(result7, 6, vl);
+
+ vuint8mf2_t dst0 = __riscv_vle8_v_u8mf2(p_dst, vl);
+ vuint8mf2_t dst1 = __riscv_vle8_v_u8mf2(p_dst + stride, vl);
+ vuint8mf2_t dst2 = __riscv_vle8_v_u8mf2(p_dst + stride * 2, vl);
+ vuint8mf2_t dst3 = __riscv_vle8_v_u8mf2(p_dst + stride * 3, vl);
+ vuint8mf2_t dst4 = __riscv_vle8_v_u8mf2(p_dst + stride * 4, vl);
+ vuint8mf2_t dst5 = __riscv_vle8_v_u8mf2(p_dst + stride * 5, vl);
+ vuint8mf2_t dst6 = __riscv_vle8_v_u8mf2(p_dst + stride * 6, vl);
+ vuint8mf2_t dst7 = __riscv_vle8_v_u8mf2(p_dst + stride * 7, vl);
+
+ vint16m1_t dst0_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst0, vl));
+ vint16m1_t dst1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst1, vl));
+ vint16m1_t dst2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst2, vl));
+ vint16m1_t dst3_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst3, vl));
+ vint16m1_t dst4_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst4, vl));
+ vint16m1_t dst5_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst5, vl));
+ vint16m1_t dst6_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst6, vl));
+ vint16m1_t dst7_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst7, vl));
+
+ result0 = __riscv_vadd_vv_i16m1(result0, dst0_w, vl);
+ result1 = __riscv_vadd_vv_i16m1(result1, dst1_w, vl);
+ result2 = __riscv_vadd_vv_i16m1(result2, dst2_w, vl);
+ result3 = __riscv_vadd_vv_i16m1(result3, dst3_w, vl);
+ result4 = __riscv_vadd_vv_i16m1(result4, dst4_w, vl);
+ result5 = __riscv_vadd_vv_i16m1(result5, dst5_w, vl);
+ result6 = __riscv_vadd_vv_i16m1(result6, dst6_w, vl);
+ result7 = __riscv_vadd_vv_i16m1(result7, dst7_w, vl);
+
+ result0 = __riscv_vmax_vx_i16m1(result0, 0, vl);
+ result1 = __riscv_vmax_vx_i16m1(result1, 0, vl);
+ result2 = __riscv_vmax_vx_i16m1(result2, 0, vl);
+ result3 = __riscv_vmax_vx_i16m1(result3, 0, vl);
+ result4 = __riscv_vmax_vx_i16m1(result4, 0, vl);
+ result5 = __riscv_vmax_vx_i16m1(result5, 0, vl);
+ result6 = __riscv_vmax_vx_i16m1(result6, 0, vl);
+ result7 = __riscv_vmax_vx_i16m1(result7, 0, vl);
+
+ vuint8mf2_t result0_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result0), 0, vl);
+ vuint8mf2_t result1_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result1), 0, vl);
+ vuint8mf2_t result2_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result2), 0, vl);
+ vuint8mf2_t result3_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result3), 0, vl);
+ vuint8mf2_t result4_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result4), 0, vl);
+ vuint8mf2_t result5_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result5), 0, vl);
+ vuint8mf2_t result6_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result6), 0, vl);
+ vuint8mf2_t result7_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result7), 0, vl);
+
+ __riscv_vse8_v_u8mf2(p_dst, result0_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride, result1_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride * 2, result2_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride * 3, result3_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride * 4, result4_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride * 5, result5_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride * 6, result6_n, vl);
+ __riscv_vse8_v_u8mf2(p_dst + stride * 7, result7_n, vl);
+
+ memset(p_block, 0, sizeof(int16_t) * 64);
+}
+
+void h264_idct8_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride)
+{
+ int count = 8;
+ uint8_t *p_dst_iter = p_dst;
+
+ int dc = (p_block[0] + 32) >> 6;
+
+ if (dc > 255)
+ dc = 255;
+
+ if (dc < -255)
+ dc = -255;
+
+ p_block[0] = 0;
+
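+    // process the 8x8 block in vl-wide column strips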
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e16m1(8);
+
+ vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+ vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 2, vl);
+ vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 3, vl);
+ vuint8m1_t dst4 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 4, vl);
+ vuint8m1_t dst5 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 5, vl);
+ vuint8m1_t dst6 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 6, vl);
+ vuint8m1_t dst7 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 7, vl);
+
+ if (dc >= 0)
+ {
+ dst0 = __riscv_vsaddu_vx_u8m1(dst0, dc, vl);
+ dst1 = __riscv_vsaddu_vx_u8m1(dst1, dc, vl);
+ dst2 = __riscv_vsaddu_vx_u8m1(dst2, dc, vl);
+ dst3 = __riscv_vsaddu_vx_u8m1(dst3, dc, vl);
+ dst4 = __riscv_vsaddu_vx_u8m1(dst4, dc, vl);
+ dst5 = __riscv_vsaddu_vx_u8m1(dst5, dc, vl);
+ dst6 = __riscv_vsaddu_vx_u8m1(dst6, dc, vl);
+ dst7 = __riscv_vsaddu_vx_u8m1(dst7, dc, vl);
+ }
+ else
+ {
+ dst0 = __riscv_vssubu_vx_u8m1(dst0, -dc, vl);
+ dst1 = __riscv_vssubu_vx_u8m1(dst1, -dc, vl);
+ dst2 = __riscv_vssubu_vx_u8m1(dst2, -dc, vl);
+ dst3 = __riscv_vssubu_vx_u8m1(dst3, -dc, vl);
+ dst4 = __riscv_vssubu_vx_u8m1(dst4, -dc, vl);
+ dst5 = __riscv_vssubu_vx_u8m1(dst5, -dc, vl);
+ dst6 = __riscv_vssubu_vx_u8m1(dst6, -dc, vl);
+ dst7 = __riscv_vssubu_vx_u8m1(dst7, -dc, vl);
+ }
+
+ __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter + stride, dst1, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter + stride * 2, dst2, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter + stride * 3, dst3, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter + stride * 4, dst4, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter + stride * 5, dst5, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter + stride * 6, dst6, vl);
+ __riscv_vse8_v_u8m1(p_dst_iter + stride * 7, dst7, vl);
+
+ count -= vl;
+ p_dst_iter += vl;
+ }
+}
+
+void h264_idct8_add4_8_rvv(uint8_t *p_dst, const int *p_block_offset,
+ int16_t *p_block, int stride, const uint8_t nnzc[5 * 8])
+{
+ for(int i = 0; i < 16; i += 4)
+ {
+ int nnz = nnzc[scan8[i]];
+
+ if(nnz)
+ {
+ if(nnz == 1 && p_block[i * 16])
+ h264_idct8_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride);
+ else
+ h264_idct8_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride);
+ }
+ }
+}
+#endif
+
diff --git a/libavcodec/riscv/h264_idct.h b/libavcodec/riscv/h264_idct.h
new file mode 100644
index 0000000000..4b942c35f7
--- /dev/null
+++ b/libavcodec/riscv/h264_idct.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_IDCT_H
+#define AVCODEC_RISCV_H264_IDCT_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
+void h264_idct_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
+void h264_idct_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride);
+void h264_idct_add16_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride,
+ const uint8_t nnzc[5 * 8]);
+void h264_idct_add16_intra_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride,
+ const uint8_t nnzc[5 * 8]);
+void h264_idct_add8_8_rvv(uint8_t **p_dst, const int *p_block_offset, int16_t *p_block, int stride,
+ const uint8_t nnzc[15*8]);
+void h264_idct8_add_8_rvv(uint8_t *_dst, int16_t *_block, int stride);
+void h264_idct8_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride);
+void h264_idct8_add4_8_rvv(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride, const uint8_t nnzc[5 * 8]);
+#endif
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_inloop.c b/libavcodec/riscv/h264_inloop.c
new file mode 100644
index 0000000000..d14cf4dd7a
--- /dev/null
+++ b/libavcodec/riscv/h264_inloop.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264_inloop.h"
+#if HAVE_INTRINSICS_RVV
+#include <riscv_vector.h>
+
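+// replicate each of the four tc0 values over its 2-pixel chroma group along the edge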
+__attribute__((always_inline)) static void extend_tc0_2(vint8mf2_t *p_tc0_i8, int8_t *p_tc0, size_t start, int vl)
+{
+ if (p_tc0[0] == p_tc0[1] && p_tc0[1] == p_tc0[2] && p_tc0[2] == p_tc0[3])
+ {
+ *p_tc0_i8 = __riscv_vmv_v_x_i8mf2(p_tc0[0], vl);
+ }
+ else
+ {
+ const uint8_t tc02_index[] = {0, 0, 1, 1, 2, 2, 3, 3};
+ vint8mf2_t tc8 = __riscv_vle8_v_i8mf2(p_tc0, 4);
+ vuint8mf2_t v_index = __riscv_vle8_v_u8mf2(tc02_index + start, vl);
+ *p_tc0_i8 = __riscv_vrgather_vv_i8mf2(tc8, v_index, vl);
+ }
+}
+
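+// replicate each of the four tc0 values over its 4-pixel luma group along the edge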
+__attribute__((always_inline)) static void extend_tc0(vint8mf2_t *p_tc0_i8, int8_t *p_tc0, size_t start, int vl)
+{
+ if (p_tc0[0] == p_tc0[1] && p_tc0[1] == p_tc0[2] && p_tc0[2] == p_tc0[3])
+ {
+ *p_tc0_i8 = __riscv_vmv_v_x_i8mf2(p_tc0[0], vl);
+ }
+ else
+ {
+ const uint8_t tc01_index[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
+ vint8mf2_t tc8 = __riscv_vle8_v_i8mf2(p_tc0, 4);
+ vuint8mf2_t v_index = __riscv_vle8_v_u8mf2(tc01_index + start, vl);
+ *p_tc0_i8 = __riscv_vrgather_vv_i8mf2(tc8, v_index, vl);
+ }
+}
+
+__attribute__((always_inline)) static void luma_core(vuint8mf2_t *p_p1_dst, vuint8mf2_t *p_p0_dst,
+ vuint8mf2_t *p_q0_dst, vuint8mf2_t *p_q1_dst,
+ vuint8mf2_t p2, vuint8mf2_t p1, vuint8mf2_t p0,
+ vuint8mf2_t q0, vuint8mf2_t q1, vuint8mf2_t q2,
+ vint8mf2_t tc8, int alpha, int beta, int vl)
+{
+ vint16m1_t p2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p2, 0, vl));
+ vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl));
+ vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl));
+ vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl));
+ vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl));
+ vint16m1_t q2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q2, 0, vl));
+
+ vint16m1_t sub_q0_p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl);
+ vint16m1_t sub_p1_p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl);
+ vint16m1_t sub_q1_q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl);
+ vint16m1_t sub_p2_p0 = __riscv_vsub_vv_i16m1(p2_i16, p0_i16, vl);
+ vint16m1_t sub_q2_q0 = __riscv_vsub_vv_i16m1(q2_i16, q0_i16, vl);
+
+ vint16m1_t minus_sub_q0_p0 = __riscv_vrsub_vx_i16m1(sub_q0_p0, 0, vl);
+ vint16m1_t minus_sub_p1_p0 = __riscv_vrsub_vx_i16m1(sub_p1_p0, 0, vl);
+ vint16m1_t minus_sub_q1_q0 = __riscv_vrsub_vx_i16m1(sub_q1_q0, 0, vl);
+ vint16m1_t minus_sub_p2_p0 = __riscv_vrsub_vx_i16m1(sub_p2_p0, 0, vl);
+ vint16m1_t minus_sub_q2_q0 = __riscv_vrsub_vx_i16m1(sub_q2_q0, 0, vl);
+
+ vint16m1_t abs_diff11 = __riscv_vmax_vv_i16m1(sub_q0_p0, minus_sub_q0_p0, vl);
+ vint16m1_t abs_diff12 = __riscv_vmax_vv_i16m1(sub_p1_p0, minus_sub_p1_p0, vl);
+ vint16m1_t abs_diff13 = __riscv_vmax_vv_i16m1(sub_q1_q0, minus_sub_q1_q0, vl);
+ vint16m1_t abs_diff2 = __riscv_vmax_vv_i16m1(sub_p2_p0, minus_sub_p2_p0, vl);
+ vint16m1_t abs_diff3 = __riscv_vmax_vv_i16m1(sub_q2_q0, minus_sub_q2_q0, vl);
+
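+    // the filter applies only where tc0 >= 0, |q0-p0| < alpha, |p1-p0| < beta and |q1-q0| < beta;
+    // |p2-p0| < beta and |q2-q0| < beta additionally enable the p1 and q1 updates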
+ vint16m1_t tc = __riscv_vwadd_vx_i16m1(tc8, 0, vl);
+ vbool16_t cond_mask = __riscv_vmsge_vx_i16m1_b16(tc, 0, vl);
+ vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abs_diff11, alpha, vl);
+ vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16(abs_diff12, beta, vl);
+ vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16(abs_diff13, beta, vl);
+ vbool16_t cond2 = __riscv_vmslt_vx_i16m1_b16(abs_diff2, beta, vl);
+ vbool16_t cond3 = __riscv_vmslt_vx_i16m1_b16(abs_diff3, beta, vl);
+
+ vbool16_t cond1 = __riscv_vmand_mm_b16(cond11, cond_mask, vl);
+ cond1 = __riscv_vmand_mm_b16(cond1, cond12, vl);
+ cond1 = __riscv_vmand_mm_b16(cond1, cond13, vl);
+ cond2 = __riscv_vmand_mm_b16(cond2, cond1, vl);
+ cond3 = __riscv_vmand_mm_b16(cond3, cond1, vl);
+
+ // p1
+ vint16m1_t sum_p0_q0 = __riscv_vaadd_vv_i16m1(p0_i16, q0_i16, vl);
+ vint16m1_t p1_new_i16 = __riscv_vadd_vv_i16m1(sum_p0_q0, p2_i16, vl);
+ p1_new_i16 = __riscv_vsra_vx_i16m1(p1_new_i16, 1, vl);
+ vint16m1_t p1_new_upper = __riscv_vadd_vv_i16m1(p1_i16, tc, vl);
+ vint16m1_t p1_new_lower = __riscv_vsub_vv_i16m1(p1_i16, tc, vl);
+ p1_new_i16 = __riscv_vmax_vv_i16m1(p1_new_i16, p1_new_lower, vl);
+ p1_new_i16 = __riscv_vmin_vv_i16m1(p1_new_i16, p1_new_upper, vl);
+ *p_p1_dst = __riscv_vncvt_x_x_w_u8mf2_mu(cond2, p1, __riscv_vreinterpret_v_i16m1_u16m1(p1_new_i16), vl);
+ vint16m1_t tc_adjust = __riscv_vadc_vxm_i16m1(tc, 0, cond2, vl);
+
+ // q1
+ vint16m1_t q1_new_i16 = __riscv_vadd_vv_i16m1(sum_p0_q0, q2_i16, vl);
+ q1_new_i16 = __riscv_vsra_vx_i16m1(q1_new_i16, 1, vl);
+ vint16m1_t q1_new_upper = __riscv_vadd_vv_i16m1(q1_i16, tc, vl);
+ vint16m1_t q1_new_lower = __riscv_vsub_vv_i16m1(q1_i16, tc, vl);
+ q1_new_i16 = __riscv_vmax_vv_i16m1(q1_new_i16, q1_new_lower, vl);
+ q1_new_i16 = __riscv_vmin_vv_i16m1(q1_new_i16, q1_new_upper, vl);
+ *p_q1_dst = __riscv_vncvt_x_x_w_u8mf2_mu(cond3, q1, __riscv_vreinterpret_v_i16m1_u16m1(q1_new_i16), vl);
+ tc_adjust = __riscv_vadc_vxm_i16m1(tc_adjust, 0, cond3, vl);
+
+ // p0, q0
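+    // delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc), where tc is incremented
+    // by one for each enabled p1/q1 update; vssra supplies the rounding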
+ vint16m1_t sub_p1_q1 = __riscv_vsub_vv_i16m1(p1_i16, q1_i16, vl);
+ vint16m1_t delta_i16 = __riscv_vsll_vx_i16m1(sub_q0_p0, 2, vl);
+ delta_i16 = __riscv_vadd_vv_i16m1(delta_i16, sub_p1_q1, vl);
+ delta_i16 = __riscv_vssra_vx_i16m1(delta_i16, 3, vl);
+ delta_i16 = __riscv_vmin_vv_i16m1(delta_i16, tc_adjust, vl);
+ delta_i16 = __riscv_vmax_vv_i16m1(delta_i16, __riscv_vrsub_vx_i16m1(tc_adjust, 0, vl), vl);
+
+ vint16m1_t p0_new_i16 = __riscv_vadd_vv_i16m1(p0_i16, delta_i16, vl);
+ vint16m1_t q0_new_i16 = __riscv_vsub_vv_i16m1(q0_i16, delta_i16, vl);
+ p0_new_i16 = __riscv_vmax_vx_i16m1(p0_new_i16, 0, vl);
+ q0_new_i16 = __riscv_vmax_vx_i16m1(q0_new_i16, 0, vl);
+
+ *p_p0_dst= __riscv_vnclipu_wx_u8mf2_mu(cond1, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new_i16), 0, vl);
+ *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond1, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new_i16), 0, vl);
+}
+
+__attribute__((always_inline)) static void v_loop_filter_luma(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta, int8_t *p_tc0)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+ int tc_offset = 0;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(width);
+
+ vint8mf2_t tc8;
+ extend_tc0(&tc8, p_tc0, tc_offset, vl);
+
+ vuint8mf2_t p2 = __riscv_vle8_v_u8mf2(p_iter - 3 * stride, vl);
+ vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl);
+ vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl);
+ vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl);
+ vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl);
+ vuint8mf2_t q2 = __riscv_vle8_v_u8mf2(p_iter + 2 * stride, vl);
+
+ vuint8mf2_t p1_dst, p0_dst, q0_dst, q1_dst;
+ luma_core(&p1_dst, &p0_dst, &q0_dst, &q1_dst, p2, p1, p0, q0, q1, q2, tc8, alpha, beta, vl);
+
+ __riscv_vse8_v_u8mf2(p_iter - stride * 2, p1_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter + stride, q1_dst, vl);
+
+ count -= vl;
+ tc_offset = tc_offset + vl;
+ p_iter = p_iter + vl;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+__attribute__((always_inline)) static void h_loop_filter_luma(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta, int8_t *p_tc0)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+ int tc_offset = 0;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(width);
+
+ vint8mf2_t tc8;
+ extend_tc0(&tc8, p_tc0, tc_offset, vl);
+
+ vuint8mf2_t p2, p1, p0, q0, q1, q2;
+        __riscv_vlsseg6e8_v_u8mf2(&p2, &p1, &p0, &q0, &q1, &q2, p_iter - 3, stride, vl);
+
+ vuint8mf2_t p1_dst, p0_dst, q0_dst, q1_dst;
+ luma_core(&p1_dst, &p0_dst, &q0_dst, &q1_dst, p2, p1, p0, q0, q1, q2, tc8, alpha, beta, vl);
+
+        __riscv_vssseg4e8_v_u8mf2(p_iter - 2, stride, p1_dst, p0_dst, q0_dst, q1_dst, vl);
+
+ count -= vl;
+ tc_offset = tc_offset + vl;
+ p_iter = p_iter + vl * stride;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+__attribute__((always_inline)) static void chroma_core(vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst,
+ vuint8mf2_t p1, vuint8mf2_t p0, vuint8mf2_t q0,
+ vuint8mf2_t q1, vint8mf2_t tc8, int alpha,
+ int beta, int vl)
+{
+ vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl));
+ vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl));
+ vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl));
+ vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl));
+
+ vint16m1_t sub_q0_p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl);
+ vint16m1_t sub_p1_p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl);
+ vint16m1_t sub_q1_q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl);
+
+ vint16m1_t rsub_q0_p0 = __riscv_vrsub_vx_i16m1(sub_q0_p0, 0, vl);
+ vint16m1_t rsub_p1_p0 = __riscv_vrsub_vx_i16m1(sub_p1_p0, 0, vl);
+ vint16m1_t rsub_q1_q0 = __riscv_vrsub_vx_i16m1(sub_q1_q0, 0, vl);
+
+ vint16m1_t abs_diff11 = __riscv_vmax_vv_i16m1(sub_q0_p0, rsub_q0_p0, vl);
+ vint16m1_t abs_diff12 = __riscv_vmax_vv_i16m1(sub_p1_p0, rsub_p1_p0, vl);
+ vint16m1_t abs_diff13 = __riscv_vmax_vv_i16m1(sub_q1_q0, rsub_q1_q0, vl);
+
+ vint16m1_t tc = __riscv_vwadd_vx_i16m1(tc8, 0, vl);
+ vbool16_t cond_mask = __riscv_vmsge_vx_i16m1_b16(tc, 0, vl);
+ vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16_mu(cond_mask, cond_mask, abs_diff11, alpha, vl);
+ vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16_mu(cond11, cond11, abs_diff12, beta, vl);
+ vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16_mu(cond12, cond12, abs_diff13, beta, vl);
+
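+    // delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)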
+ vint16m1_t sub_p1_q1 = __riscv_vsub_vv_i16m1(p1_i16, q1_i16, vl);
+ vint16m1_t delta = __riscv_vsll_vx_i16m1(sub_q0_p0, 2, vl);
+ delta = __riscv_vadd_vv_i16m1(delta, sub_p1_q1, vl);
+ delta = __riscv_vssra_vx_i16m1(delta, 3, vl);
+ delta = __riscv_vmin_vv_i16m1(delta, tc, vl);
+ delta = __riscv_vmax_vv_i16m1(delta, __riscv_vrsub_vx_i16m1(tc, 0, vl), vl);
+
+ vint16m1_t p0_new_i16 = __riscv_vadd_vv_i16m1(p0_i16, delta, vl);
+ vint16m1_t q0_new_i16 = __riscv_vsub_vv_i16m1(q0_i16, delta, vl);
+ p0_new_i16 = __riscv_vmax_vx_i16m1(p0_new_i16, 0, vl);
+ q0_new_i16 = __riscv_vmax_vx_i16m1(q0_new_i16, 0, vl);
+
+ *p_p0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new_i16), 0, vl);
+ *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new_i16), 0, vl);
+}
+
+__attribute__((always_inline)) static void v_loop_filter_chroma(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta, int8_t *p_tc0)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+ int tc_offset = 0;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(width);
+
+ vint8mf2_t tc8;
+ extend_tc0_2(&tc8, p_tc0, tc_offset, vl);
+
+ vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl);
+ vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl);
+ vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl);
+ vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl);
+
+ vuint8mf2_t p0_dst, q0_dst;
+ chroma_core(&p0_dst, &q0_dst, p1, p0, q0, q1, tc8, alpha, beta, vl);
+
+ __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl);
+
+ count -= vl;
+ tc_offset += vl;
+ p_iter = p_iter + vl;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+__attribute__((always_inline)) static void h_loop_filter_chroma(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta, int8_t *p_tc0)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+ int tc_offset = 0;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(width);
+
+ vint8mf2_t tc8;
+ extend_tc0_2(&tc8, p_tc0, tc_offset, vl);
+
+ vuint8mf2_t p1, p0, q0, q1;
+ __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl);
+
+ vuint8mf2_t p0_dst, q0_dst;
+ chroma_core(&p0_dst, &q0_dst, p1, p0, q0, q1, tc8, alpha, beta, vl);
+
+ __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl);
+
+ count -= vl;
+ tc_offset = tc_offset + vl;
+ p_iter = p_iter + vl * stride;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+__attribute__((always_inline)) static void luma_intra_core(vuint8mf2_t *p_p2_dst, vuint8mf2_t *p_p1_dst,
+ vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst,
+ vuint8mf2_t *p_q1_dst, vuint8mf2_t *p_q2_dst,
+ vuint8mf2_t p3, vuint8mf2_t p2, vuint8mf2_t p1,
+ vuint8mf2_t p0, vuint8mf2_t q0, vuint8mf2_t q1,
+ vuint8mf2_t q2, vuint8mf2_t q3, int alpha,
+ int beta, int vl)
+{
+ vint16m1_t p3_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p3, 0, vl));
+ vint16m1_t p2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p2, 0, vl));
+ vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl));
+ vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl));
+ vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl));
+ vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl));
+ vint16m1_t q2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q2, 0, vl));
+ vint16m1_t q3_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q3, 0, vl));
+
+ // p0
+ vint16m1_t sum_p1p0q0 = __riscv_vadd_vv_i16m1(p0_i16, p1_i16, vl);
+ sum_p1p0q0 = __riscv_vadd_vv_i16m1(sum_p1p0q0, q0_i16, vl);
+
+ vint16m1_t p0_new1_i16 = __riscv_vadd_vv_i16m1(p0_i16, q1_i16, vl);
+ vint16m1_t p0_new2_i16 = __riscv_vadd_vv_i16m1(p2_i16, q1_i16, vl);
+
+ // p1
+ vint16m1_t p1_new1_i16 = __riscv_vadd_vv_i16m1(sum_p1p0q0, p2_i16, vl);
+
+ // q0
+ vint16m1_t sum_p0q0q1 = __riscv_vadd_vv_i16m1(p0_i16, q0_i16, vl);
+ sum_p0q0q1 = __riscv_vadd_vv_i16m1(sum_p0q0q1, q1_i16, vl);
+
+ vint16m1_t q0_new1_i16 = __riscv_vadd_vv_i16m1(q0_i16, p1_i16, vl);
+ vint16m1_t q0_new2_i16 = __riscv_vadd_vv_i16m1(q2_i16, p1_i16, vl);
+
+ // q1
+ vint16m1_t q1_new1_i16 = __riscv_vadd_vv_i16m1(sum_p0q0q1, q2_i16, vl);
+
+ p0_new1_i16 = __riscv_vmacc_vx_i16m1(p0_new1_i16, 2, p1_i16, vl);
+ p0_new2_i16 = __riscv_vmacc_vx_i16m1(p0_new2_i16, 2, sum_p1p0q0, vl);
+ vint16m1_t p2_new1_i16 = __riscv_vmadd_vx_i16m1(p3_i16, 2, sum_p1p0q0, vl);
+ p2_new1_i16 = __riscv_vmacc_vx_i16m1(p2_new1_i16, 3, p2_i16, vl);
+ q0_new1_i16 = __riscv_vmacc_vx_i16m1(q0_new1_i16, 2, q1_i16, vl);
+ q0_new2_i16 = __riscv_vmacc_vx_i16m1(q0_new2_i16, 2, sum_p0q0q1, vl);
+ vint16m1_t q2_new1_i16 = __riscv_vmadd_vx_i16m1(q3_i16, 2, sum_p0q0q1, vl);
+ q2_new1_i16 = __riscv_vmacc_vx_i16m1(q2_new1_i16, 3, q2_i16, vl);
+
+ vint16m1_t sub_q0p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl);
+ vint16m1_t sub_p1p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl);
+ vint16m1_t sub_q1q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl);
+ vint16m1_t sub_p2p0 = __riscv_vsub_vv_i16m1(p2_i16, p0_i16, vl);
+ vint16m1_t sub_q2q0 = __riscv_vsub_vv_i16m1(q2_i16, q0_i16, vl);
+
+ vint16m1_t rsub_q0p0 = __riscv_vrsub_vx_i16m1(sub_q0p0, 0, vl);
+ vint16m1_t rsub_p1p0 = __riscv_vrsub_vx_i16m1(sub_p1p0, 0, vl);
+ vint16m1_t rsub_q1q0 = __riscv_vrsub_vx_i16m1(sub_q1q0, 0, vl);
+ vint16m1_t rsub_p2p0 = __riscv_vrsub_vx_i16m1(sub_p2p0, 0, vl);
+ vint16m1_t rsub_q2q0 = __riscv_vrsub_vx_i16m1(sub_q2q0, 0, vl);
+
+ vint16m1_t abd_q0p0 = __riscv_vmax_vv_i16m1(rsub_q0p0, sub_q0p0, vl);
+ vint16m1_t abd_p1p0_ = __riscv_vmax_vv_i16m1(rsub_p1p0, sub_p1p0, vl);
+ vint16m1_t abd_q1q0 = __riscv_vmax_vv_i16m1(rsub_q1q0, sub_q1q0, vl);
+ vint16m1_t abd_p2p0 = __riscv_vmax_vv_i16m1(rsub_p2p0, sub_p2p0, vl);
+ vint16m1_t abd_q2q0 = __riscv_vmax_vv_i16m1(rsub_q2q0, sub_q2q0, vl);
+
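+    // cond1: basic check |q0-p0| < alpha, |p1-p0| < beta, |q1-q0| < beta;
+    // cond2: strong filter when |q0-p0| < (alpha >> 2) + 2; cond3/cond4: p2/q2 activity below beta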
+ vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, alpha, vl);
+ vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16(abd_p1p0_, beta, vl);
+ vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16(abd_q1q0, beta, vl);
+ vbool16_t cond2 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, (alpha >> 2) + 2, vl);
+ vbool16_t cond3 = __riscv_vmslt_vx_i16m1_b16(abd_p2p0, beta, vl);
+ vbool16_t cond4 = __riscv_vmslt_vx_i16m1_b16(abd_q2q0, beta, vl);
+
+ vbool16_t cond1 = __riscv_vmand_mm_b16(cond11, cond12, vl);
+ cond1 = __riscv_vmand_mm_b16(cond1, cond13, vl);
+ cond2 = __riscv_vmand_mm_b16(cond2, cond1, vl);
+ cond3 = __riscv_vmand_mm_b16(cond3, cond2, vl);
+ cond4 = __riscv_vmand_mm_b16(cond4, cond2, vl);
+
+ vuint8mf2_t p0_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p0_new1_i16), 2, vl);
+ vuint8mf2_t p0_new2_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p0_new2_i16), 3, vl);
+ vuint8mf2_t p1_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p1_new1_i16), 2, vl);
+ vuint8mf2_t p2_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p2_new1_i16), 3, vl);
+ vuint8mf2_t q0_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q0_new1_i16), 2, vl);
+ vuint8mf2_t q0_new2_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q0_new2_i16), 3, vl);
+ vuint8mf2_t q1_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q1_new1_i16), 2, vl);
+ vuint8mf2_t q2_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q2_new1_i16), 3, vl);
+
+ *p_p1_dst = __riscv_vmerge_vvm_u8mf2(p1, p1_new1_u8, cond3, vl);
+ *p_p2_dst = __riscv_vmerge_vvm_u8mf2(p2, p2_new1_u8, cond3, vl);
+ *p_p0_dst = __riscv_vmerge_vvm_u8mf2(p0_new1_u8, p0_new2_u8, cond3, vl);
+ *p_p0_dst = __riscv_vmerge_vvm_u8mf2(p0, *p_p0_dst, cond1, vl);
+
+ *p_q0_dst = __riscv_vmerge_vvm_u8mf2(q0, q0_new1_u8, cond1, vl);
+ *p_q0_dst = __riscv_vmerge_vvm_u8mf2(*p_q0_dst, q0_new2_u8, cond4, vl);
+ *p_q1_dst = __riscv_vmerge_vvm_u8mf2(q1, q1_new1_u8, cond4, vl);
+ *p_q2_dst = __riscv_vmerge_vvm_u8mf2(q2, q2_new1_u8, cond4, vl);
+}
+
+__attribute__((always_inline)) static void v_loop_filter_luma_intra(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(width);
+
+ vuint8mf2_t p3 = __riscv_vle8_v_u8mf2(p_iter - 4 * stride, vl);
+ vuint8mf2_t p2 = __riscv_vle8_v_u8mf2(p_iter - 3 * stride, vl);
+ vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl);
+ vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl);
+ vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl);
+ vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl);
+ vuint8mf2_t q2 = __riscv_vle8_v_u8mf2(p_iter + 2 * stride, vl);
+ vuint8mf2_t q3 = __riscv_vle8_v_u8mf2(p_iter + 3 * stride, vl);
+
+ vuint8mf2_t p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst;
+
+ luma_intra_core(&p2_dst, &p1_dst, &p0_dst, &q0_dst, &q1_dst, &q2_dst,
+ p3, p2, p1, p0, q0, q1, q2, q3, alpha, beta, vl);
+
+ __riscv_vse8_v_u8mf2(p_iter - stride * 3, p2_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter - stride * 2, p1_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter + stride, q1_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter + stride * 2, q2_dst, vl);
+
+ count -= vl;
+ p_iter = p_iter + vl;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+__attribute__((always_inline)) static void h_loop_filter_luma_intra(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(width);
+
+ vuint8mf2_t p3, p2, p1, p0, q0, q1, q2, q3;
+        __riscv_vlsseg8e8_v_u8mf2(&p3, &p2, &p1, &p0,
+                                   &q0, &q1, &q2, &q3, p_iter - 4, stride, vl);
+
+ vuint8mf2_t p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst;
+
+ luma_intra_core(&p2_dst, &p1_dst, &p0_dst, &q0_dst, &q1_dst, &q2_dst,
+ p3, p2, p1, p0, q0, q1, q2, q3, alpha, beta, vl);
+
+        __riscv_vssseg6e8_v_u8mf2(p_iter - 3, stride,
+                                   p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst, vl);
+
+ count -= vl;
+ p_iter = p_iter + vl * stride;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+__attribute__((always_inline)) static void chroma_intra_core(vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst,
+ vuint8mf2_t p1, vuint8mf2_t p0,
+ vuint8mf2_t q0, vuint8mf2_t q1,
+ int alpha, int beta, int vl)
+{
+ vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl));
+ vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl));
+ vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl));
+ vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl));
+
+ vint16m1_t sub_q0p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl);
+ vint16m1_t sub_p1p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl);
+ vint16m1_t sub_q1q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl);
+
+ vint16m1_t rsub_q0p0 = __riscv_vrsub_vx_i16m1(sub_q0p0, 0, vl);
+ vint16m1_t rsub_p1p0 = __riscv_vrsub_vx_i16m1(sub_p1p0, 0, vl);
+ vint16m1_t rsub_q1q0 = __riscv_vrsub_vx_i16m1(sub_q1q0, 0, vl);
+
+ vint16m1_t abd_q0p0 = __riscv_vmax_vv_i16m1(sub_q0p0, rsub_q0p0, vl);
+ vint16m1_t abd_p1p0_ = __riscv_vmax_vv_i16m1(sub_p1p0, rsub_p1p0, vl);
+ vint16m1_t abd_q1q0 = __riscv_vmax_vv_i16m1(sub_q1q0, rsub_q1q0, vl);
+
+ vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, alpha, vl);
+ vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16_mu(cond11, cond11, abd_p1p0_, beta, vl);
+ vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16_mu(cond12, cond12, abd_q1q0, beta, vl);
+
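+    // p0' = (2*p1 + p0 + q1 + 2) >> 2 and q0' = (2*q1 + q0 + p1 + 2) >> 2, applied where cond13 holds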
+ vint16m1_t p0_new1_i16 = __riscv_vadd_vv_i16m1(p0_i16, q1_i16, vl);
+ vint16m1_t q0_new1_i16 = __riscv_vadd_vv_i16m1(q0_i16, p1_i16, vl);
+ p0_new1_i16 = __riscv_vmacc_vx_i16m1(p0_new1_i16, 2, p1_i16, vl);
+ q0_new1_i16 = __riscv_vmacc_vx_i16m1(q0_new1_i16, 2, q1_i16, vl);
+
+ *p_p0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new1_i16), 2, vl);
+ *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new1_i16), 2, vl);
+}
+
+__attribute__((always_inline)) static void v_loop_filter_chroma_intra(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(width);
+
+ vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl);
+ vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl);
+ vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl);
+ vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl);
+
+ vuint8mf2_t p0_dst, q0_dst;
+ chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl);
+
+ __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl);
+ __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl);
+
+ count -= vl;
+ p_iter = p_iter + vl;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+__attribute__((always_inline)) static void h_loop_filter_chroma_intra(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(width);
+
+ vuint8mf2_t p1, p0, q0, q1;
+ __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl);
+
+ vuint8mf2_t p0_dst, q0_dst;
+ chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl);
+
+ __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl);
+
+ count -= vl;
+ p_iter = p_iter + vl * stride;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+__attribute__((always_inline)) static void h_loop_filter_chroma_mbaff_intra(uint8_t *p_pix, ptrdiff_t stride,
+ int width, int alpha, int beta)
+{
+ uint8_t *p_iter = p_pix;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_TONEARESTUP);
+
+ int count = width;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8mf2(count);
+
+ vuint8mf2_t p1, p0, q0, q1;
+ __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl);
+
+ vuint8mf2_t p0_dst, q0_dst;
+ chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl);
+
+ __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl);
+
+ count -= vl;
+ p_iter = p_iter + vl * stride;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
+void h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0)
+{
+ v_loop_filter_luma(pix, stride, 16, alpha, beta, p_tc0);
+}
+
+void h264_h_loop_filter_luma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0)
+{
+ h_loop_filter_luma(p_pix, stride, 16, alpha, beta, p_tc0);
+}
+
+void h264_v_loop_filter_chroma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0)
+{
+ v_loop_filter_chroma(p_pix, stride, 8, alpha, beta, p_tc0);
+}
+
+void h264_h_loop_filter_chroma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0)
+{
+ h_loop_filter_chroma(p_pix, stride, 8, alpha, beta, p_tc0);
+}
+
+void h264_v_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta)
+{
+ v_loop_filter_luma_intra(p_pix, stride, 16, alpha, beta);
+}
+
+void h264_h_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta)
+{
+ h_loop_filter_luma_intra(p_pix, stride, 16, alpha, beta);
+}
+
+void h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta)
+{
+ v_loop_filter_chroma_intra(p_pix, stride, 8, alpha, beta);
+}
+
+void h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta)
+{
+ h_loop_filter_chroma_intra(p_pix, stride, 8, alpha, beta);
+}
+
+void h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta)
+{
+ h_loop_filter_chroma_mbaff_intra(p_pix, stride, 4, alpha, beta);
+}
+#endif
diff --git a/libavcodec/riscv/h264_inloop.h b/libavcodec/riscv/h264_inloop.h
new file mode 100644
index 0000000000..3c60e45395
--- /dev/null
+++ b/libavcodec/riscv/h264_inloop.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_INLOOP_H
+#define AVCODEC_RISCV_H264_INLOOP_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
+void h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0);
+void h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0);
+
+void h264_v_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0);
+void h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0);
+
+void h264_v_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta);
+void h264_h_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta);
+
+void h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta);
+void h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta);
+
+void h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta);
+#endif
+#endif
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_weighted_sum.c b/libavcodec/riscv/h264_weighted_sum.c
new file mode 100644
index 0000000000..0ba57d0acc
--- /dev/null
+++ b/libavcodec/riscv/h264_weighted_sum.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264_weighted_sum.h"
+#if HAVE_INTRINSICS_RVV
+#include <riscv_vector.h>
+
+typedef unsigned char pixel;
+
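+/*
+ * Weighted prediction for the common weight == 128 case (the default
+ * weight when log2_den == 7): the multiply reduces to a left shift by 7,
+ * so only the pre-shifted offset and rounding term are added before
+ * narrowing back to 8 bits.
+ */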
+__attribute__((always_inline)) static inline void h264_weight_128(uint8_t *p_block, ptrdiff_t stride, int width,
+ int height, int log2_den, int offset)
+{
+ uint8_t *p_block_iter = p_block;
+
+ /* offset is pre-shifted by log2_den, as in the scalar reference */
+ short value = (unsigned)offset << log2_den;
+
+ if (log2_den)
+ value += (1 << (log2_den - 1));
+
+ int shift = log2_den;
+
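+ /* Save the fixed-point rounding mode (vxrm) and select round-down so the */
+ /* narrowing clips below shift without extra rounding; restored on exit. */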
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+ int count = width;
+
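+ /* Strip-mine across the block width: each pass covers vl columns of every */
+ /* row, then the pointer is rewound and advanced by vl columns. */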
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(count);
+ uint8_t *p_begin = p_block_iter;
+
+ for (int j = 0; j < height; j += 2)
+ {
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_block_iter, vl);
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_block_iter + stride, vl);
+
+ vint16m2_t result0_w, result1_w;
+
+ result0_w = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vzext_vf2_u16m2(row0, vl));
+ result1_w = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vzext_vf2_u16m2(row1, vl));
+
+ result0_w = __riscv_vsll_vx_i16m2(result0_w, 7, vl);
+ result1_w = __riscv_vsll_vx_i16m2(result1_w, 7, vl);
+
+ result0_w = __riscv_vsadd_vx_i16m2(result0_w, value, vl);
+ result1_w = __riscv_vsadd_vx_i16m2(result1_w, value, vl);
+
+ result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+ result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+ vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+ vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+ __riscv_vse8_v_u8m1(p_block_iter, result0_n, vl);
+ p_block_iter += stride;
+ __riscv_vse8_v_u8m1(p_block_iter, result1_n, vl);
+ p_block_iter += stride;
+ }
+
+ p_block_iter = p_begin + vl;
+ count -= vl;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
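+/*
+ * General per-sample weighting:
+ * dst = clip(((dst * weight + (1 << (log2_den - 1))) >> log2_den) + offset),
+ * with the offset folded into "value" so one saturating add handles both
+ * the rounding term and the offset.
+ */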
+__attribute__((always_inline)) static inline void h264_weight_normal(uint8_t *p_block, ptrdiff_t stride,
+ int width, int height, int log2_den,
+ int weight, int offset)
+{
+ uint8_t *p_block_iter = p_block;
+
+ short value = (unsigned)offset << log2_den;
+
+ if (log2_den)
+ value += (1 << (log2_den - 1));
+
+ int shift = log2_den;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+ int count = width;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(count);
+ uint8_t *p_begin = p_block_iter;
+
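+ /* The weight is signed while the pixels are unsigned, so broadcast it */
+ /* once per strip and use the widening signed-by-unsigned multiply. */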
+ vint8m1_t weight_v = __riscv_vmv_v_x_i8m1(weight, vl);
+
+ for (int j = 0; j < height; j += 2)
+ {
+ vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_block_iter, vl);
+ vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_block_iter + stride, vl);
+
+ vint16m2_t result0_w, result1_w;
+
+ result0_w = __riscv_vwmulsu_vv_i16m2(weight_v, row0, vl);
+ result1_w = __riscv_vwmulsu_vv_i16m2(weight_v, row1, vl);
+
+ result0_w = __riscv_vsadd_vx_i16m2(result0_w, value, vl);
+ result1_w = __riscv_vsadd_vx_i16m2(result1_w, value, vl);
+
+ result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+ result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+ vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+ vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+ __riscv_vse8_v_u8m1(p_block_iter, result0_n, vl);
+ p_block_iter += stride;
+ __riscv_vse8_v_u8m1(p_block_iter, result1_n, vl);
+ p_block_iter += stride;
+ }
+
+ p_block_iter = p_begin + vl;
+ count -= vl;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
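+/*
+ * Bidirectional weighting:
+ * dst = clip((dst * weightd + src * weights + (((offset + 1) | 1) << log2_den))
+ * >> (log2_den + 1))
+ */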
+__attribute__((always_inline)) static inline void h264_biweight(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+ int width, int height, int log2_den,
+ int weightd, int weights, int offset)
+{
+ uint8_t *p_dst_iter = p_dst;
+ uint8_t *p_src_iter = p_src;
+ short value = (unsigned int)((offset + 1) | 1) << log2_den;
+ int shift = log2_den + 1;
+
+ size_t vxrm = __builtin_rvv_vgetvxrm();
+ __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+ int count = width;
+
+ while (count > 0)
+ {
+ int vl = __riscv_vsetvl_e8m1(count);
+ uint8_t *p_src_begin = p_src_iter;
+ uint8_t *p_dst_begin = p_dst_iter;
+
+ for (int j = 0; j < height; j += 2)
+ {
+ vuint8m1_t src0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+ vuint8m1_t src1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+ p_src_iter += stride;
+
+ vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+ vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+
+ vint16m2_t result0_w, result1_w;
+
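+ /* Seed the widening accumulators with the rounding/offset term, then */
+ /* add both weighted inputs via signed-by-unsigned multiply-accumulates. */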
+ result0_w = __riscv_vmv_v_x_i16m2(value, vl);
+ result1_w = __riscv_vmv_v_x_i16m2(value, vl);
+
+ result0_w = __riscv_vwmaccsu_vx_i16m2(result0_w, weightd, dst0, vl);
+ result1_w = __riscv_vwmaccsu_vx_i16m2(result1_w, weightd, dst1, vl);
+
+ result0_w = __riscv_vwmaccsu_vx_i16m2(result0_w, weights, src0, vl);
+ result1_w = __riscv_vwmaccsu_vx_i16m2(result1_w, weights, src1, vl);
+
+ result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+ result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+ vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+ vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+ __riscv_vse8_v_u8m1(p_dst_iter, result0_n, vl);
+ p_dst_iter += stride;
+ __riscv_vse8_v_u8m1(p_dst_iter, result1_n, vl);
+ p_dst_iter += stride;
+ }
+
+ p_src_iter = p_src_begin + vl;
+ p_dst_iter = p_dst_begin + vl;
+ count -= vl;
+ }
+
+ __builtin_rvv_vsetvxrm(vxrm);
+}
+
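+/*
+ * Entry points: return early when the weighting is a no-op (weight 1,
+ * offset 0, log2_den 0), otherwise dispatch to the weight == 128 fast
+ * path or the general routine.
+ */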
+void weight_h264_pixels_16_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+ int height, int log2_den, int weight, int offset)
+{
+ if (weight == 1 && offset == 0 && log2_den == 0)
+ return;
+
+ if (weight == 128)
+ {
+ h264_weight_128(p_block, stride, 16, height, log2_den, offset);
+ }
+ else
+ {
+ h264_weight_normal(p_block, stride, 16, height, log2_den, weight, offset);
+ }
+}
+
+void weight_h264_pixels_8_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+ int height, int log2_den, int weight, int offset)
+{
+ if (weight == 1 && offset == 0 && log2_den == 0)
+ return;
+
+ if (weight == 128)
+ {
+ h264_weight_128(p_block, stride, 8, height, log2_den, offset);
+ }
+ else
+ {
+ h264_weight_normal(p_block, stride, 8, height, log2_den, weight, offset);
+ }
+}
+
+void weight_h264_pixels_4_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+ int height, int log2_den, int weight, int offset)
+{
+ if (weight == 1 && offset == 0 && log2_den == 0)
+ return;
+
+ if (weight == 128)
+ {
+ h264_weight_128(p_block, stride, 4, height, log2_den, offset);
+ }
+ else
+ {
+ h264_weight_normal(p_block, stride, 4, height, log2_den, weight, offset);
+ }
+}
+
+void biweight_h264_pixels_16_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset)
+{
+ h264_biweight(p_dst, p_src, stride, 16, height, log2_den, weightd, weights, offset);
+}
+
+void biweight_h264_pixels_8_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset)
+{
+ h264_biweight(p_dst, p_src, stride, 8, height, log2_den, weightd, weights, offset);
+}
+
+void biweight_h264_pixels_4_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset)
+{
+ h264_biweight(p_dst, p_src, stride, 4, height, log2_den, weightd, weights, offset);
+}
+#endif
diff --git a/libavcodec/riscv/h264_weighted_sum.h b/libavcodec/riscv/h264_weighted_sum.h
new file mode 100644
index 0000000000..631d6df1fa
--- /dev/null
+++ b/libavcodec/riscv/h264_weighted_sum.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_WEIGHTED_SUM_H
+#define AVCODEC_RISCV_H264_WEIGHTED_SUM_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
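+/*
+ * Weighted-prediction entry points: the _16/_8/_4 suffix is the block
+ * width, _8 the bit depth, and _rvv marks the RISC-V vector version.
+ */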
+void weight_h264_pixels_16_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+ int height, int log2_den, int weight, int offset);
+void weight_h264_pixels_8_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+ int height, int log2_den, int weight, int offset);
+void weight_h264_pixels_4_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+ int height, int log2_den, int weight, int offset);
+
+void biweight_h264_pixels_16_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+ int height, int log2_den, int weightd, int weights, int offset);
+void biweight_h264_pixels_8_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+ int height, int log2_den, int weightd, int weights, int offset);
+void biweight_h264_pixels_4_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+ int height, int log2_den, int weightd, int weights, int offset);
+#endif /* HAVE_INTRINSICS_RVV */
+#endif /* AVCODEC_RISCV_H264_WEIGHTED_SUM_H */
\ No newline at end of file
--
2.17.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 11+ messages in thread