Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
From: Arnie Chang <arnie.chang@sifive.com>
To: ffmpeg-devel@ffmpeg.org
Cc: Arnie Chang <arnie.chang@sifive.com>
Subject: [FFmpeg-devel] [PATCH 2/5] lavc/h264chroma: Add vectorized implementation of chroma MC for RISC-V
Date: Tue,  9 May 2023 17:50:27 +0800
Message-ID: <20230509095030.25506-3-arnie.chang@sifive.com> (raw)
In-Reply-To: <20230509095030.25506-1-arnie.chang@sifive.com>

Optimize chroma motion compensation using RISC-V vector intrinsics,
resulting in an average 13% FPS improvement when decoding 720p video.

Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
 libavcodec/h264chroma.c                   |   2 +
 libavcodec/h264chroma.h                   |   1 +
 libavcodec/riscv/Makefile                 |   3 +
 libavcodec/riscv/h264_chroma_init_riscv.c |  45 ++
 libavcodec/riscv/h264_mc_chroma.c         | 821 ++++++++++++++++++++++
 libavcodec/riscv/h264_mc_chroma.h         |  40 ++
 6 files changed, 912 insertions(+)
 create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
 create mode 100644 libavcodec/riscv/h264_mc_chroma.c
 create mode 100644 libavcodec/riscv/h264_mc_chroma.h
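
For reviewers: the kernels below implement the standard H.264 chroma bilinear
interpolation with eighth-pel weights. A minimal scalar sketch of the "put"
variant for 8-bit pixels is given here for reference only; the function and
variable names are illustrative and not part of the patch:

    static void put_chroma_scalar(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int w, int h, int x, int y)
    {
        const int a = (8 - x) * (8 - y);   /* weight for src[i]              */
        const int b = x * (8 - y);         /* weight for src[i + 1]          */
        const int c = (8 - x) * y;         /* weight for src[i + stride]     */
        const int d = x * y;               /* weight for src[i + stride + 1] */

        for (int j = 0; j < h; j++) {
            for (int i = 0; i < w; i++)
                dst[i] = (a * src[i] + b * src[i + 1] +
                          c * src[i + stride] + d * src[i + stride + 1] + 32) >> 6;
            dst += stride;
            src += stride;
        }
    }

The vector code computes the same weights, processes a whole row per iteration
with the vector length selected by vsetvl, and specializes the d == 0, b == 0
and c == 0 cases so that unused widening multiply-accumulates are skipped. The
+32 rounding is assumed to come from vnclipu under the default
round-to-nearest-up (RNU) fixed-point rounding mode; the "avg" variants
additionally average the result with the existing destination using vaaddu.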

diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index 60b86b6fba..1eeab7bc40 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
     ff_h264chroma_init_mips(c, bit_depth);
 #elif ARCH_LOONGARCH64
     ff_h264chroma_init_loongarch(c, bit_depth);
+#elif ARCH_RISCV
+    ff_h264chroma_init_riscv(c, bit_depth);
 #endif
 }
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index b8f9c8f4fc..9c81c18a76 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 965942f4df..08b76c93cb 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -19,3 +19,6 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
+
+OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
+RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
new file mode 100644
index 0000000000..daeca01fa2
--- /dev/null
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/h264chroma.h"
+#include "config.h"
+#include "h264_mc_chroma.h"
+
+av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
+{
+#if HAVE_INTRINSICS_RVV
+    const int high_bit_depth = bit_depth > 8;
+
+    if (!high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
+        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+
+        c->put_h264_chroma_pixels_tab[1] = h264_put_chroma_mc4_rvv;
+        c->avg_h264_chroma_pixels_tab[1] = h264_avg_chroma_mc4_rvv;
+
+        c->put_h264_chroma_pixels_tab[2] = h264_put_chroma_mc2_rvv;
+        c->avg_h264_chroma_pixels_tab[2] = h264_avg_chroma_mc2_rvv;
+    }
+#endif
+}
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_chroma.c b/libavcodec/riscv/h264_mc_chroma.c
new file mode 100644
index 0000000000..64b13ec3b8
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.c
@@ -0,0 +1,821 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264_mc_chroma.h"
+#if HAVE_INTRINSICS_RVV
+#include <riscv_vector.h>
+typedef unsigned char pixel;
+
+__attribute__((always_inline)) static void h264_put_chroma_unroll4(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int w, int h, int x, int y)
+{
+    uint8_t *p_dst_iter = p_dst;
+    const uint8_t *p_src_iter = p_src;
+
+    const int xy = x * y;
+    const int x8 = x << 3;
+    const int y8 = y << 3;
+    const int a = 64 - x8 - y8 + xy;
+    const int b = x8 - xy;
+    const int c = y8 - xy;
+    const int d = xy;
+
+    int vl = __riscv_vsetvl_e8m1(w);
+
+    if (d != 0)
+    {
+        for (int j = 0; j < h; j += 4)
+        {
+            // dst 1st row
+            vuint8m1_t row00 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+
+            vuint8m1_t row01;
+            row01 = __riscv_vslidedown_vx_u8m1(row00, 1, vl + 1);
+
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row00, a, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, b, row01, vl);
+
+            vuint8m1_t row10 = __riscv_vle8_v_u8m1(p_src_iter + stride, vl + 1);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, c, row10, vl);
+
+            vuint8m1_t row11;
+            row11 = __riscv_vslidedown_vx_u8m1(row10, 1, vl + 1);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, d, row11, vl);
+
+            // dst 2nd row
+            p_src_iter += (stride << 1);
+
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row10, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, b, row11, vl);
+
+            vuint8m1_t row20 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, c, row20, vl);
+
+            vuint8m1_t row21;
+            row21 = __riscv_vslidedown_vx_u8m1(row20, 1, vl + 1);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, d, row21, vl);
+
+            // dst 3rd row
+            p_src_iter += stride;
+
+            vuint16m2_t dst2 = __riscv_vwmulu_vx_u16m2(row20, a, vl);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, b, row21, vl);
+
+            vuint8m1_t row30 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, c, row30, vl);
+
+            vuint8m1_t row31;
+            row31 = __riscv_vslidedown_vx_u8m1(row30, 1, vl + 1);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, d, row31, vl);
+
+            // dst 4th row
+            p_src_iter += stride;
+
+            vuint16m2_t dst3 = __riscv_vwmulu_vx_u16m2(row30, a, vl);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, b, row31, vl);
+
+            vuint8m1_t row40 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, c, row40, vl);
+
+            vuint8m1_t row41;
+            row41 = __riscv_vslidedown_vx_u8m1(row40, 1, vl + 1);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, d, row41, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst2, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst3, 6, vl), vl);
+            p_dst_iter += stride;
+        }
+    }
+    else if (b == 0 && c != 0)
+    {
+        const unsigned short e = b + c;
+
+        for (int j = 0; j < h; j += 4)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row0, a, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, e, row1, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row1, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, e, row2, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst2 = __riscv_vwmulu_vx_u16m2(row2, a, vl);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, e, row3, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst3 = __riscv_vwmulu_vx_u16m2(row3, a, vl);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, e, row4, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst2, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst3, 6, vl), vl);
+            p_dst_iter += stride;
+        }
+    }
+    else if (b != 0 && c == 0)
+    {
+        const unsigned short e = b + c;
+
+        for (int j = 0; j < h; j += 4)
+        {
+            // 1st
+            vuint8m1_t row00 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row01;
+            row01 = __riscv_vslidedown_vx_u8m1(row00, 1, vl + 1);
+
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row00, a, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, e, row01, vl);
+
+            // 2nd
+            vuint8m1_t row10 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row11;
+            row11 = __riscv_vslidedown_vx_u8m1(row10, 1, vl + 1);
+
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row10, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, e, row11, vl);
+
+            // 3rd
+            vuint8m1_t row20 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row21;
+            row21 = __riscv_vslidedown_vx_u8m1(row20, 1, vl + 1);
+
+            vuint16m2_t dst2 = __riscv_vwmulu_vx_u16m2(row20, a, vl);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, e, row21, vl);
+
+            // 4th
+            vuint8m1_t row30 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row31;
+            row31 = __riscv_vslidedown_vx_u8m1(row30, 1, vl + 1);
+
+            vuint16m2_t dst3 = __riscv_vwmulu_vx_u16m2(row30, a, vl);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, e, row31, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst2, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst3, 6, vl), vl);
+            p_dst_iter += stride;
+        }
+    }
+    else
+    {
+        for (int j = 0; j < h; j += 4)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row0, a, vl);
+
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row1, a, vl);
+
+            vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst2 = __riscv_vwmulu_vx_u16m2(row2, a, vl);
+
+            vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst3 = __riscv_vwmulu_vx_u16m2(row3, a, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst2, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst3, 6, vl), vl);
+            p_dst_iter += stride;
+        }
+    }
+}
+
+__attribute__((always_inline)) static void h264_put_chroma_unroll2(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int w, int h, int x, int y)
+{
+    uint8_t *p_dst_iter = p_dst;
+    const uint8_t *p_src_iter = p_src;
+
+    const int xy = x * y;
+    const int x8 = x << 3;
+    const int y8 = y << 3;
+    const int a = 64 - x8 - y8 + xy;
+    const int b = x8 - xy;
+    const int c = y8 - xy;
+    const int d = xy;
+
+    int vl = __riscv_vsetvl_e8m1(w);
+
+    if (d != 0)
+    {
+        for (int j = 0; j < h; j += 2)
+        {
+            // dst 1st row
+            vuint8m1_t row00 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+
+            vuint8m1_t row01;
+            row01 = __riscv_vslidedown_vx_u8m1(row00, 1, vl + 1);
+
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row00, a, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, b, row01, vl);
+
+            vuint8m1_t row10 = __riscv_vle8_v_u8m1(p_src_iter + stride, vl + 1);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, c, row10, vl);
+
+            vuint8m1_t row11;
+            row11 = __riscv_vslidedown_vx_u8m1(row10, 1, vl + 1);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, d, row11, vl);
+
+            // dst 2nd row
+            p_src_iter += (stride << 1);
+
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row10, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, b, row11, vl);
+
+            vuint8m1_t row20 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, c, row20, vl);
+
+            vuint8m1_t row21;
+            row21 = __riscv_vslidedown_vx_u8m1(row20, 1, vl + 1);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, d, row21, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1, 6, vl), vl);
+            p_dst_iter += stride;
+        }
+    }
+    else if (b == 0 && c != 0)
+    {
+        const unsigned short e = b + c;
+
+        for (int j = 0; j < h; j += 2)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row0, a, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, e, row1, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row1, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, e, row2, vl);
+            p_src_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1, 6, vl), vl);
+            p_dst_iter += stride;
+        }
+    }
+    else if (b != 0 && c == 0)
+    {
+        const unsigned short e = b + c;
+
+        for (int j = 0; j < h; j += 2)
+        {
+            // 1st
+            vuint8m1_t row00 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row01;
+            row01 = __riscv_vslidedown_vx_u8m1(row00, 1, vl + 1);
+
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row00, a, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, e, row01, vl);
+
+            // 2nd
+            vuint8m1_t row10 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row11;
+            row11 = __riscv_vslidedown_vx_u8m1(row10, 1, vl + 1);
+
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row10, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, e, row11, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1, 6, vl), vl);
+            p_dst_iter += stride;
+        }
+    }
+    else
+    {
+        for (int j = 0; j < h; j += 2)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row0, a, vl);
+
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row1, a, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst0, 6, vl), vl);
+            p_dst_iter += stride;
+
+            __riscv_vse8_v_u8m1(p_dst_iter, __riscv_vnclipu_wx_u8m1(dst1, 6, vl), vl);
+            p_dst_iter += stride;
+        }
+    }
+}
+
+__attribute__((always_inline)) static void h264_avg_chroma_unroll4(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int w, int h, int x, int y)
+{
+    uint8_t *p_dst_iter = p_dst;
+    const uint8_t *p_src_iter = p_src;
+
+    const int xy = x * y;
+    const int x8 = x << 3;
+    const int y8 = y << 3;
+    const int a = 64 - x8 - y8 + xy;
+    const int b = x8 - xy;
+    const int c = y8 - xy;
+    const int d = xy;
+
+    int vl = __riscv_vsetvl_e8m1(w);
+
+    if (d != 0)
+    {
+        for (int j = 0; j < h; j += 4)
+        {
+            // dst 1st row
+            vuint8m1_t row00 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+
+            vuint8m1_t row01;
+            row01 = __riscv_vslidedown_vx_u8m1(row00, 1, vl + 1);
+
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row00, a, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, b, row01, vl);
+
+            vuint8m1_t row10 = __riscv_vle8_v_u8m1(p_src_iter + stride, vl + 1);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, c, row10, vl);
+
+            vuint8m1_t row11;
+            row11 = __riscv_vslidedown_vx_u8m1(row10, 1, vl + 1);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, d, row11, vl);
+
+            // dst 2nd row
+            p_src_iter += (stride << 1);
+
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row10, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, b, row11, vl);
+
+            vuint8m1_t row20 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, c, row20, vl);
+
+            vuint8m1_t row21;
+            row21 = __riscv_vslidedown_vx_u8m1(row20, 1, vl + 1);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, d, row21, vl);
+
+            // dst 3rd row
+            p_src_iter += stride;
+
+            vuint16m2_t dst2 = __riscv_vwmulu_vx_u16m2(row20, a, vl);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, b, row21, vl);
+
+            vuint8m1_t row30 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, c, row30, vl);
+
+            vuint8m1_t row31;
+            row31 = __riscv_vslidedown_vx_u8m1(row30, 1, vl + 1);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, d, row31, vl);
+
+            // dst 4th row
+            p_src_iter += stride;
+
+            vuint16m2_t dst3 = __riscv_vwmulu_vx_u16m2(row30, a, vl);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, b, row31, vl);
+
+            vuint8m1_t row40 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, c, row40, vl);
+
+            vuint8m1_t row41;
+            row41 = __riscv_vslidedown_vx_u8m1(row40, 1, vl + 1);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, d, row41, vl);
+
+            vuint8m1_t avg0 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst0, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg1 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst1, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg2 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst2, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg2, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg3 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst3, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg3, vl);
+            p_dst_iter += stride;
+        }
+    }
+    else if (b == 0 && c != 0)
+    {
+        const unsigned short e = b + c;
+
+        for (int j = 0; j < h; j += 4)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row0, a, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, e, row1, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row1, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, e, row2, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst2 = __riscv_vwmulu_vx_u16m2(row2, a, vl);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, e, row3, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row4 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst3 = __riscv_vwmulu_vx_u16m2(row3, a, vl);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, e, row4, vl);
+
+            vuint8m1_t avg0 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst0, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg1 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst1, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg2 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst2, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg2, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg3 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst3, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg3, vl);
+            p_dst_iter += stride;
+        }
+    }
+    else if (b != 0 && c == 0)
+    {
+        const unsigned short e = b + c;
+
+        for (int j = 0; j < h; j += 4)
+        {
+            // 1st
+            vuint8m1_t row00 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row01;
+            row01 = __riscv_vslidedown_vx_u8m1(row00, 1, vl + 1);
+
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row00, a, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, e, row01, vl);
+
+            // 2nd
+            vuint8m1_t row10 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row11;
+            row11 = __riscv_vslidedown_vx_u8m1(row10, 1, vl + 1);
+
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row10, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, e, row11, vl);
+
+            // 3rd
+            vuint8m1_t row20 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row21;
+            row21 = __riscv_vslidedown_vx_u8m1(row20, 1, vl + 1);
+
+            vuint16m2_t dst2 = __riscv_vwmulu_vx_u16m2(row20, a, vl);
+            dst2 = __riscv_vwmaccu_vx_u16m2(dst2, e, row21, vl);
+
+            // 4th
+            vuint8m1_t row30 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row31;
+            row31 = __riscv_vslidedown_vx_u8m1(row30, 1, vl + 1);
+
+            vuint16m2_t dst3 = __riscv_vwmulu_vx_u16m2(row30, a, vl);
+            dst3 = __riscv_vwmaccu_vx_u16m2(dst3, e, row31, vl);
+
+            vuint8m1_t avg0 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst0, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg1 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst1, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg2 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst2, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg2, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg3 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst3, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg3, vl);
+            p_dst_iter += stride;
+        }
+    }
+    else
+    {
+        for (int j = 0; j < h; j += 4)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row0, a, vl);
+
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row1, a, vl);
+
+            vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst2 = __riscv_vwmulu_vx_u16m2(row2, a, vl);
+
+            vuint8m1_t row3 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst3 = __riscv_vwmulu_vx_u16m2(row3, a, vl);
+
+            vuint8m1_t avg0 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst0, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg1 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst1, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg2 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst2, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg2, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg3 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst3, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg3, vl);
+            p_dst_iter += stride;
+        }
+    }
+}
+
+__attribute__((always_inline)) static void h264_avg_chroma_unroll2(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int w, int h, int x, int y)
+{
+    uint8_t *p_dst_iter = p_dst;
+    const uint8_t *p_src_iter = p_src;
+
+    const int xy = x * y;
+    const int x8 = x << 3;
+    const int y8 = y << 3;
+    const int a = 64 - x8 - y8 + xy;
+    const int b = x8 - xy;
+    const int c = y8 - xy;
+    const int d = xy;
+
+    int vl = __riscv_vsetvl_e8m1(w);
+
+    if (d != 0)
+    {
+        for (int j = 0; j < h; j += 2)
+        {
+            // dst 1st row
+            vuint8m1_t row00 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+
+            vuint8m1_t row01;
+            row01 = __riscv_vslidedown_vx_u8m1(row00, 1, vl + 1);
+
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row00, a, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, b, row01, vl);
+
+            vuint8m1_t row10 = __riscv_vle8_v_u8m1(p_src_iter + stride, vl + 1);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, c, row10, vl);
+
+            vuint8m1_t row11;
+            row11 = __riscv_vslidedown_vx_u8m1(row10, 1, vl + 1);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, d, row11, vl);
+
+            // dst 2nd row
+            p_src_iter += (stride << 1);
+
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row10, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, b, row11, vl);
+
+            vuint8m1_t row20 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, c, row20, vl);
+
+            vuint8m1_t row21;
+            row21 = __riscv_vslidedown_vx_u8m1(row20, 1, vl + 1);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, d, row21, vl);
+
+            vuint8m1_t avg0 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst0, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg1 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst1, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+        }
+    }
+    else if (b == 0 && c != 0)
+    {
+        const unsigned short e = b + c;
+
+        for (int j = 0; j < h; j += 2)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row0, a, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, e, row1, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t row2 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row1, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, e, row2, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t avg0 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst0, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg1 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst1, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+        }
+    }
+    else if (b != 0 && c == 0)
+    {
+        const unsigned short e = b + c;
+
+        for (int j = 0; j < h; j += 2)
+        {
+            // 1st
+            vuint8m1_t row00 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row01;
+            row01 = __riscv_vslidedown_vx_u8m1(row00, 1, vl + 1);
+
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row00, a, vl);
+            dst0 = __riscv_vwmaccu_vx_u16m2(dst0, e, row01, vl);
+
+            // 2nd
+            vuint8m1_t row10 = __riscv_vle8_v_u8m1(p_src_iter, vl + 1);
+            p_src_iter += stride;
+
+            vuint8m1_t row11;
+            row11 = __riscv_vslidedown_vx_u8m1(row10, 1, vl + 1);
+
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row10, a, vl);
+            dst1 = __riscv_vwmaccu_vx_u16m2(dst1, e, row11, vl);
+
+            vuint8m1_t avg0 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst0, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg1 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst1, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+        }
+    }
+    else
+    {
+        for (int j = 0; j < h; j += 2)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst0 = __riscv_vwmulu_vx_u16m2(row0, a, vl);
+
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint16m2_t dst1 = __riscv_vwmulu_vx_u16m2(row1, a, vl);
+
+            vuint8m1_t avg0 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst0, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg0, vl);
+            p_dst_iter += stride;
+
+            vuint8m1_t avg1 = __riscv_vaaddu_vv_u8m1(__riscv_vnclipu_wx_u8m1(dst1, 6, vl), __riscv_vle8_v_u8m1(p_dst_iter, vl), vl);
+            __riscv_vse8_v_u8m1(p_dst_iter, avg1, vl);
+            p_dst_iter += stride;
+        }
+    }
+}
+
+void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y)
+{
+    h264_put_chroma_unroll4(p_dst, p_src, stride, 8, h, x, y);
+}
+
+void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y)
+{
+    h264_avg_chroma_unroll4(p_dst, p_src, stride, 8, h, x, y);
+}
+
+void h264_put_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y)
+{
+    if (h >= 4)
+    {
+        h264_put_chroma_unroll4(p_dst, p_src, stride, 4, h, x, y);
+    }
+    else
+    {
+        h264_put_chroma_unroll2(p_dst, p_src, stride, 4, h, x, y);
+    }
+}
+
+void h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y)
+{
+    if (h >= 4)
+    {
+        h264_avg_chroma_unroll4(p_dst, p_src, stride, 4, h, x, y);
+    }
+    else
+    {
+        h264_avg_chroma_unroll2(p_dst, p_src, stride, 4, h, x, y);
+    }
+}
+
+void h264_put_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y)
+{
+    if (h >= 4)
+    {
+        h264_put_chroma_unroll4(p_dst, p_src, stride, 2, h, x, y);
+    }
+    else
+    {
+        h264_put_chroma_unroll2(p_dst, p_src, stride, 2, h, x, y);
+    }
+}
+
+void h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y)
+{
+    if (h >= 4)
+    {
+        h264_avg_chroma_unroll4(p_dst, p_src, stride, 2, h, x, y);
+    }
+    else
+    {
+        h264_avg_chroma_unroll2(p_dst, p_src, stride, 2, h, x, y);
+    }
+}
+#endif
diff --git a/libavcodec/riscv/h264_mc_chroma.h b/libavcodec/riscv/h264_mc_chroma.h
new file mode 100644
index 0000000000..ec9fef6672
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
+#define AVCODEC_RISCV_H264_MC_CHROMA_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
+void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+#endif
+#endif
\ No newline at end of file
-- 
2.17.1



Thread overview: 11+ messages
2023-05-09  9:50 [FFmpeg-devel] [PATCH 0/5] RISC-V: Improve H264 decoding performance using RVV intrinsic Arnie Chang
2023-05-09  9:50 ` [FFmpeg-devel] [PATCH 1/5] configure: Add detection of RISC-V vector intrinsic support Arnie Chang
2023-05-09  9:50 ` Arnie Chang [this message]
2023-05-09  9:50 ` [FFmpeg-devel] [PATCH 3/5] lavc/h264qpel: Add vectorized implementation of luma MC for RISC-V Arnie Chang
2023-05-09  9:50 ` [FFmpeg-devel] [PATCH 4/5] lavc/h264dsp: Add vectorized implementation of DSP functions " Arnie Chang
2023-05-09  9:50 ` [FFmpeg-devel] [PATCH 5/5] lavc/h264pred: Add vectorized implementation of intra prediction " Arnie Chang
2023-05-09 15:47 ` [FFmpeg-devel] [PATCH 0/5] RISC-V: Improve H264 decoding performance using RVV intrinsic Lynne
2023-05-09 16:51 ` Rémi Denis-Courmont
2023-05-10  8:46   ` Arnie Chang
2023-05-10 11:41     ` Lynne
2023-05-10 12:14     ` Rémi Denis-Courmont
