Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
@ 2023-05-04  8:49 Hao Chen
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct Hao Chen
                   ` (5 more replies)
  0 siblings, 6 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-04  8:49 UTC (permalink / raw)
  To: ffmpeg-devel

v1: Add LSX optimizations in avcodec and swscale, since the 2K series CPUs only support LSX; a short sketch of the run-time LSX/LASX selection this relies on follows the patch list.
[PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct.
[PATCH v1 2/6] avcodec/la: Add LSX optimization for loop filter.
[PATCH v1 3/6] avcodec/la: Add LSX optimization for h264 chroma and
[PATCH v1 4/6] avcodec/la: Add LSX optimization for h264 qpel.
[PATCH v1 5/6] swscale/la: Optimize the functions of the swscale
[PATCH v1 6/6] swscale/la: Add following builtin optimized functions
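
All of the new code paths are picked at run time from the CPU flags, so on a 2K series part only the LSX branch is taken, while LASX-capable CPUs still get the wider routines. Below is a minimal sketch of that selection pattern, using the same av_get_cpu_flags()/have_lsx()/have_lasx() helpers the init files in this series use. some_dsp_func and its _c/_lsx/_lasx variants are placeholder names, not symbols from these patches, and the snippet only compiles inside the FFmpeg tree:

#include "libavutil/cpu.h"              /* av_get_cpu_flags() */
#include "libavutil/loongarch/cpu.h"    /* have_lsx(), have_lasx() */

/* Placeholder implementations; the real patches fill H264DSPContext etc. */
static void some_dsp_func_c(void)    { /* plain C fallback */ }
static void some_dsp_func_lsx(void)  { /* 128-bit LSX version */ }
static void some_dsp_func_lasx(void) { /* 256-bit LASX version */ }

static void (*some_dsp_func)(void) = some_dsp_func_c;

static void init_loongarch_dispatch(void)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_lsx(cpu_flags))        /* all that a 2K series CPU reports */
        some_dsp_func = some_dsp_func_lsx;
    if (have_lasx(cpu_flags))       /* CPUs that also implement LASX */
        some_dsp_func = some_dsp_func_lasx;
}

Because the selection is per-function, LSX-only machines fall back to the C reference for anything that still has only a LASX implementation, which is why this series adds LSX versions of the hot paths.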


* [FFmpeg-devel] [PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct.
  2023-05-04  8:49 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
@ 2023-05-04  8:49 ` Hao Chen
  2023-05-09  2:47   ` yinshiyou-hf
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 2/6] avcodec/la: Add LSX optimization for loop filter Hao Chen
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Hao Chen @ 2023-05-04  8:49 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Shiyou Yin

From: Shiyou Yin <yinshiyou-hf@loongson.cn>

loongson_asm.S is a LoongArch assembly optimization helper.
./configure --disable-lasx
Add functions:
  ff_h264_idct_add_8_lsx
  ff_h264_idct8_add_8_lsx
  ff_h264_idct_dc_add_8_lsx
  ff_h264_idct8_dc_add_8_lsx
  ff_h264_luma_dc_dequant_idct_8_lsx
Replaced functions (LSX is enough for these):
  ff_h264_idct_add_lasx
  ff_h264_idct4x4_addblk_dc_lasx
  ff_h264_deq_idct_luma_dc_lasx
Renamed functions:
  ff_h264_idct8_addblk_lasx ==> ff_h264_idct8_add_8_lasx
  ff_h264_idct8_dc_addblk_lasx ==> ff_h264_idct8_dc_add_8_lasx

ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 155fps
after:  161fps
---
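
Not part of the patch, just for review context: a standalone C sketch of the 8-bit 4x4 idct_add that ff_h264_idct_add_8_lsx implements, to make the butterfly sequence in h264idct.S easier to follow. The arithmetic follows the standard H.264 inverse transform (the FUNCC(ff_h264_idct_add) template referenced in the file's comments); clip_u8 and h264_idct_add_4x4_ref are illustrative names only.

#include <stdint.h>
#include <string.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* 4x4 residual IDCT with rounding and >>6, clamped add into dst; the
 * coefficient block is cleared afterwards, as the LSX code also does. */
static void h264_idct_add_4x4_ref(uint8_t *dst, int16_t *block, int stride)
{
    int temp[16];

    block[0] += 1 << 5;                       /* rounding folded into the DC */

    for (int i = 0; i < 4; i++) {             /* first pass, down each column */
        const int z0 =  block[i + 4*0]       +  block[i + 4*2];
        const int z1 =  block[i + 4*0]       -  block[i + 4*2];
        const int z2 = (block[i + 4*1] >> 1) -  block[i + 4*3];
        const int z3 =  block[i + 4*1]       + (block[i + 4*3] >> 1);

        temp[i + 4*0] = z0 + z3;
        temp[i + 4*1] = z1 + z2;
        temp[i + 4*2] = z1 - z2;
        temp[i + 4*3] = z0 - z3;
    }

    for (int i = 0; i < 4; i++) {             /* second pass, across each row */
        const int z0 =  temp[4*i + 0]       +  temp[4*i + 2];
        const int z1 =  temp[4*i + 0]       -  temp[4*i + 2];
        const int z2 = (temp[4*i + 1] >> 1) -  temp[4*i + 3];
        const int z3 =  temp[4*i + 1]       + (temp[4*i + 3] >> 1);

        dst[i + 0*stride] = clip_u8(dst[i + 0*stride] + ((z0 + z3) >> 6));
        dst[i + 1*stride] = clip_u8(dst[i + 1*stride] + ((z1 + z2) >> 6));
        dst[i + 2*stride] = clip_u8(dst[i + 2*stride] + ((z1 - z2) >> 6));
        dst[i + 3*stride] = clip_u8(dst[i + 3*stride] + ((z0 - z3) >> 6));
    }

    memset(block, 0, 16 * sizeof(*block));
}

In the LSX routine the z2/z3 terms correspond to the vsrai.h/vsub.h/vadd.h pairs, LSX_BUTTERFLY_4_H forms the four sums and differences, vsrari.h performs the rounded >>6, and the clamped add into dst is the vsllwil.hu.bu widening followed by the vssrarni.bu.h saturating narrow.
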
 libavcodec/loongarch/Makefile                 |   3 +-
 libavcodec/loongarch/h264_deblock_lasx.c      |   2 +-
 libavcodec/loongarch/h264dsp_init_loongarch.c |  38 +-
 libavcodec/loongarch/h264dsp_lasx.c           |   2 +-
 .../{h264dsp_lasx.h => h264dsp_loongarch.h}   |  60 +-
 libavcodec/loongarch/h264idct.S               | 659 ++++++++++++
 libavcodec/loongarch/h264idct_la.c            | 185 ++++
 libavcodec/loongarch/h264idct_lasx.c          | 498 ---------
 libavcodec/loongarch/loongson_asm.S           | 946 ++++++++++++++++++
 9 files changed, 1850 insertions(+), 543 deletions(-)
 rename libavcodec/loongarch/{h264dsp_lasx.h => h264dsp_loongarch.h} (68%)
 create mode 100644 libavcodec/loongarch/h264idct.S
 create mode 100644 libavcodec/loongarch/h264idct_la.c
 delete mode 100644 libavcodec/loongarch/h264idct_lasx.c
 create mode 100644 libavcodec/loongarch/loongson_asm.S

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index c1b5de5c44..4bf06d903b 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -12,7 +12,6 @@ OBJS-$(CONFIG_HEVC_DECODER)           += loongarch/hevcdsp_init_loongarch.o
 LASX-OBJS-$(CONFIG_H264CHROMA)        += loongarch/h264chroma_lasx.o
 LASX-OBJS-$(CONFIG_H264QPEL)          += loongarch/h264qpel_lasx.o
 LASX-OBJS-$(CONFIG_H264DSP)           += loongarch/h264dsp_lasx.o \
-                                         loongarch/h264idct_lasx.o \
                                          loongarch/h264_deblock_lasx.o
 LASX-OBJS-$(CONFIG_H264PRED)          += loongarch/h264_intrapred_lasx.o
 LASX-OBJS-$(CONFIG_VC1_DECODER)       += loongarch/vc1dsp_lasx.o
@@ -31,3 +30,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_mc_bi_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
                                          loongarch/hevc_mc_uniw_lsx.o
+LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
+                                         loongarch/h264idct_la.o
diff --git a/libavcodec/loongarch/h264_deblock_lasx.c b/libavcodec/loongarch/h264_deblock_lasx.c
index c89bea9a84..eead931dcf 100644
--- a/libavcodec/loongarch/h264_deblock_lasx.c
+++ b/libavcodec/loongarch/h264_deblock_lasx.c
@@ -20,7 +20,7 @@
  */
 
 #include "libavcodec/bit_depth_template.c"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
 #include "libavutil/loongarch/loongson_intrinsics.h"
 
 #define H264_LOOP_FILTER_STRENGTH_ITERATION_LASX(edges, step, mask_mv, dir, \
diff --git a/libavcodec/loongarch/h264dsp_init_loongarch.c b/libavcodec/loongarch/h264dsp_init_loongarch.c
index 37633c3e51..f8616a7db5 100644
--- a/libavcodec/loongarch/h264dsp_init_loongarch.c
+++ b/libavcodec/loongarch/h264dsp_init_loongarch.c
@@ -21,13 +21,32 @@
  */
 
 #include "libavutil/loongarch/cpu.h"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
 
 av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
                                        const int chroma_format_idc)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_lsx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->h264_idct_add     = ff_h264_idct_add_8_lsx;
+            c->h264_idct8_add    = ff_h264_idct8_add_8_lsx;
+            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_lsx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
+
+            if (chroma_format_idc <= 1)
+                c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
+            else
+                c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;
+
+            c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
+            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
+            c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
+        }
+    }
+#if HAVE_LASX
     if (have_lasx(cpu_flags)) {
         if (chroma_format_idc <= 1)
             c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lasx;
@@ -56,20 +75,9 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
             c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
             c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lasx;
 
-            c->h264_idct_add = ff_h264_idct_add_lasx;
-            c->h264_idct8_add = ff_h264_idct8_addblk_lasx;
-            c->h264_idct_dc_add = ff_h264_idct4x4_addblk_dc_lasx;
-            c->h264_idct8_dc_add = ff_h264_idct8_dc_addblk_lasx;
-            c->h264_idct_add16 = ff_h264_idct_add16_lasx;
-            c->h264_idct8_add4 = ff_h264_idct8_add4_lasx;
-
-            if (chroma_format_idc <= 1)
-                c->h264_idct_add8 = ff_h264_idct_add8_lasx;
-            else
-                c->h264_idct_add8 = ff_h264_idct_add8_422_lasx;
-
-            c->h264_idct_add16intra = ff_h264_idct_add16_intra_lasx;
-            c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_lasx;
+            c->h264_idct8_add    = ff_h264_idct8_add_8_lasx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
         }
     }
+#endif // #if HAVE_LASX
 }
diff --git a/libavcodec/loongarch/h264dsp_lasx.c b/libavcodec/loongarch/h264dsp_lasx.c
index 7fd4cedf7e..7b2b8ff0f0 100644
--- a/libavcodec/loongarch/h264dsp_lasx.c
+++ b/libavcodec/loongarch/h264dsp_lasx.c
@@ -23,7 +23,7 @@
  */
 
 #include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
 
 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,   \
                          p1_or_q1_org_in, p2_or_q2_org_in,   \
diff --git a/libavcodec/loongarch/h264dsp_lasx.h b/libavcodec/loongarch/h264dsp_loongarch.h
similarity index 68%
rename from libavcodec/loongarch/h264dsp_lasx.h
rename to libavcodec/loongarch/h264dsp_loongarch.h
index 4cf813750b..28dca2b537 100644
--- a/libavcodec/loongarch/h264dsp_lasx.h
+++ b/libavcodec/loongarch/h264dsp_loongarch.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
  *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
  *
@@ -20,11 +20,34 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
-#define AVCODEC_LOONGARCH_H264DSP_LASX_H
+#ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
 
 #include "libavcodec/h264dec.h"
+#include "config.h"
 
+void ff_h264_idct_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_luma_dc_dequant_idct_8_lsx(int16_t *_output, int16_t *_input, int qmul);
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8]);
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                             int16_t *block, int32_t dst_stride,
+                             const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                                 int16_t *block, int32_t dst_stride,
+                                 const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                                    int16_t *block, int32_t dst_stride,
+                                    const uint8_t nzc[15 * 8]);
+
+#if HAVE_LASX
 void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
                                int alpha, int beta, int8_t *tc0);
 void ff_h264_v_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
@@ -65,33 +88,16 @@ void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
 void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
 
 void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
-void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
-void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
-void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
-                                    int32_t dst_stride);
-void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
+void ff_h264_idct8_add_8_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct8_dc_add_8_lasx(uint8_t *dst, int16_t *src,
                                   int32_t dst_stride);
-void ff_h264_idct_add16_lasx(uint8_t *dst, const int32_t *blk_offset,
-                             int16_t *block, int32_t dst_stride,
-                             const uint8_t nzc[15 * 8]);
-void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
-                             int16_t *block, int32_t dst_stride,
-                             const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add8_lasx(uint8_t **dst, const int32_t *blk_offset,
-                            int16_t *block, int32_t dst_stride,
-                            const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add8_422_lasx(uint8_t **dst, const int32_t *blk_offset,
-                                int16_t *block, int32_t dst_stride,
-                                const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add16_intra_lasx(uint8_t *dst, const int32_t *blk_offset,
-                                   int16_t *block, int32_t dst_stride,
-                                   const uint8_t nzc[15 * 8]);
-void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
-                                   int32_t de_qval);
-
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8]);
 void ff_h264_loop_filter_strength_lasx(int16_t bS[2][4][4], uint8_t nnz[40],
                                        int8_t ref[2][40], int16_t mv[2][40][2],
                                        int bidir, int edges, int step,
                                        int mask_mv0, int mask_mv1, int field);
+#endif // #if HAVE_LASX
 
-#endif  // #ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
+#endif  // #ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264idct.S b/libavcodec/loongarch/h264idct.S
new file mode 100644
index 0000000000..83fde3ed3f
--- /dev/null
+++ b/libavcodec/loongarch/h264idct.S
@@ -0,0 +1,659 @@
+/*
+ * Loongson LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_idct_add_8_lsx
+    fld.d         f0,     a1,    0
+    fld.d         f1,     a1,    8
+    fld.d         f2,     a1,    16
+    fld.d         f3,     a1,    24
+    vxor.v        vr7,    vr7,   vr7
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    vst           vr7,    a1,    0
+    vst           vr7,    a1,    16
+
+    vadd.h        vr4,    vr0,   vr2
+    vsub.h        vr5,    vr0,   vr2
+    vsrai.h       vr6,    vr1,   1
+    vsrai.h       vr7,    vr3,   1
+    vsub.h        vr6,    vr6,   vr3
+    vadd.h        vr7,    vr1,   vr7
+    LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7,  vr0, vr1, vr2, vr3
+    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3,  vr0, vr1, vr2, vr3,  vr4, vr5
+    vadd.h        vr4,    vr0,   vr2
+    vsub.h        vr5,    vr0,   vr2
+    vsrai.h       vr6,    vr1,   1
+    vsrai.h       vr7,    vr3,   1
+    vsub.h        vr6,    vr6,   vr3
+    vadd.h        vr7,    vr1,   vr7
+    LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7,  vr0, vr1, vr2, vr3
+
+    fld.s         f4,     a0,    0
+    fldx.s        f5,     a0,    a2
+    fldx.s        f6,     a0,    t2
+    fldx.s        f7,     a0,    t3
+
+    vsrari.h      vr0,    vr0,   6
+    vsrari.h      vr1,    vr1,   6
+    vsrari.h      vr2,    vr2,   6
+    vsrari.h      vr3,    vr3,   6
+
+    vsllwil.hu.bu vr4,    vr4,   0
+    vsllwil.hu.bu vr5,    vr5,   0
+    vsllwil.hu.bu vr6,    vr6,   0
+    vsllwil.hu.bu vr7,    vr7,   0
+    vadd.h        vr0,    vr0,   vr4
+    vadd.h        vr1,    vr1,   vr5
+    vadd.h        vr2,    vr2,   vr6
+    vadd.h        vr3,    vr3,   vr7
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    fst.s         f1,     a0,    0
+    fstx.s        f0,     a0,    a2
+    fstx.s        f3,     a0,    t2
+    fstx.s        f2,     a0,    t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lsx
+    ld.h          t0,     a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+    addi.w        t0,     t0,    32
+    st.h          t0,     a1,    0
+
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    16
+    vld           vr2,    a1,    32
+    vld           vr3,    a1,    48
+    vld           vr4,    a1,    64
+    vld           vr5,    a1,    80
+    vld           vr6,    a1,    96
+    vld           vr7,    a1,    112
+    vxor.v        vr8,    vr8,   vr8
+    vst           vr8,    a1,    0
+    vst           vr8,    a1,    16
+    vst           vr8,    a1,    32
+    vst           vr8,    a1,    48
+    vst           vr8,    a1,    64
+    vst           vr8,    a1,    80
+    vst           vr8,    a1,    96
+    vst           vr8,    a1,    112
+
+    vadd.h        vr18,   vr0,   vr4
+    vsub.h        vr19,   vr0,   vr4
+    vsrai.h       vr20,   vr2,   1
+    vsrai.h       vr21,   vr6,   1
+    vsub.h        vr20,   vr20,  vr6
+    vadd.h        vr21,   vr21,  vr2
+    LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21,  vr10, vr12, vr14, vr16
+    vsrai.h       vr11,   vr7,   1
+    vsrai.h       vr13,   vr3,   1
+    vsrai.h       vr15,   vr5,   1
+    vsrai.h       vr17,   vr1,   1
+    vsub.h        vr11,   vr5,   vr11
+    vsub.h        vr13,   vr7,   vr13
+    vadd.h        vr15,   vr7,   vr15
+    vadd.h        vr17,   vr5,   vr17
+    vsub.h        vr11,   vr11,  vr7
+    vsub.h        vr13,   vr13,  vr3
+    vadd.h        vr15,   vr15,  vr5
+    vadd.h        vr17,   vr17,  vr1
+    vsub.h        vr11,   vr11,  vr3
+    vadd.h        vr13,   vr13,  vr1
+    vsub.h        vr15,   vr15,  vr1
+    vadd.h        vr17,   vr17,  vr3
+    vsrai.h       vr18,   vr11,  2
+    vsrai.h       vr19,   vr13,  2
+    vsrai.h       vr20,   vr15,  2
+    vsrai.h       vr21,   vr17,  2
+    vadd.h        vr11,   vr11,  vr21
+    vadd.h        vr13,   vr13,  vr20
+    vsub.h        vr15,   vr19,  vr15
+    vsub.h        vr17,   vr17,  vr18
+    LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+                      vr0,  vr3,  vr1,  vr2,  vr5,  vr6,  vr4,  vr7
+
+    LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+    vexth.w.h     vr20,   vr0
+    vexth.w.h     vr21,   vr1
+    vexth.w.h     vr22,   vr2
+    vexth.w.h     vr23,   vr3
+    vexth.w.h     vr8,    vr4
+    vexth.w.h     vr9,    vr5
+    vexth.w.h     vr18,   vr6
+    vexth.w.h     vr19,   vr7
+    vsllwil.w.h   vr0,    vr0,   0
+    vsllwil.w.h   vr1,    vr1,   0
+    vsllwil.w.h   vr2,    vr2,   0
+    vsllwil.w.h   vr3,    vr3,   0
+    vsllwil.w.h   vr4,    vr4,   0
+    vsllwil.w.h   vr5,    vr5,   0
+    vsllwil.w.h   vr6,    vr6,   0
+    vsllwil.w.h   vr7,    vr7,   0
+
+    vadd.w        vr11,   vr0,   vr4
+    vsub.w        vr13,   vr0,   vr4
+    vsrai.w       vr15,   vr2,   1
+    vsrai.w       vr17,   vr6,   1
+    vsub.w        vr15,   vr15,  vr6
+    vadd.w        vr17,   vr17,  vr2
+    LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17,  vr10, vr12, vr14, vr16
+    vsrai.w       vr11,   vr7,   1
+    vsrai.w       vr13,   vr3,   1
+    vsrai.w       vr15,   vr5,   1
+    vsrai.w       vr17,   vr1,   1
+    vsub.w        vr11,   vr5,   vr11
+    vsub.w        vr13,   vr7,   vr13
+    vadd.w        vr15,   vr7,   vr15
+    vadd.w        vr17,   vr5,   vr17
+    vsub.w        vr11,   vr11,  vr7
+    vsub.w        vr13,   vr13,  vr3
+    vadd.w        vr15,   vr15,  vr5
+    vadd.w        vr17,   vr17,  vr1
+    vsub.w        vr11,   vr11,  vr3
+    vadd.w        vr13,   vr13,  vr1
+    vsub.w        vr15,   vr15,  vr1
+    vadd.w        vr17,   vr17,  vr3
+    vsrai.w       vr0,    vr11,  2
+    vsrai.w       vr1,    vr13,  2
+    vsrai.w       vr2,    vr15,  2
+    vsrai.w       vr3,    vr17,  2
+    vadd.w        vr11,   vr11,  vr3
+    vadd.w        vr13,   vr13,  vr2
+    vsub.w        vr15,   vr1,   vr15
+    vsub.w        vr17,   vr17,  vr0
+    LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+                      vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7
+
+    vadd.w        vr11,    vr20,  vr8
+    vsub.w        vr13,    vr20,  vr8
+    vsrai.w       vr15,    vr22,  1
+    vsrai.w       vr17,    vr18,  1
+    vsub.w        vr15,    vr15,  vr18
+    vadd.w        vr17,    vr17,  vr22
+    LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17,  vr10, vr12, vr14, vr16
+    vsrai.w       vr11,   vr19,  1
+    vsrai.w       vr13,   vr23,  1
+    vsrai.w       vr15,   vr9,   1
+    vsrai.w       vr17,   vr21,  1
+    vsub.w        vr11,   vr9,   vr11
+    vsub.w        vr13,   vr19,  vr13
+    vadd.w        vr15,   vr19,  vr15
+    vadd.w        vr17,   vr9,   vr17
+    vsub.w        vr11,   vr11,  vr19
+    vsub.w        vr13,   vr13,  vr23
+    vadd.w        vr15,   vr15,  vr9
+    vadd.w        vr17,   vr17,  vr21
+    vsub.w        vr11,   vr11,  vr23
+    vadd.w        vr13,   vr13,  vr21
+    vsub.w        vr15,   vr15,  vr21
+    vadd.w        vr17,   vr17,  vr23
+    vsrai.w       vr20,   vr11,  2
+    vsrai.w       vr21,   vr13,  2
+    vsrai.w       vr22,   vr15,  2
+    vsrai.w       vr23,   vr17,  2
+    vadd.w        vr11,   vr11,  vr23
+    vadd.w        vr13,   vr13,  vr22
+    vsub.w        vr15,   vr21,  vr15
+    vsub.w        vr17,   vr17,  vr20
+    LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+                      vr20, vr21, vr22, vr23, vr8, vr9, vr18, vr19
+
+    vld           vr10,   a0,    0
+    vldx          vr11,   a0,    a2
+    vldx          vr12,   a0,    t2
+    vldx          vr13,   a0,    t3
+    vldx          vr14,   a0,    t4
+    vldx          vr15,   a0,    t5
+    vldx          vr16,   a0,    t6
+    vldx          vr17,   a0,    t7
+    vsrani.h.w    vr20,   vr0,   6
+    vsrani.h.w    vr21,   vr1,   6
+    vsrani.h.w    vr22,   vr2,   6
+    vsrani.h.w    vr23,   vr3,   6
+    vsrani.h.w    vr8,    vr4,   6
+    vsrani.h.w    vr9,    vr5,   6
+    vsrani.h.w    vr18,   vr6,   6
+    vsrani.h.w    vr19,   vr7,   6
+    vsllwil.hu.bu vr10,   vr10,  0
+    vsllwil.hu.bu vr11,   vr11,  0
+    vsllwil.hu.bu vr12,   vr12,  0
+    vsllwil.hu.bu vr13,   vr13,  0
+    vsllwil.hu.bu vr14,   vr14,  0
+    vsllwil.hu.bu vr15,   vr15,  0
+    vsllwil.hu.bu vr16,   vr16,  0
+    vsllwil.hu.bu vr17,   vr17,  0
+
+    vadd.h        vr0,    vr20,  vr10
+    vadd.h        vr1,    vr21,  vr11
+    vadd.h        vr2,    vr22,  vr12
+    vadd.h        vr3,    vr23,  vr13
+    vadd.h        vr4,    vr8,   vr14
+    vadd.h        vr5,    vr9,   vr15
+    vadd.h        vr6,    vr18,  vr16
+    vadd.h        vr7,    vr19,  vr17
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lasx
+    ld.h          t0,     a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+    addi.w        t0,     t0,    32
+    st.h          t0,     a1,    0
+
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    16
+    vld           vr2,    a1,    32
+    vld           vr3,    a1,    48
+    vld           vr4,    a1,    64
+    vld           vr5,    a1,    80
+    vld           vr6,    a1,    96
+    vld           vr7,    a1,    112
+    xvxor.v       xr8,    xr8,   xr8
+    xvst          xr8,    a1,    0
+    xvst          xr8,    a1,    32
+    xvst          xr8,    a1,    64
+    xvst          xr8,    a1,    96
+
+    vadd.h        vr18,   vr0,   vr4
+    vsub.h        vr19,   vr0,   vr4
+    vsrai.h       vr20,   vr2,   1
+    vsrai.h       vr21,   vr6,   1
+    vsub.h        vr20,   vr20,  vr6
+    vadd.h        vr21,   vr21,  vr2
+    LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21,  vr10, vr12, vr14, vr16
+    vsrai.h       vr11,   vr7,   1
+    vsrai.h       vr13,   vr3,   1
+    vsrai.h       vr15,   vr5,   1
+    vsrai.h       vr17,   vr1,   1
+    vsub.h        vr11,   vr5,   vr11
+    vsub.h        vr13,   vr7,   vr13
+    vadd.h        vr15,   vr7,   vr15
+    vadd.h        vr17,   vr5,   vr17
+    vsub.h        vr11,   vr11,  vr7
+    vsub.h        vr13,   vr13,  vr3
+    vadd.h        vr15,   vr15,  vr5
+    vadd.h        vr17,   vr17,  vr1
+    vsub.h        vr11,   vr11,  vr3
+    vadd.h        vr13,   vr13,  vr1
+    vsub.h        vr15,   vr15,  vr1
+    vadd.h        vr17,   vr17,  vr3
+    vsrai.h       vr18,   vr11,  2
+    vsrai.h       vr19,   vr13,  2
+    vsrai.h       vr20,   vr15,  2
+    vsrai.h       vr21,   vr17,  2
+    vadd.h        vr11,   vr11,  vr21
+    vadd.h        vr13,   vr13,  vr20
+    vsub.h        vr15,   vr19,  vr15
+    vsub.h        vr17,   vr17,  vr18
+    LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+                      vr0,  vr3,  vr1,  vr2,  vr5,  vr6,  vr4,  vr7
+
+    LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+    vext2xv.w.h   xr0,    xr0
+    vext2xv.w.h   xr1,    xr1
+    vext2xv.w.h   xr2,    xr2
+    vext2xv.w.h   xr3,    xr3
+    vext2xv.w.h   xr4,    xr4
+    vext2xv.w.h   xr5,    xr5
+    vext2xv.w.h   xr6,    xr6
+    vext2xv.w.h   xr7,    xr7
+
+    xvadd.w       xr11,   xr0,   xr4
+    xvsub.w       xr13,   xr0,   xr4
+    xvsrai.w      xr15,   xr2,   1
+    xvsrai.w      xr17,   xr6,   1
+    xvsub.w       xr15,   xr15,  xr6
+    xvadd.w       xr17,   xr17,  xr2
+    LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17,  xr10, xr12, xr14, xr16
+    xvsrai.w      xr11,   xr7,   1
+    xvsrai.w      xr13,   xr3,   1
+    xvsrai.w      xr15,   xr5,   1
+    xvsrai.w      xr17,   xr1,   1
+    xvsub.w       xr11,   xr5,   xr11
+    xvsub.w       xr13,   xr7,   xr13
+    xvadd.w       xr15,   xr7,   xr15
+    xvadd.w       xr17,   xr5,   xr17
+    xvsub.w       xr11,   xr11,  xr7
+    xvsub.w       xr13,   xr13,  xr3
+    xvadd.w       xr15,   xr15,  xr5
+    xvadd.w       xr17,   xr17,  xr1
+    xvsub.w       xr11,   xr11,  xr3
+    xvadd.w       xr13,   xr13,  xr1
+    xvsub.w       xr15,   xr15,  xr1
+    xvadd.w       xr17,   xr17,  xr3
+    xvsrai.w      xr0,    xr11,  2
+    xvsrai.w      xr1,    xr13,  2
+    xvsrai.w      xr2,    xr15,  2
+    xvsrai.w      xr3,    xr17,  2
+    xvadd.w       xr11,   xr11,  xr3
+    xvadd.w       xr13,   xr13,  xr2
+    xvsub.w       xr15,   xr1,   xr15
+    xvsub.w       xr17,   xr17,  xr0
+    LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \
+                       xr0,  xr1,  xr2,  xr3,  xr4,  xr5,  xr6,  xr7
+
+    vld           vr10,   a0,    0
+    vldx          vr11,   a0,    a2
+    vldx          vr12,   a0,    t2
+    vldx          vr13,   a0,    t3
+    vldx          vr14,   a0,    t4
+    vldx          vr15,   a0,    t5
+    vldx          vr16,   a0,    t6
+    vldx          vr17,   a0,    t7
+    xvldi         xr8,    0x806     //"xvldi.w xr8 6"
+    xvsran.h.w    xr0,    xr0,   xr8
+    xvsran.h.w    xr1,    xr1,   xr8
+    xvsran.h.w    xr2,    xr2,   xr8
+    xvsran.h.w    xr3,    xr3,   xr8
+    xvsran.h.w    xr4,    xr4,   xr8
+    xvsran.h.w    xr5,    xr5,   xr8
+    xvsran.h.w    xr6,    xr6,   xr8
+    xvsran.h.w    xr7,    xr7,   xr8
+    xvpermi.d     xr0,    xr0,   0x08
+    xvpermi.d     xr1,    xr1,   0x08
+    xvpermi.d     xr2,    xr2,   0x08
+    xvpermi.d     xr3,    xr3,   0x08
+    xvpermi.d     xr4,    xr4,   0x08
+    xvpermi.d     xr5,    xr5,   0x08
+    xvpermi.d     xr6,    xr6,   0x08
+    xvpermi.d     xr7,    xr7,   0x08
+
+    vsllwil.hu.bu vr10,   vr10,  0
+    vsllwil.hu.bu vr11,   vr11,  0
+    vsllwil.hu.bu vr12,   vr12,  0
+    vsllwil.hu.bu vr13,   vr13,  0
+    vsllwil.hu.bu vr14,   vr14,  0
+    vsllwil.hu.bu vr15,   vr15,  0
+    vsllwil.hu.bu vr16,   vr16,  0
+    vsllwil.hu.bu vr17,   vr17,  0
+
+    vadd.h        vr0,    vr0,   vr10
+    vadd.h        vr1,    vr1,   vr11
+    vadd.h        vr2,    vr2,   vr12
+    vadd.h        vr3,    vr3,   vr13
+    vadd.h        vr4,    vr4,   vr14
+    vadd.h        vr5,    vr5,   vr15
+    vadd.h        vr6,    vr6,   vr16
+    vadd.h        vr7,    vr7,   vr17
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_idct_dc_add_8_lsx
+    vldrepl.h     vr4,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    fld.s         f0,     a0,    0
+    fldx.s        f1,     a0,    a2
+    fldx.s        f2,     a0,    t2
+    fldx.s        f3,     a0,    t3
+    st.h          zero,   a1,    0
+
+    vsrari.h      vr4,    vr4,   6
+    vilvl.w       vr0,    vr1,   vr0
+    vilvl.w       vr1,    vr3,   vr2
+    vsllwil.hu.bu vr0,    vr0,   0
+    vsllwil.hu.bu vr1,    vr1,   0
+    vadd.h        vr0,    vr0,   vr4
+    vadd.h        vr1,    vr1,   vr4
+    vssrarni.bu.h vr1,    vr0,   0
+
+    vbsrl.v       vr2,    vr1,   4
+    vbsrl.v       vr3,    vr1,   8
+    vbsrl.v       vr4,    vr1,   12
+    fst.s         f1,     a0,    0
+    fstx.s        f2,     a0,    a2
+    fstx.s        f3,     a0,    t2
+    fstx.s        f4,     a0,    t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_dc_add_8_lsx
+    vldrepl.h     vr8,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+
+    fld.d         f0,     a0,    0
+    fldx.d        f1,     a0,    a2
+    fldx.d        f2,     a0,    t2
+    fldx.d        f3,     a0,    t3
+    fldx.d        f4,     a0,    t4
+    fldx.d        f5,     a0,    t5
+    fldx.d        f6,     a0,    t6
+    fldx.d        f7,     a0,    t7
+    st.h          zero,   a1,    0
+
+    vsrari.h      vr8,    vr8,   6
+    vsllwil.hu.bu vr0,    vr0,   0
+    vsllwil.hu.bu vr1,    vr1,   0
+    vsllwil.hu.bu vr2,    vr2,   0
+    vsllwil.hu.bu vr3,    vr3,   0
+    vsllwil.hu.bu vr4,    vr4,   0
+    vsllwil.hu.bu vr5,    vr5,   0
+    vsllwil.hu.bu vr6,    vr6,   0
+    vsllwil.hu.bu vr7,    vr7,   0
+    vadd.h        vr0,    vr0,   vr8
+    vadd.h        vr1,    vr1,   vr8
+    vadd.h        vr2,    vr2,   vr8
+    vadd.h        vr3,    vr3,   vr8
+    vadd.h        vr4,    vr4,   vr8
+    vadd.h        vr5,    vr5,   vr8
+    vadd.h        vr6,    vr6,   vr8
+    vadd.h        vr7,    vr7,   vr8
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+function ff_h264_idct8_dc_add_8_lasx
+    xvldrepl.h    xr8,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+
+    fld.d         f0,     a0,    0
+    fldx.d        f1,     a0,    a2
+    fldx.d        f2,     a0,    t2
+    fldx.d        f3,     a0,    t3
+    fldx.d        f4,     a0,    t4
+    fldx.d        f5,     a0,    t5
+    fldx.d        f6,     a0,    t6
+    fldx.d        f7,     a0,    t7
+    st.h          zero,   a1,    0
+
+    xvsrari.h     xr8,    xr8,   6
+    xvpermi.q     xr1,    xr0,   0x20
+    xvpermi.q     xr3,    xr2,   0x20
+    xvpermi.q     xr5,    xr4,   0x20
+    xvpermi.q     xr7,    xr6,   0x20
+    xvsllwil.hu.bu xr1,   xr1,   0
+    xvsllwil.hu.bu xr3,   xr3,   0
+    xvsllwil.hu.bu xr5,   xr5,   0
+    xvsllwil.hu.bu xr7,   xr7,   0
+    xvadd.h       xr1,    xr1,   xr8
+    xvadd.h       xr3,    xr3,   xr8
+    xvadd.h       xr5,    xr5,   xr8
+    xvadd.h       xr7,    xr7,   xr8
+
+    xvssrarni.bu.h xr3,   xr1,   0
+    xvssrarni.bu.h xr7,   xr5,   0
+
+    xvpermi.q     xr1,    xr3,   0x11
+    xvpermi.q     xr5,    xr7,   0x11
+    xvbsrl.v      xr0,    xr1,   8
+    xvbsrl.v      xr2,    xr3,   8
+    xvbsrl.v      xr4,    xr5,   8
+    xvbsrl.v      xr6,    xr7,   8
+
+    fst.d         f3,     a0,    0
+    fstx.d        f1,     a0,    a2
+    fstx.d        f2,     a0,    t2
+    fstx.d        f0,     a0,    t3
+    fstx.d        f7,     a0,    t4
+    fstx.d        f5,     a0,    t5
+    fstx.d        f6,     a0,    t6
+    fstx.d        f4,     a0,    t7
+endfunc
+
+/**
+ * IDCT transforms the 16 dc values and dequantizes them.
+ * @param qmul quantization parameter
+ * void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul){
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_luma_dc_dequant_idct_8_lsx
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    8
+    vld           vr2,    a1,    16
+    vld           vr3,    a1,    24
+    vreplgr2vr.w  vr8,    a2
+    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10
+    LSX_BUTTERFLY_4_H  vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1
+    LSX_BUTTERFLY_4_H  vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5
+    LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10
+    LSX_BUTTERFLY_4_H  vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5
+    LSX_BUTTERFLY_4_H  vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+    vsllwil.w.h   vr0,    vr0,   0
+    vsllwil.w.h   vr1,    vr1,   0
+    vsllwil.w.h   vr2,    vr2,   0
+    vsllwil.w.h   vr3,    vr3,   0
+    vmul.w        vr0,    vr0,   vr8
+    vmul.w        vr1,    vr1,   vr8
+    vmul.w        vr2,    vr2,   vr8
+    vmul.w        vr3,    vr3,   vr8
+    vsrarni.h.w   vr1,    vr0,   8
+    vsrarni.h.w   vr3,    vr2,   8
+
+    vstelm.h      vr1,    a0,    0,   0
+    vstelm.h      vr1,    a0,    32,  4
+    vstelm.h      vr1,    a0,    64,  1
+    vstelm.h      vr1,    a0,    96,  5
+    vstelm.h      vr3,    a0,    128, 0
+    vstelm.h      vr3,    a0,    160, 4
+    vstelm.h      vr3,    a0,    192, 1
+    vstelm.h      vr3,    a0,    224, 5
+    addi.d        a0,     a0,    256
+    vstelm.h      vr1,    a0,    0,   2
+    vstelm.h      vr1,    a0,    32,  6
+    vstelm.h      vr1,    a0,    64,  3
+    vstelm.h      vr1,    a0,    96,  7
+    vstelm.h      vr3,    a0,    128, 2
+    vstelm.h      vr3,    a0,    160, 6
+    vstelm.h      vr3,    a0,    192, 3
+    vstelm.h      vr3,    a0,    224, 7
+endfunc
+
diff --git a/libavcodec/loongarch/h264idct_la.c b/libavcodec/loongarch/h264idct_la.c
new file mode 100644
index 0000000000..41e9b1e8bc
--- /dev/null
+++ b/libavcodec/loongarch/h264idct_la.c
@@ -0,0 +1,185 @@
+/*
+ * Loongson LSX/LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_loongarch.h"
+#include "libavcodec/bit_depth_template.c"
+
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        int32_t nnz = nzc[scan8[i]];
+
+        if (nnz == 1 && ((dctcoef *) block)[i * 16]) {
+            ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+        } else if (nnz) {
+            ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8])
+{
+    int32_t cnt;
+
+    for (cnt = 0; cnt < 16; cnt += 4) {
+        int32_t nnz = nzc[scan8[cnt]];
+
+        if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+            ff_h264_idct8_dc_add_8_lsx(dst + blk_offset[cnt],
+                                        block + cnt * 16 * sizeof(pixel),
+                                        dst_stride);
+        } else if (nnz) {
+            ff_h264_idct8_add_8_lsx(dst + blk_offset[cnt],
+                                     block + cnt * 16 * sizeof(pixel),
+                                     dst_stride);
+        }
+    }
+}
+
+#if HAVE_LASX
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8])
+{
+    int32_t cnt;
+
+    for (cnt = 0; cnt < 16; cnt += 4) {
+        int32_t nnz = nzc[scan8[cnt]];
+
+        if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+            ff_h264_idct8_dc_add_8_lasx(dst + blk_offset[cnt],
+                                        block + cnt * 16 * sizeof(pixel),
+                                        dst_stride);
+        } else if (nnz) {
+            ff_h264_idct8_add_8_lasx(dst + blk_offset[cnt],
+                                     block + cnt * 16 * sizeof(pixel),
+                                     dst_stride);
+        }
+    }
+}
+#endif // #if HAVE_LASX
+
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                             int16_t *block, int32_t dst_stride,
+                             const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 16; i < 20; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 32; i < 36; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
+
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                                 int16_t *block, int32_t dst_stride,
+                                 const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 16; i < 20; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 20; i < 24; i++) {
+        if (nzc[scan8[i + 4]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i + 4],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i + 4],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 32; i < 36; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 36; i < 40; i++) {
+        if (nzc[scan8[i + 4]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i + 4],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i + 4],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
+
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                                    int16_t *block, int32_t dst_stride,
+                                    const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel), dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
+
diff --git a/libavcodec/loongarch/h264idct_lasx.c b/libavcodec/loongarch/h264idct_lasx.c
deleted file mode 100644
index 46bd3b74d5..0000000000
--- a/libavcodec/loongarch/h264idct_lasx.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Loongson LASX optimized h264dsp
- *
- * Copyright (c) 2021 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264dsp_lasx.h"
-#include "libavcodec/bit_depth_template.c"
-
-#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)     \
-{                                                                    \
-   __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
-                                                                     \
-    tmp0_m = __lasx_xvadd_h(in0, in2);                               \
-    tmp1_m = __lasx_xvsub_h(in0, in2);                               \
-    tmp2_m = __lasx_xvsrai_h(in1, 1);                                \
-    tmp2_m = __lasx_xvsub_h(tmp2_m, in3);                            \
-    tmp3_m = __lasx_xvsrai_h(in3, 1);                                \
-    tmp3_m = __lasx_xvadd_h(in1, tmp3_m);                            \
-                                                                     \
-    LASX_BUTTERFLY_4_H(tmp0_m, tmp1_m, tmp2_m, tmp3_m,               \
-                       out0, out1, out2, out3);                      \
-}
-
-void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride)
-{
-    __m256i src0_m, src1_m, src2_m, src3_m;
-    __m256i dst0_m, dst1_m;
-    __m256i hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
-    __m256i inp0_m, inp1_m, res0_m, src1, src3;
-    __m256i src0 = __lasx_xvld(src, 0);
-    __m256i src2 = __lasx_xvld(src, 16);
-    __m256i zero = __lasx_xvldi(0);
-    int32_t dst_stride_2x = dst_stride << 1;
-    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-
-    __lasx_xvst(zero, src, 0);
-    DUP2_ARG2(__lasx_xvilvh_d, src0, src0, src2, src2, src1, src3);
-    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
-    LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
-    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, src0_m, src1_m, src2_m, src3_m);
-    DUP4_ARG2(__lasx_xvld, dst, 0, dst + dst_stride, 0, dst + dst_stride_2x,
-              0, dst + dst_stride_3x, 0, src0_m, src1_m, src2_m, src3_m);
-    DUP2_ARG2(__lasx_xvilvl_d, vres1, vres0, vres3, vres2, inp0_m, inp1_m);
-    inp0_m = __lasx_xvpermi_q(inp1_m, inp0_m, 0x20);
-    inp0_m = __lasx_xvsrari_h(inp0_m, 6);
-    DUP2_ARG2(__lasx_xvilvl_w, src1_m, src0_m, src3_m, src2_m, dst0_m, dst1_m);
-    dst0_m = __lasx_xvilvl_d(dst1_m, dst0_m);
-    res0_m = __lasx_vext2xv_hu_bu(dst0_m);
-    res0_m = __lasx_xvadd_h(res0_m, inp0_m);
-    res0_m = __lasx_xvclip255_h(res0_m);
-    dst0_m = __lasx_xvpickev_b(res0_m, res0_m);
-    __lasx_xvstelm_w(dst0_m, dst, 0, 0);
-    __lasx_xvstelm_w(dst0_m, dst + dst_stride, 0, 1);
-    __lasx_xvstelm_w(dst0_m, dst + dst_stride_2x, 0, 4);
-    __lasx_xvstelm_w(dst0_m, dst + dst_stride_3x, 0, 5);
-}
-
-void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src,
-                               int32_t dst_stride)
-{
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
-    __m256i vec0, vec1, vec2, vec3;
-    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    __m256i res0, res1, res2, res3, res4, res5, res6, res7;
-    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-    __m256i zero = __lasx_xvldi(0);
-    int32_t dst_stride_2x = dst_stride << 1;
-    int32_t dst_stride_4x = dst_stride << 2;
-    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-
-    src[0] += 32;
-    DUP4_ARG2(__lasx_xvld, src, 0, src, 16, src, 32, src, 48,
-              src0, src1, src2, src3);
-    DUP4_ARG2(__lasx_xvld, src, 64, src, 80, src, 96, src, 112,
-              src4, src5, src6, src7);
-    __lasx_xvst(zero, src, 0);
-    __lasx_xvst(zero, src, 32);
-    __lasx_xvst(zero, src, 64);
-    __lasx_xvst(zero, src, 96);
-
-    vec0 = __lasx_xvadd_h(src0, src4);
-    vec1 = __lasx_xvsub_h(src0, src4);
-    vec2 = __lasx_xvsrai_h(src2, 1);
-    vec2 = __lasx_xvsub_h(vec2, src6);
-    vec3 = __lasx_xvsrai_h(src6, 1);
-    vec3 = __lasx_xvadd_h(src2, vec3);
-
-    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
-
-    vec0 = __lasx_xvsrai_h(src7, 1);
-    vec0 = __lasx_xvsub_h(src5, vec0);
-    vec0 = __lasx_xvsub_h(vec0, src3);
-    vec0 = __lasx_xvsub_h(vec0, src7);
-
-    vec1 = __lasx_xvsrai_h(src3, 1);
-    vec1 = __lasx_xvsub_h(src1, vec1);
-    vec1 = __lasx_xvadd_h(vec1, src7);
-    vec1 = __lasx_xvsub_h(vec1, src3);
-
-    vec2 = __lasx_xvsrai_h(src5, 1);
-    vec2 = __lasx_xvsub_h(vec2, src1);
-    vec2 = __lasx_xvadd_h(vec2, src7);
-    vec2 = __lasx_xvadd_h(vec2, src5);
-
-    vec3 = __lasx_xvsrai_h(src1, 1);
-    vec3 = __lasx_xvadd_h(src3, vec3);
-    vec3 = __lasx_xvadd_h(vec3, src5);
-    vec3 = __lasx_xvadd_h(vec3, src1);
-
-    tmp4 = __lasx_xvsrai_h(vec3, 2);
-    tmp4 = __lasx_xvadd_h(tmp4, vec0);
-    tmp5 = __lasx_xvsrai_h(vec2, 2);
-    tmp5 = __lasx_xvadd_h(tmp5, vec1);
-    tmp6 = __lasx_xvsrai_h(vec1, 2);
-    tmp6 = __lasx_xvsub_h(tmp6, vec2);
-    tmp7 = __lasx_xvsrai_h(vec0, 2);
-    tmp7 = __lasx_xvsub_h(vec3, tmp7);
-
-    LASX_BUTTERFLY_8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
-                       res0, res1, res2, res3, res4, res5, res6, res7);
-    LASX_TRANSPOSE8x8_H(res0, res1, res2, res3, res4, res5, res6, res7,
-                        res0, res1, res2, res3, res4, res5, res6, res7);
-
-    DUP4_ARG1(__lasx_vext2xv_w_h, res0, res1, res2, res3,
-              tmp0, tmp1, tmp2, tmp3);
-    DUP4_ARG1(__lasx_vext2xv_w_h, res4, res5, res6, res7,
-              tmp4, tmp5, tmp6, tmp7);
-    vec0 = __lasx_xvadd_w(tmp0, tmp4);
-    vec1 = __lasx_xvsub_w(tmp0, tmp4);
-
-    vec2 = __lasx_xvsrai_w(tmp2, 1);
-    vec2 = __lasx_xvsub_w(vec2, tmp6);
-    vec3 = __lasx_xvsrai_w(tmp6, 1);
-    vec3 = __lasx_xvadd_w(vec3, tmp2);
-
-    tmp0 = __lasx_xvadd_w(vec0, vec3);
-    tmp2 = __lasx_xvadd_w(vec1, vec2);
-    tmp4 = __lasx_xvsub_w(vec1, vec2);
-    tmp6 = __lasx_xvsub_w(vec0, vec3);
-
-    vec0 = __lasx_xvsrai_w(tmp7, 1);
-    vec0 = __lasx_xvsub_w(tmp5, vec0);
-    vec0 = __lasx_xvsub_w(vec0, tmp3);
-    vec0 = __lasx_xvsub_w(vec0, tmp7);
-
-    vec1 = __lasx_xvsrai_w(tmp3, 1);
-    vec1 = __lasx_xvsub_w(tmp1, vec1);
-    vec1 = __lasx_xvadd_w(vec1, tmp7);
-    vec1 = __lasx_xvsub_w(vec1, tmp3);
-
-    vec2 = __lasx_xvsrai_w(tmp5, 1);
-    vec2 = __lasx_xvsub_w(vec2, tmp1);
-    vec2 = __lasx_xvadd_w(vec2, tmp7);
-    vec2 = __lasx_xvadd_w(vec2, tmp5);
-
-    vec3 = __lasx_xvsrai_w(tmp1, 1);
-    vec3 = __lasx_xvadd_w(tmp3, vec3);
-    vec3 = __lasx_xvadd_w(vec3, tmp5);
-    vec3 = __lasx_xvadd_w(vec3, tmp1);
-
-    tmp1 = __lasx_xvsrai_w(vec3, 2);
-    tmp1 = __lasx_xvadd_w(tmp1, vec0);
-    tmp3 = __lasx_xvsrai_w(vec2, 2);
-    tmp3 = __lasx_xvadd_w(tmp3, vec1);
-    tmp5 = __lasx_xvsrai_w(vec1, 2);
-    tmp5 = __lasx_xvsub_w(tmp5, vec2);
-    tmp7 = __lasx_xvsrai_w(vec0, 2);
-    tmp7 = __lasx_xvsub_w(vec3, tmp7);
-
-    LASX_BUTTERFLY_4_W(tmp0, tmp2, tmp5, tmp7, res0, res1, res6, res7);
-    LASX_BUTTERFLY_4_W(tmp4, tmp6, tmp1, tmp3, res2, res3, res4, res5);
-
-    DUP4_ARG2(__lasx_xvsrai_w, res0, 6, res1, 6, res2, 6, res3, 6,
-              res0, res1, res2, res3);
-    DUP4_ARG2(__lasx_xvsrai_w, res4, 6, res5, 6, res6, 6, res7, 6,
-              res4, res5, res6, res7);
-    DUP4_ARG2(__lasx_xvpickev_h, res1, res0, res3, res2, res5, res4, res7,
-              res6, res0, res1, res2, res3);
-    DUP4_ARG2(__lasx_xvpermi_d, res0, 0xd8, res1, 0xd8, res2, 0xd8, res3, 0xd8,
-              res0, res1, res2, res3);
-
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, dst0, dst1, dst2, dst3);
-    dst += dst_stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, dst4, dst5, dst6, dst7);
-    dst -= dst_stride_4x;
-    DUP4_ARG2(__lasx_xvilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
-              dst0, dst1, dst2, dst3);
-    DUP4_ARG2(__lasx_xvilvl_b, zero, dst4, zero, dst5, zero, dst6, zero, dst7,
-              dst4, dst5, dst6, dst7);
-    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
-              dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
-    res0 = __lasx_xvadd_h(res0, dst0);
-    res1 = __lasx_xvadd_h(res1, dst1);
-    res2 = __lasx_xvadd_h(res2, dst2);
-    res3 = __lasx_xvadd_h(res3, dst3);
-    DUP4_ARG1(__lasx_xvclip255_h, res0, res1, res2, res3, res0, res1,
-              res2, res3);
-    DUP2_ARG2(__lasx_xvpickev_b, res1, res0, res3, res2, res0, res1);
-    __lasx_xvstelm_d(res0, dst, 0, 0);
-    __lasx_xvstelm_d(res0, dst + dst_stride, 0, 2);
-    __lasx_xvstelm_d(res0, dst + dst_stride_2x, 0, 1);
-    __lasx_xvstelm_d(res0, dst + dst_stride_3x, 0, 3);
-    dst += dst_stride_4x;
-    __lasx_xvstelm_d(res1, dst, 0, 0);
-    __lasx_xvstelm_d(res1, dst + dst_stride, 0, 2);
-    __lasx_xvstelm_d(res1, dst + dst_stride_2x, 0, 1);
-    __lasx_xvstelm_d(res1, dst + dst_stride_3x, 0, 3);
-}
-
-void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
-                                    int32_t dst_stride)
-{
-    const int16_t dc = (src[0] + 32) >> 6;
-    int32_t dst_stride_2x = dst_stride << 1;
-    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-    __m256i pred, out;
-    __m256i src0, src1, src2, src3;
-    __m256i input_dc = __lasx_xvreplgr2vr_h(dc);
-
-    src[0] = 0;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, src0, src1, src2, src3);
-    DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);
-
-    pred = __lasx_xvpermi_q(src0, src1, 0x02);
-    pred = __lasx_xvaddw_h_h_bu(input_dc, pred);
-    pred = __lasx_xvclip255_h(pred);
-    out = __lasx_xvpickev_b(pred, pred);
-    __lasx_xvstelm_w(out, dst, 0, 0);
-    __lasx_xvstelm_w(out, dst + dst_stride, 0, 1);
-    __lasx_xvstelm_w(out, dst + dst_stride_2x, 0, 4);
-    __lasx_xvstelm_w(out, dst + dst_stride_3x, 0, 5);
-}
-
-void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
-                                  int32_t dst_stride)
-{
-    int32_t dc_val;
-    int32_t dst_stride_2x = dst_stride << 1;
-    int32_t dst_stride_4x = dst_stride << 2;
-    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-    __m256i dc;
-
-    dc_val = (src[0] + 32) >> 6;
-    dc = __lasx_xvreplgr2vr_h(dc_val);
-
-    src[0] = 0;
-
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, dst0, dst1, dst2, dst3);
-    dst += dst_stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, dst4, dst5, dst6, dst7);
-    dst -= dst_stride_4x;
-    DUP4_ARG1(__lasx_vext2xv_hu_bu, dst0, dst1, dst2, dst3,
-              dst0, dst1, dst2, dst3);
-    DUP4_ARG1(__lasx_vext2xv_hu_bu, dst4, dst5, dst6, dst7,
-              dst4, dst5, dst6, dst7);
-    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
-              dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
-    dst0 = __lasx_xvadd_h(dst0, dc);
-    dst1 = __lasx_xvadd_h(dst1, dc);
-    dst2 = __lasx_xvadd_h(dst2, dc);
-    dst3 = __lasx_xvadd_h(dst3, dc);
-    DUP4_ARG1(__lasx_xvclip255_h, dst0, dst1, dst2, dst3,
-              dst0, dst1, dst2, dst3);
-    DUP2_ARG2(__lasx_xvpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst + dst_stride, 0, 2);
-    __lasx_xvstelm_d(dst0, dst + dst_stride_2x, 0, 1);
-    __lasx_xvstelm_d(dst0, dst + dst_stride_3x, 0, 3);
-    dst += dst_stride_4x;
-    __lasx_xvstelm_d(dst1, dst, 0, 0);
-    __lasx_xvstelm_d(dst1, dst + dst_stride, 0, 2);
-    __lasx_xvstelm_d(dst1, dst + dst_stride_2x, 0, 1);
-    __lasx_xvstelm_d(dst1, dst + dst_stride_3x, 0, 3);
-}
-
-void ff_h264_idct_add16_lasx(uint8_t *dst,
-                             const int32_t *blk_offset,
-                             int16_t *block, int32_t dst_stride,
-                             const uint8_t nzc[15 * 8])
-{
-    int32_t i;
-
-    for (i = 0; i < 16; i++) {
-        int32_t nnz = nzc[scan8[i]];
-
-        if (nnz) {
-            if (nnz == 1 && ((dctcoef *) block)[i * 16])
-                ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
-                                               block + i * 16 * sizeof(pixel),
-                                               dst_stride);
-            else
-                ff_h264_idct_add_lasx(dst + blk_offset[i],
-                                      block + i * 16 * sizeof(pixel),
-                                      dst_stride);
-        }
-    }
-}
-
-void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
-                             int16_t *block, int32_t dst_stride,
-                             const uint8_t nzc[15 * 8])
-{
-    int32_t cnt;
-
-    for (cnt = 0; cnt < 16; cnt += 4) {
-        int32_t nnz = nzc[scan8[cnt]];
-
-        if (nnz) {
-            if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
-                ff_h264_idct8_dc_addblk_lasx(dst + blk_offset[cnt],
-                                             block + cnt * 16 * sizeof(pixel),
-                                             dst_stride);
-            else
-                ff_h264_idct8_addblk_lasx(dst + blk_offset[cnt],
-                                          block + cnt * 16 * sizeof(pixel),
-                                          dst_stride);
-        }
-    }
-}
-
-
-void ff_h264_idct_add8_lasx(uint8_t **dst,
-                            const int32_t *blk_offset,
-                            int16_t *block, int32_t dst_stride,
-                            const uint8_t nzc[15 * 8])
-{
-    int32_t i;
-
-    for (i = 16; i < 20; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-    for (i = 32; i < 36; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-}
-
-void ff_h264_idct_add8_422_lasx(uint8_t **dst,
-                                const int32_t *blk_offset,
-                                int16_t *block, int32_t dst_stride,
-                                const uint8_t nzc[15 * 8])
-{
-    int32_t i;
-
-    for (i = 16; i < 20; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-    for (i = 32; i < 36; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-    for (i = 20; i < 24; i++) {
-        if (nzc[scan8[i + 4]])
-            ff_h264_idct_add_lasx(dst[0] + blk_offset[i + 4],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i + 4],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-    for (i = 36; i < 40; i++) {
-        if (nzc[scan8[i + 4]])
-            ff_h264_idct_add_lasx(dst[1] + blk_offset[i + 4],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i + 4],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-}
-
-void ff_h264_idct_add16_intra_lasx(uint8_t *dst,
-                                   const int32_t *blk_offset,
-                                   int16_t *block,
-                                   int32_t dst_stride,
-                                   const uint8_t nzc[15 * 8])
-{
-    int32_t i;
-
-    for (i = 0; i < 16; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel), dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-}
-
-void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
-                                   int32_t de_qval)
-{
-#define DC_DEST_STRIDE 16
-
-    __m256i src0, src1, src2, src3;
-    __m256i vec0, vec1, vec2, vec3;
-    __m256i tmp0, tmp1, tmp2, tmp3;
-    __m256i hres0, hres1, hres2, hres3;
-    __m256i vres0, vres1, vres2, vres3;
-    __m256i de_q_vec = __lasx_xvreplgr2vr_w(de_qval);
-
-    DUP4_ARG2(__lasx_xvld, src, 0, src, 8, src, 16, src, 24,
-              src0, src1, src2, src3);
-    LASX_TRANSPOSE4x4_H(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
-    LASX_BUTTERFLY_4_H(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
-    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
-    LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3,
-                        hres0, hres1, hres2, hres3);
-    LASX_BUTTERFLY_4_H(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
-    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
-    DUP4_ARG1(__lasx_vext2xv_w_h, vres0, vres1, vres2, vres3,
-              vres0, vres1, vres2, vres3);
-    DUP2_ARG3(__lasx_xvpermi_q, vres1, vres0, 0x20, vres3, vres2, 0x20,
-              vres0, vres1);
-
-    vres0 = __lasx_xvmul_w(vres0, de_q_vec);
-    vres1 = __lasx_xvmul_w(vres1, de_q_vec);
-
-    vres0 = __lasx_xvsrari_w(vres0, 8);
-    vres1 = __lasx_xvsrari_w(vres1, 8);
-    vec0 = __lasx_xvpickev_h(vres1, vres0);
-    vec0 = __lasx_xvpermi_d(vec0, 0xd8);
-    __lasx_xvstelm_h(vec0, dst + 0  * DC_DEST_STRIDE, 0, 0);
-    __lasx_xvstelm_h(vec0, dst + 2  * DC_DEST_STRIDE, 0, 1);
-    __lasx_xvstelm_h(vec0, dst + 8  * DC_DEST_STRIDE, 0, 2);
-    __lasx_xvstelm_h(vec0, dst + 10 * DC_DEST_STRIDE, 0, 3);
-    __lasx_xvstelm_h(vec0, dst + 1  * DC_DEST_STRIDE, 0, 4);
-    __lasx_xvstelm_h(vec0, dst + 3  * DC_DEST_STRIDE, 0, 5);
-    __lasx_xvstelm_h(vec0, dst + 9  * DC_DEST_STRIDE, 0, 6);
-    __lasx_xvstelm_h(vec0, dst + 11 * DC_DEST_STRIDE, 0, 7);
-    __lasx_xvstelm_h(vec0, dst + 4  * DC_DEST_STRIDE, 0, 8);
-    __lasx_xvstelm_h(vec0, dst + 6  * DC_DEST_STRIDE, 0, 9);
-    __lasx_xvstelm_h(vec0, dst + 12 * DC_DEST_STRIDE, 0, 10);
-    __lasx_xvstelm_h(vec0, dst + 14 * DC_DEST_STRIDE, 0, 11);
-    __lasx_xvstelm_h(vec0, dst + 5  * DC_DEST_STRIDE, 0, 12);
-    __lasx_xvstelm_h(vec0, dst + 7  * DC_DEST_STRIDE, 0, 13);
-    __lasx_xvstelm_h(vec0, dst + 13 * DC_DEST_STRIDE, 0, 14);
-    __lasx_xvstelm_h(vec0, dst + 15 * DC_DEST_STRIDE, 0, 15);
-
-#undef DC_DEST_STRIDE
-}
diff --git a/libavcodec/loongarch/loongson_asm.S b/libavcodec/loongarch/loongson_asm.S
new file mode 100644
index 0000000000..767c7c0bb7
--- /dev/null
+++ b/libavcodec/loongarch/loongson_asm.S
@@ -0,0 +1,946 @@
+/*
+ * Loongson asm helper.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
+ *                Shiyou Yin(yinshiyou-hf@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LML_VERSION_MAJOR 0
+#define LML_VERSION_MINOR 2
+#define LML_VERSION_MICRO 0
+
+/*
+ *============================================================================
+ * Macros for a specific project; set them as needed.
+ * The following LoongML macros are provided for reference.
+ *============================================================================
+ */
+#define ASM_PREF
+#define DEFAULT_ALIGN    5
+
+.macro function name, align=DEFAULT_ALIGN
+.macro endfunc
+    jirl    $r0, $r1, 0x0
+    .size ASM_PREF\name, . - ASM_PREF\name
+    .purgem endfunc
+.endm
+.text ;
+.align \align ;
+.globl ASM_PREF\name ;
+.type  ASM_PREF\name, @function ;
+ASM_PREF\name: ;
+.endm
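+
+/*
+ * Usage sketch (the function name and body are illustrative only):
+ *     function ff_example_lsx
+ *         vld    vr0,    a0,    0
+ *     endfunc
+ * endfunc emits the return (jirl $r0, $r1, 0x0) and closes the symbol size.
+ */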
+
+/**
+ *  Attention: If align is not zero, the macro will use
+ *  t7 until the end of the function.
+ */
+.macro alloc_stack size, align=0
+.if \align
+    .macro clean_stack
+        add.d   sp, sp, t7
+    .endm
+    addi.d  sp, sp, - \size
+    andi.d  t7, sp, \align - 1
+    sub.d   sp, sp, t7
+    addi.d  t7, t7, \size
+.else
+    .macro clean_stack
+        addi.d  sp, sp, \size
+    .endm
+    addi.d  sp, sp, - \size
+.endif
+.endm
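+
+/*
+ * Usage sketch (the size and alignment are illustrative only):
+ *     alloc_stack 64, 16     // reserve 64 bytes, 16-byte aligned
+ *     ...
+ *     clean_stack            // defined by alloc_stack, releases the space
+ * With a non-zero align, t7 holds the adjustment and must not be clobbered.
+ */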
+
+.macro  const name, align=DEFAULT_ALIGN
+    .macro endconst
+    .size  \name, . - \name
+    .purgem endconst
+    .endm
+.section .rodata
+.align   \align
+\name:
+.endm
+
+/*
+ *============================================================================
+ * LoongArch register alias
+ *============================================================================
+ */
+
+#define a0 $a0
+#define a1 $a1
+#define a2 $a2
+#define a3 $a3
+#define a4 $a4
+#define a5 $a5
+#define a6 $a6
+#define a7 $a7
+
+#define t0 $t0
+#define t1 $t1
+#define t2 $t2
+#define t3 $t3
+#define t4 $t4
+#define t5 $t5
+#define t6 $t6
+#define t7 $t7
+#define t8 $t8
+
+#define s0 $s0
+#define s1 $s1
+#define s2 $s2
+#define s3 $s3
+#define s4 $s4
+#define s5 $s5
+#define s6 $s6
+#define s7 $s7
+#define s8 $s8
+
+#define zero $zero
+#define sp   $sp
+#define ra   $ra
+
+#define f0  $f0
+#define f1  $f1
+#define f2  $f2
+#define f3  $f3
+#define f4  $f4
+#define f5  $f5
+#define f6  $f6
+#define f7  $f7
+#define f8  $f8
+#define f9  $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#define vr0 $vr0
+#define vr1 $vr1
+#define vr2 $vr2
+#define vr3 $vr3
+#define vr4 $vr4
+#define vr5 $vr5
+#define vr6 $vr6
+#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
+
+#define xr0 $xr0
+#define xr1 $xr1
+#define xr2 $xr2
+#define xr3 $xr3
+#define xr4 $xr4
+#define xr5 $xr5
+#define xr6 $xr6
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
+
+/*
+ *============================================================================
+ * LSX/LASX synthesize instructions
+ *============================================================================
+ */
+
+/*
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - vj, vk
+ *               Outputs - vd
+ *               Return Type - halfword
+ */
+.macro vdp2.h.bu vd, vj, vk
+    vmulwev.h.bu      \vd,    \vj,    \vk
+    vmaddwod.h.bu     \vd,    \vj,    \vk
+.endm
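+
+/*
+ * Per-element sketch of vdp2.h.bu (the other vdp2/xvdp2 variants pair
+ * even/odd elements the same way at their own widths):
+ *     vd.h[i] = vj.b[2*i] * vk.b[2*i] + vj.b[2*i+1] * vk.b[2*i+1]
+ */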
+
+.macro vdp2.h.bu.b vd, vj, vk
+    vmulwev.h.bu.b    \vd,    \vj,    \vk
+    vmaddwod.h.bu.b   \vd,    \vj,    \vk
+.endm
+
+.macro vdp2.w.h vd, vj, vk
+    vmulwev.w.h       \vd,    \vj,    \vk
+    vmaddwod.w.h      \vd,    \vj,    \vk
+.endm
+
+.macro xvdp2.h.bu xd, xj, xk
+    xvmulwev.h.bu    \xd,    \xj,    \xk
+    xvmaddwod.h.bu   \xd,    \xj,    \xk
+.endm
+
+.macro xvdp2.h.bu.b xd, xj, xk
+    xvmulwev.h.bu.b    \xd,  \xj,    \xk
+    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
+.endm
+
+.macro xvdp2.w.h xd, xj, xk
+    xvmulwev.w.h       \xd,  \xj,    \xk
+    xvmaddwod.w.h      \xd,  \xj,    \xk
+.endm
+
+/*
+ * Description : Dot product & accumulate of vector elements
+ * Arguments   : Inputs  - vj, vk
+ *               Outputs - vd (accumulated in place)
+ *               Return Type - twice the size of the input elements
+ */
+.macro vdp2add.h.bu vd, vj, vk
+    vmaddwev.h.bu     \vd,    \vj,    \vk
+    vmaddwod.h.bu     \vd,    \vj,    \vk
+.endm
+
+.macro vdp2add.h.bu.b vd, vj, vk
+    vmaddwev.h.bu.b   \vd,    \vj,    \vk
+    vmaddwod.h.bu.b   \vd,    \vj,    \vk
+.endm
+
+.macro vdp2add.w.h vd, vj, vk
+    vmaddwev.w.h      \vd,    \vj,    \vk
+    vmaddwod.w.h      \vd,    \vj,    \vk
+.endm
+
+.macro xvdp2add.h.bu.b xd, xj, xk
+    xvmaddwev.h.bu.b   \xd,  \xj,    \xk
+    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
+.endm
+
+.macro xvdp2add.w.h xd, xj, xk
+    xvmaddwev.w.h      \xd,  \xj,    \xk
+    xvmaddwod.w.h      \xd,  \xj,    \xk
+.endm
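+
+/*
+ * Per-element sketch of vdp2add.w.h; unlike vdp2, the dp2add variants
+ * accumulate into vd instead of overwriting it:
+ *     vd.w[i] += vj.h[2*i] * vk.h[2*i] + vj.h[2*i+1] * vk.h[2*i+1]
+ */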
+
+/*
+ * Description : Clamp each element of a vector to a range
+ * clip: clamp vj to [vk, va], i.e. min(max(vj, vk), va)
+ * clip255: clamp vj to [0, 255]
+ */
+.macro vclip.h  vd,  vj, vk, va
+    vmax.h    \vd,  \vj,   \vk
+    vmin.h    \vd,  \vd,   \va
+.endm
+
+.macro vclip255.w  vd, vj
+    vmaxi.w   \vd,   \vj,  0
+    vsat.wu   \vd,   \vd,  7
+.endm
+
+.macro vclip255.h  vd, vj
+    vmaxi.h   \vd,   \vj,  0
+    vsat.hu   \vd,   \vd,  7
+.endm
+
+.macro xvclip.h  xd,  xj, xk, xa
+    xvmax.h    \xd,  \xj,   \xk
+    xvmin.h    \xd,  \xd,   \xa
+.endm
+
+.macro xvclip255.h  xd, xj
+    xvmaxi.h   \xd,   \xj,  0
+    xvsat.hu   \xd,   \xd,  7
+.endm
+
+.macro xvclip255.w  xd, xj
+    xvmaxi.w   \xd,   \xj,  0
+    xvsat.wu   \xd,   \xd,  7
+.endm
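+
+/*
+ * Sketch of the clip255 construction: the max-with-0 removes negative
+ * values and the unsigned saturate keeps at most 8 significant bits,
+ * so e.g. vclip255.h maps -3 to 0 and 300 to 255.
+ */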
+
+/*
+ * Description : Store elements of vector
+ * vd : Data vector to be stored
+ * rk : Address of data storage
+ * ra : Offset of address
+ * si : Index of data in vd
+ */
+.macro vstelmx.b vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.b   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.h vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.h   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.w vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.w   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.d  vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.d   \vd,  \rk,  0, \si
+.endm
+
+.macro vmov xd, xj
+    vor.v  \xd,  \xj,  \xj
+.endm
+
+.macro xmov xd, xj
+    xvor.v  \xd,  \xj,  \xj
+.endm
+
+.macro xvstelmx.d  xd, rk, ra, si
+    add.d      \rk, \rk,  \ra
+    xvstelm.d  \xd, \rk,  0, \si
+.endm
+
+/*
+ *============================================================================
+ * LSX/LASX custom macros
+ *============================================================================
+ */
+
+/*
+ * Load 4 float, double, V128, or V256 elements with stride.
+ */
+.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    fld.s     \out0,    \src,    0
+    fldx.s    \out1,    \src,    \stride
+    fldx.s    \out2,    \src,    \stride2
+    fldx.s    \out3,    \src,    \stride3
+.endm
+
+.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    fld.d     \out0,    \src,    0
+    fldx.d    \out1,    \src,    \stride
+    fldx.d    \out2,    \src,    \stride2
+    fldx.d    \out3,    \src,    \stride3
+.endm
+
+.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    vld     \out0,    \src,    0
+    vldx    \out1,    \src,    \stride
+    vldx    \out2,    \src,    \stride2
+    vldx    \out3,    \src,    \stride3
+.endm
+
+.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    xvld    \out0,    \src,    0
+    xvldx   \out1,    \src,    \stride
+    xvldx   \out2,    \src,    \stride2
+    xvldx   \out3,    \src,    \stride3
+.endm
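+
+/*
+ * Usage sketch (register names are illustrative only): load 4 rows of
+ * 16 bytes from a0 with stride a1, given t0 = 2 * a1 and t1 = 3 * a1:
+ *     LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3
+ */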
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                          tmp0, tmp1
+    vilvl.h   \tmp0,  \in1,   \in0
+    vilvl.h   \tmp1,  \in3,   \in2
+    vilvl.w   \out0,  \tmp1,  \tmp0
+    vilvh.w   \out2,  \tmp1,  \tmp0
+    vilvh.d   \out1,  \out0,  \out0
+    vilvh.d   \out3,  \out0,  \out2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4            1, 5, 9,13
+ *               5, 6, 7, 8    to      2, 6,10,14
+ *               9,10,11,12  =====>    3, 7,11,15
+ *              13,14,15,16            4, 8,12,16
+ */
+.macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                          _tmp0, _tmp1
+
+    vilvl.w    \_tmp0,   \_in1,    \_in0
+    vilvh.w    \_out1,   \_in1,    \_in0
+    vilvl.w    \_tmp1,   \_in3,    \_in2
+    vilvh.w    \_out3,   \_in3,    \_in2
+
+    vilvl.d    \_out0,   \_tmp1,   \_tmp0
+    vilvl.d    \_out2,   \_out3,   \_out1
+    vilvh.d    \_out3,   \_out3,   \_out1
+    vilvh.d    \_out1,   \_tmp1,   \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
+                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
+                          tmp3, tmp4, tmp5, tmp6, tmp7
+    vilvl.h      \tmp0,    \in6,   \in4
+    vilvl.h      \tmp1,    \in7,   \in5
+    vilvl.h      \tmp2,    \in2,   \in0
+    vilvl.h      \tmp3,    \in3,   \in1
+
+    vilvl.h      \tmp4,    \tmp1,  \tmp0
+    vilvh.h      \tmp5,    \tmp1,  \tmp0
+    vilvl.h      \tmp6,    \tmp3,  \tmp2
+    vilvh.h      \tmp7,    \tmp3,  \tmp2
+
+    vilvh.h      \tmp0,    \in6,   \in4
+    vilvh.h      \tmp1,    \in7,   \in5
+    vilvh.h      \tmp2,    \in2,   \in0
+    vilvh.h      \tmp3,    \in3,   \in1
+
+    vpickev.d    \out0,    \tmp4,  \tmp6
+    vpickod.d    \out1,    \tmp4,  \tmp6
+    vpickev.d    \out2,    \tmp5,  \tmp7
+    vpickod.d    \out3,    \tmp5,  \tmp7
+
+    vilvl.h      \tmp4,    \tmp1,  \tmp0
+    vilvh.h      \tmp5,    \tmp1,  \tmp0
+    vilvl.h      \tmp6,    \tmp3,  \tmp2
+    vilvh.h      \tmp7,    \tmp3,  \tmp2
+
+    vpickev.d    \out4,    \tmp4,  \tmp6
+    vpickod.d    \out5,    \tmp4,  \tmp6
+    vpickev.d    \out6,    \tmp5,  \tmp7
+    vpickod.d    \out7,    \tmp5,  \tmp7
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3, out4, out5, out6, out7,\
+                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    xvilvl.b   \tmp0,    \in2,     \in0
+    xvilvl.b   \tmp1,    \in3,     \in1
+    xvilvl.b   \tmp2,    \in6,     \in4
+    xvilvl.b   \tmp3,    \in7,     \in5
+    xvilvl.b   \tmp4,    \in10,    \in8
+    xvilvl.b   \tmp5,    \in11,    \in9
+    xvilvl.b   \tmp6,    \in14,    \in12
+    xvilvl.b   \tmp7,    \in15,    \in13
+    xvilvl.b   \out0,    \tmp1,    \tmp0
+    xvilvh.b   \out1,    \tmp1,    \tmp0
+    xvilvl.b   \out2,    \tmp3,    \tmp2
+    xvilvh.b   \out3,    \tmp3,    \tmp2
+    xvilvl.b   \out4,    \tmp5,    \tmp4
+    xvilvh.b   \out5,    \tmp5,    \tmp4
+    xvilvl.b   \out6,    \tmp7,    \tmp6
+    xvilvh.b   \out7,    \tmp7,    \tmp6
+    xvilvl.w   \tmp0,    \out2,    \out0
+    xvilvh.w   \tmp2,    \out2,    \out0
+    xvilvl.w   \tmp4,    \out3,    \out1
+    xvilvh.w   \tmp6,    \out3,    \out1
+    xvilvl.w   \tmp1,    \out6,    \out4
+    xvilvh.w   \tmp3,    \out6,    \out4
+    xvilvl.w   \tmp5,    \out7,    \out5
+    xvilvh.w   \tmp7,    \out7,    \out5
+    xvilvl.d   \out0,    \tmp1,    \tmp0
+    xvilvh.d   \out1,    \tmp1,    \tmp0
+    xvilvl.d   \out2,    \tmp3,    \tmp2
+    xvilvh.d   \out3,    \tmp3,    \tmp2
+    xvilvl.d   \out4,    \tmp5,    \tmp4
+    xvilvh.d   \out5,    \tmp5,    \tmp4
+    xvilvl.d   \out6,    \tmp7,    \tmp6
+    xvilvh.d   \out7,    \tmp7,    \tmp6
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
+                           in8, in9, in10, in11, in12, in13, in14, in15,  \
+                           out0, out1, out2, out3, out4, out5, out6, out7,\
+                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    vilvl.b   \tmp0,    \in2,     \in0
+    vilvl.b   \tmp1,    \in3,     \in1
+    vilvl.b   \tmp2,    \in6,     \in4
+    vilvl.b   \tmp3,    \in7,     \in5
+    vilvl.b   \tmp4,    \in10,    \in8
+    vilvl.b   \tmp5,    \in11,    \in9
+    vilvl.b   \tmp6,    \in14,    \in12
+    vilvl.b   \tmp7,    \in15,    \in13
+
+    vilvl.b   \out0,    \tmp1,    \tmp0
+    vilvh.b   \out1,    \tmp1,    \tmp0
+    vilvl.b   \out2,    \tmp3,    \tmp2
+    vilvh.b   \out3,    \tmp3,    \tmp2
+    vilvl.b   \out4,    \tmp5,    \tmp4
+    vilvh.b   \out5,    \tmp5,    \tmp4
+    vilvl.b   \out6,    \tmp7,    \tmp6
+    vilvh.b   \out7,    \tmp7,    \tmp6
+    vilvl.w   \tmp0,    \out2,    \out0
+    vilvh.w   \tmp2,    \out2,    \out0
+    vilvl.w   \tmp4,    \out3,    \out1
+    vilvh.w   \tmp6,    \out3,    \out1
+    vilvl.w   \tmp1,    \out6,    \out4
+    vilvh.w   \tmp3,    \out6,    \out4
+    vilvl.w   \tmp5,    \out7,    \out5
+    vilvh.w   \tmp7,    \out7,    \out5
+    vilvl.d   \out0,    \tmp1,    \tmp0
+    vilvh.d   \out1,    \tmp1,    \tmp0
+    vilvl.d   \out2,    \tmp3,    \tmp2
+    vilvh.d   \out3,    \tmp3,    \tmp2
+    vilvl.d   \out4,    \tmp5,    \tmp4
+    vilvh.d   \out5,    \tmp5,    \tmp4
+    vilvl.d   \out6,    \tmp7,    \tmp6
+    vilvh.d   \out7,    \tmp7,    \tmp6
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                           tmp0, tmp1
+    xvilvl.h   \tmp0,  \in1,   \in0
+    xvilvl.h   \tmp1,  \in3,   \in2
+    xvilvl.w   \out0,  \tmp1,  \tmp0
+    xvilvh.w   \out2,  \tmp1,  \tmp0
+    xvilvh.d   \out1,  \out0,  \out0
+    xvilvh.d   \out3,  \out0,  \out2
+.endm
+
+/*
+ * Description : Transpose 4x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                           tmp0, tmp1
+    xvilvl.h      \tmp0,    \in2,   \in0
+    xvilvl.h      \tmp1,    \in3,   \in1
+    xvilvl.h      \out2,    \tmp1,  \tmp0
+    xvilvh.h      \out3,    \tmp1,  \tmp0
+
+    xvilvl.d      \out0,    \out2,  \out2
+    xvilvh.d      \out1,    \out2,  \out2
+    xvilvl.d      \out2,    \out3,  \out3
+    xvilvh.d      \out3,    \out3,  \out3
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7,         \
+                           out0, out1, out2, out3, out4, out5, out6, out7, \
+                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    xvilvl.h     \tmp0,   \in6,     \in4
+    xvilvl.h     \tmp1,   \in7,     \in5
+    xvilvl.h     \tmp2,   \in2,     \in0
+    xvilvl.h     \tmp3,   \in3,     \in1
+
+    xvilvl.h     \tmp4,   \tmp1,    \tmp0
+    xvilvh.h     \tmp5,   \tmp1,    \tmp0
+    xvilvl.h     \tmp6,   \tmp3,    \tmp2
+    xvilvh.h     \tmp7,   \tmp3,    \tmp2
+
+    xvilvh.h     \tmp0,   \in6,     \in4
+    xvilvh.h     \tmp1,   \in7,     \in5
+    xvilvh.h     \tmp2,   \in2,     \in0
+    xvilvh.h     \tmp3,   \in3,     \in1
+
+    xvpickev.d   \out0,   \tmp4,    \tmp6
+    xvpickod.d   \out1,   \tmp4,    \tmp6
+    xvpickev.d   \out2,   \tmp5,    \tmp7
+    xvpickod.d   \out3,   \tmp5,    \tmp7
+
+    xvilvl.h     \tmp4,   \tmp1,    \tmp0
+    xvilvh.h     \tmp5,   \tmp1,    \tmp0
+    xvilvl.h     \tmp6,   \tmp3,    \tmp2
+    xvilvh.h     \tmp7,   \tmp3,    \tmp2
+
+    xvpickev.d   \out4,   \tmp4,    \tmp6
+    xvpickod.d   \out5,   \tmp4,    \tmp6
+    xvpickev.d   \out6,   \tmp5,    \tmp7
+    xvpickod.d   \out7,   \tmp5,    \tmp7
+.endm
+
+/*
+ * Description : Transpose 2x4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                             tmp0, tmp1, tmp2
+    xvilvh.h   \tmp1,    \in0,     \in1
+    xvilvl.h   \out1,    \in0,     \in1
+    xvilvh.h   \tmp0,    \in2,     \in3
+    xvilvl.h   \out3,    \in2,     \in3
+
+    xvilvh.w   \tmp2,    \out3,    \out1
+    xvilvl.w   \out3,    \out3,    \out1
+
+    xvilvl.w   \out2,    \tmp0,    \tmp1
+    xvilvh.w   \tmp1,    \tmp0,    \tmp1
+
+    xvilvh.d   \out0,    \out2,    \out3
+    xvilvl.d   \out2,    \out2,    \out3
+    xvilvh.d   \out1,    \tmp1,    \tmp2
+    xvilvl.d   \out3,    \tmp1,    \tmp2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4,  1, 2, 3, 4        1,5, 9,13, 1,5, 9,13
+ *               5, 6, 7, 8,  5, 6, 7, 8   to   2,6,10,14, 2,6,10,14
+ *               9,10,11,12,  9,10,11,12 =====> 3,7,11,15, 3,7,11,15
+ *              13,14,15,16, 13,14,15,16        4,8,12,16, 4,8,12,16
+ */
+.macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                           _tmp0, _tmp1
+
+    xvilvl.w    \_tmp0,   \_in1,    \_in0
+    xvilvh.w    \_out1,   \_in1,    \_in0
+    xvilvl.w    \_tmp1,   \_in3,    \_in2
+    xvilvh.w    \_out3,   \_in3,    \_in2
+
+    xvilvl.d    \_out0,   \_tmp1,   \_tmp0
+    xvilvl.d    \_out2,   \_out3,   \_out1
+    xvilvh.d    \_out3,   \_out3,   \_out1
+    xvilvh.d    \_out1,   \_tmp1,   \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Example     : LASX_TRANSPOSE8x8_W
+ *        _in0 : 1,2,3,4,5,6,7,8
+ *        _in1 : 2,2,3,4,5,6,7,8
+ *        _in2 : 3,2,3,4,5,6,7,8
+ *        _in3 : 4,2,3,4,5,6,7,8
+ *        _in4 : 5,2,3,4,5,6,7,8
+ *        _in5 : 6,2,3,4,5,6,7,8
+ *        _in6 : 7,2,3,4,5,6,7,8
+ *        _in7 : 8,2,3,4,5,6,7,8
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8
+ *       _out1 : 2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8
+ */
+.macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,\
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7,\
+                           _tmp0, _tmp1, _tmp2, _tmp3
+    xvilvl.w    \_tmp0,   \_in2,    \_in0
+    xvilvl.w    \_tmp1,   \_in3,    \_in1
+    xvilvh.w    \_tmp2,   \_in2,    \_in0
+    xvilvh.w    \_tmp3,   \_in3,    \_in1
+    xvilvl.w    \_out0,   \_tmp1,   \_tmp0
+    xvilvh.w    \_out1,   \_tmp1,   \_tmp0
+    xvilvl.w    \_out2,   \_tmp3,   \_tmp2
+    xvilvh.w    \_out3,   \_tmp3,   \_tmp2
+
+    xvilvl.w    \_tmp0,   \_in6,    \_in4
+    xvilvl.w    \_tmp1,   \_in7,    \_in5
+    xvilvh.w    \_tmp2,   \_in6,    \_in4
+    xvilvh.w    \_tmp3,   \_in7,    \_in5
+    xvilvl.w    \_out4,   \_tmp1,   \_tmp0
+    xvilvh.w    \_out5,   \_tmp1,   \_tmp0
+    xvilvl.w    \_out6,   \_tmp3,   \_tmp2
+    xvilvh.w    \_out7,   \_tmp3,   \_tmp2
+
+    xmov        \_tmp0,   \_out0
+    xmov        \_tmp1,   \_out1
+    xmov        \_tmp2,   \_out2
+    xmov        \_tmp3,   \_out3
+    xvpermi.q   \_out0,   \_out4,   0x02
+    xvpermi.q   \_out1,   \_out5,   0x02
+    xvpermi.q   \_out2,   \_out6,   0x02
+    xvpermi.q   \_out3,   \_out7,   0x02
+    xvpermi.q   \_out4,   \_tmp0,   0x31
+    xvpermi.q   \_out5,   \_tmp1,   0x31
+    xvpermi.q   \_out6,   \_tmp2,   0x31
+    xvpermi.q   \_out7,   \_tmp3,   0x31
+.endm
+
+/*
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Example     : LASX_TRANSPOSE4x4_D
+ *        _in0 : 1,2,3,4
+ *        _in1 : 1,2,3,4
+ *        _in2 : 1,2,3,4
+ *        _in3 : 1,2,3,4
+ *
+ *       _out0 : 1,1,1,1
+ *       _out1 : 2,2,2,2
+ *       _out2 : 3,3,3,3
+ *       _out3 : 4,4,4,4
+ */
+.macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                           _tmp0, _tmp1
+    xvilvl.d    \_tmp0,   \_in1,    \_in0
+    xvilvh.d    \_out1,   \_in1,    \_in0
+    xvilvh.d    \_tmp1,   \_in3,    \_in2
+    xvilvl.d    \_out2,   \_in3,    \_in2
+
+    xvor.v      \_out0,   \_tmp0,   \_tmp0
+    xvor.v      \_out3,   \_tmp1,   \_tmp1
+
+    xvpermi.q   \_out0,   \_out2,   0x02
+    xvpermi.q   \_out2,   \_tmp0,   0x31
+    xvpermi.q   \_out3,   \_out1,   0x31
+    xvpermi.q   \_out1,   \_tmp1,   0x02
+.endm
+
+/*
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Details     : Butterfly operation
+ * Example     : LSX_BUTTERFLY_4
+ *               _out0 = _in0 + _in3;
+ *               _out1 = _in1 + _in2;
+ *               _out2 = _in1 - _in2;
+ *               _out3 = _in0 - _in3;
+ */
+.macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.b   \_out0,   \_in0,   \_in3
+    vadd.b   \_out1,   \_in1,   \_in2
+    vsub.b   \_out2,   \_in1,   \_in2
+    vsub.b   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.h   \_out0,   \_in0,   \_in3
+    vadd.h   \_out1,   \_in1,   \_in2
+    vsub.h   \_out2,   \_in1,   \_in2
+    vsub.h   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.w   \_out0,   \_in0,   \_in3
+    vadd.w   \_out1,   \_in1,   \_in2
+    vsub.w   \_out2,   \_in1,   \_in2
+    vsub.w   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.d   \_out0,   \_in0,   \_in3
+    vadd.d   \_out1,   \_in1,   \_in2
+    vsub.d   \_out2,   \_in1,   \_in2
+    vsub.d   \_out3,   \_in0,   \_in3
+.endm
+
+.macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.b   \_out0,   \_in0,   \_in3
+    xvadd.b   \_out1,   \_in1,   \_in2
+    xvsub.b   \_out2,   \_in1,   \_in2
+    xvsub.b   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.h   \_out0,   \_in0,   \_in3
+    xvadd.h   \_out1,   \_in1,   \_in2
+    xvsub.h   \_out2,   \_in1,   \_in2
+    xvsub.h   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.w   \_out0,   \_in0,   \_in3
+    xvadd.w   \_out1,   \_in1,   \_in2
+    xvsub.w   \_out2,   \_in1,   \_in2
+    xvsub.w   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.d   \_out0,   \_in0,   \_in3
+    xvadd.d   \_out1,   \_in1,   \_in2
+    xvsub.d   \_out2,   \_in1,   \_in2
+    xvsub.d   \_out3,   \_in0,   \_in3
+.endm
+
+/*
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_8
+ *               _out0 = _in0 + _in7;
+ *               _out1 = _in1 + _in6;
+ *               _out2 = _in2 + _in5;
+ *               _out3 = _in3 + _in4;
+ *               _out4 = _in3 - _in4;
+ *               _out5 = _in2 - _in5;
+ *               _out6 = _in1 - _in6;
+ *               _out7 = _in0 - _in7;
+ */
+.macro LSX_BUTTERFLY_8_B _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.b    \_out0,    \_in0,    \_in7
+    vadd.b    \_out1,    \_in1,    \_in6
+    vadd.b    \_out2,    \_in2,    \_in5
+    vadd.b    \_out3,    \_in3,    \_in4
+    vsub.b    \_out4,    \_in3,    \_in4
+    vsub.b    \_out5,    \_in2,    \_in5
+    vsub.b    \_out6,    \_in1,    \_in6
+    vsub.b    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_H _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.h    \_out0,    \_in0,    \_in7
+    vadd.h    \_out1,    \_in1,    \_in6
+    vadd.h    \_out2,    \_in2,    \_in5
+    vadd.h    \_out3,    \_in3,    \_in4
+    vsub.h    \_out4,    \_in3,    \_in4
+    vsub.h    \_out5,    \_in2,    \_in5
+    vsub.h    \_out6,    \_in1,    \_in6
+    vsub.h    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_W _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.w    \_out0,    \_in0,    \_in7
+    vadd.w    \_out1,    \_in1,    \_in6
+    vadd.w    \_out2,    \_in2,    \_in5
+    vadd.w    \_out3,    \_in3,    \_in4
+    vsub.w    \_out4,    \_in3,    \_in4
+    vsub.w    \_out5,    \_in2,    \_in5
+    vsub.w    \_out6,    \_in1,    \_in6
+    vsub.w    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_D _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.d    \_out0,    \_in0,    \_in7
+    vadd.d    \_out1,    \_in1,    \_in6
+    vadd.d    \_out2,    \_in2,    \_in5
+    vadd.d    \_out3,    \_in3,    \_in4
+    vsub.d    \_out4,    \_in3,    \_in4
+    vsub.d    \_out5,    \_in2,    \_in5
+    vsub.d    \_out6,    \_in1,    \_in6
+    vsub.d    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_B _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.b    \_out0,    \_in0,    \_in7
+    xvadd.b    \_out1,    \_in1,    \_in6
+    xvadd.b    \_out2,    \_in2,    \_in5
+    xvadd.b    \_out3,    \_in3,    \_in4
+    xvsub.b    \_out4,    \_in3,    \_in4
+    xvsub.b    \_out5,    \_in2,    \_in5
+    xvsub.b    \_out6,    \_in1,    \_in6
+    xvsub.b    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_H _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.h    \_out0,    \_in0,    \_in7
+    xvadd.h    \_out1,    \_in1,    \_in6
+    xvadd.h    \_out2,    \_in2,    \_in5
+    xvadd.h    \_out3,    \_in3,    \_in4
+    xvsub.h    \_out4,    \_in3,    \_in4
+    xvsub.h    \_out5,    \_in2,    \_in5
+    xvsub.h    \_out6,    \_in1,    \_in6
+    xvsub.h    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_W _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.w    \_out0,    \_in0,    \_in7
+    xvadd.w    \_out1,    \_in1,    \_in6
+    xvadd.w    \_out2,    \_in2,    \_in5
+    xvadd.w    \_out3,    \_in3,    \_in4
+    xvsub.w    \_out4,    \_in3,    \_in4
+    xvsub.w    \_out5,    \_in2,    \_in5
+    xvsub.w    \_out6,    \_in1,    \_in6
+    xvsub.w    \_out7,    \_in0,    \_in7
+.endm
+
-- 
2.20.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] [PATCH v1 2/6] avcodec/la: Add LSX optimization for loop filter.
  2023-05-04  8:49 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct Hao Chen
@ 2023-05-04  8:49 ` Hao Chen
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 3/6] avcodec/la: Add LSX optimization for h264 chroma and intrapred Hao Chen
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-04  8:49 UTC (permalink / raw)
  To: ffmpeg-devel

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 161fps
after:  199fps
---
 libavcodec/loongarch/Makefile                 |    3 +-
 libavcodec/loongarch/h264dsp.S                | 2873 +++++++++++++++++
 libavcodec/loongarch/h264dsp_init_loongarch.c |   37 +-
 libavcodec/loongarch/h264dsp_lasx.c           | 1354 +-------
 libavcodec/loongarch/h264dsp_loongarch.h      |   67 +-
 5 files changed, 2959 insertions(+), 1375 deletions(-)
 create mode 100644 libavcodec/loongarch/h264dsp.S

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 4bf06d903b..6eabe71c0b 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -31,4 +31,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
                                          loongarch/hevc_mc_uniw_lsx.o
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
-                                         loongarch/h264idct_la.o
+                                         loongarch/h264idct_la.o \
+                                         loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/h264dsp.S b/libavcodec/loongarch/h264dsp.S
new file mode 100644
index 0000000000..9031e474ae
--- /dev/null
+++ b/libavcodec/loongarch/h264dsp.S
@@ -0,0 +1,2873 @@
+/*
+ * Loongson LSX/LASX optimized h264dsp
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+const vec_shuf
+.rept 2
+.byte 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
+.endr
+endconst
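+
+/*
+ * vec_shuf is the vshuf.b index table used below: it expands the four
+ * int8_t tc0 values (loaded as one word) into a vector where each tc0[i]
+ * is repeated four times, so every group of four edge pixels gets its own
+ * threshold.
+ */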
+
+.macro AVC_LPF_P1_OR_Q1 _in0, _in1, _in2, _in3, _in4, _in5, _out, _tmp0, _tmp1
+    vavgr.hu       \_tmp0,   \_in0,  \_in1
+    vslli.h        \_tmp1,   \_in2,  1
+    vsub.h         \_tmp0,   \_tmp0, \_tmp1
+    vavg.h         \_tmp0,   \_in3,  \_tmp0
+    vclip.h        \_tmp0,   \_tmp0, \_in4,  \_in5
+    vadd.h         \_out,    \_in2,  \_tmp0
+.endm
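+
+/*
+ * Scalar sketch of AVC_LPF_P1_OR_Q1 with the argument order used below
+ * (_in0=p0, _in1=q0, _in2=p1, _in3=p2, _in4=-tc0, _in5=tc0; the q1 case
+ * swaps the p and q roles):
+ *     p1' = p1 + clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc0, tc0)
+ */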
+
+.macro AVC_LPF_P0Q0 _in0, _in1, _in2, _in3, _in4, _in5, _out0,   \
+                    _out1, _tmp0, _tmp1
+    vsub.h         \_tmp0,   \_in0,  \_in1
+    vsub.h         \_tmp1,   \_in2,  \_in3
+    vslli.h        \_tmp0,   \_tmp0, 2
+    vaddi.hu       \_tmp1,   \_tmp1, 4
+    vadd.h         \_tmp0,   \_tmp0, \_tmp1
+    vsrai.h        \_tmp0,   \_tmp0, 3
+    vclip.h        \_tmp0,   \_tmp0, \_in4,  \_in5
+    vadd.h         \_out0,   \_in1,  \_tmp0
+    vsub.h         \_out1,   \_in0,  \_tmp0
+    vclip255.h     \_out0,   \_out0
+    vclip255.h     \_out1,   \_out1
+.endm
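+
+/*
+ * Scalar sketch of AVC_LPF_P0Q0 with the argument order used below
+ * (_in0=q0, _in1=p0, _in2=p1, _in3=q1, _in4=-tc, _in5=tc):
+ *     d   = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
+ *     p0' = clip255(p0 + d)
+ *     q0' = clip255(q0 - d)
+ */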
+
+function ff_h264_h_lpf_luma_8_lsx
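+    // a0: pix, a1: img_width (stride), a2: alpha, a3: beta,
+    // a4: pointer to the four int8_t tc0 values (loaded as one word below).
+    // The 16x8 block around the vertical edge is transposed, filtered as
+    // rows, then interleaved back before the 8-byte stores at the end.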
+    slli.d          t0,     a1,    1   //img_width_2x
+    slli.d          t1,     a1,    2   //img_width_4x
+    slli.d          t2,     a1,    3   //img_width_8x
+    addi.d          sp,     sp,    -64
+    fst.d           f24,    sp,    0
+    fst.d           f25,    sp,    8
+    fst.d           f26,    sp,    16
+    fst.d           f27,    sp,    24
+    fst.d           f28,    sp,    32
+    fst.d           f29,    sp,    40
+    fst.d           f30,    sp,    48
+    fst.d           f31,    sp,    56
+    la.local        t4,     vec_shuf
+    add.d           t3,     t0,    a1  //img_width_3x
+    vldrepl.w       vr0,    a4,    0   //tmp_vec0
+    vld             vr1,    t4,    0  //tc_vec
+    vshuf.b         vr1,    vr0,   vr0,   vr1   //tc_vec
+    vslti.b         vr2,    vr1,   0
+    vxori.b         vr2,    vr2,   255
+    vandi.b         vr2,    vr2,   1    //bs_vec
+    vsetnez.v       $fcc0,  vr2
+    bceqz           $fcc0,  .END_LUMA_8
+    vldi            vr0,    0            //zero
+    addi.d          t4,     a0,    -4    //src
+    vslt.bu         vr3,    vr0,   vr2   //is_bs_greater_than0
+    add.d           t5,     t4,    t2    //src_tmp
+    vld             vr4,    t4,    0    //row0
+    vldx            vr5,    t4,    a1   //row1
+    vldx            vr6,    t4,    t0   //row2
+    vldx            vr7,    t4,    t3   //row3
+    add.d           t6,     t4,    t1   // src += img_width_4x
+    vld             vr8,    t6,    0    //row4
+    vldx            vr9,    t6,    a1   //row5
+    vldx            vr10,   t6,    t0   //row6
+    vldx            vr11,   t6,    t3   //row7
+    vld             vr12,   t5,    0    //row8
+    vldx            vr13,   t5,    a1   //row9
+    vldx            vr14,   t5,    t0   //row10
+    vldx            vr15,   t5,    t3   //row11
+    add.d           t6,     t5,    t1   // src_tmp += img_width_4x
+    vld             vr16,   t6,    0    //row12
+    vldx            vr17,   t6,    a1   //row13
+    vldx            vr18,   t6,    t0   //row14
+    vldx            vr19,   t6,    t3   //row15
+    LSX_TRANSPOSE16X8_B vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11,        \
+                        vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19,  \
+                        vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17,  \
+                        vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
+    //vr10: p3_org, vr11: p2_org, vr12: p1_org, vr13: p0_org
+    //vr14: q0_org, vr15: q1_org, vr16: q2_org, vr17: q3_org
+    vabsd.bu        vr20,   vr13,  vr14    //p0_asub_q0
+    vabsd.bu        vr21,   vr12,  vr13    //p1_asub_p0
+    vabsd.bu        vr22,   vr15,  vr14    //q1_asub_q0
+
+    vreplgr2vr.b    vr4,    a2          //alpha
+    vreplgr2vr.b    vr5,    a3          //beta
+
+    vslt.bu         vr6,    vr20,  vr4   //is_less_than_alpha
+    vslt.bu         vr7,    vr21,  vr5   //is_less_than_beta
+    vand.v          vr8,    vr6,   vr7   //is_less_than
+    vslt.bu         vr7,    vr22,  vr5   //is_less_than_beta
+    vand.v          vr8,    vr7,   vr8   //is_less_than
+    vand.v          vr8,    vr8,   vr3   //is_less_than
+    vsetnez.v       $fcc0,  vr8
+    bceqz           $fcc0,  .END_LUMA_8
+    vneg.b          vr9,    vr1          //neg_tc_h
+    vsllwil.hu.bu   vr18,   vr1,   0     //tc_h.0
+    vexth.hu.bu     vr19,   vr1          //tc_h.1
+    vexth.h.b       vr2,    vr9          //neg_tc_h.1
+    vsllwil.h.b     vr9,    vr9,   0     //neg_tc_h.0
+
+    vsllwil.hu.bu   vr23,   vr12,  0     //p1_org_h.0
+    vexth.hu.bu     vr3,    vr12         //p1_org_h.1
+    vsllwil.hu.bu   vr24,   vr13,  0     //p0_org_h.0
+    vexth.hu.bu     vr4,    vr13         //p0_org_h.1
+    vsllwil.hu.bu   vr25,   vr14,  0     //q0_org_h.0
+    vexth.hu.bu     vr6,    vr14         //q0_org_h.1
+
+    vabsd.bu        vr0,    vr11,  vr13  //p2_asub_p0
+    vslt.bu         vr7,    vr0,   vr5
+    vand.v          vr7,    vr8,   vr7   //is_less_than_beta
+    vsetnez.v       $fcc0,  vr7
+    bceqz           $fcc0,  .END_LUMA_BETA
+    vsllwil.hu.bu   vr26,   vr11,  0   //p2_org_h.0
+    vexth.hu.bu     vr0,    vr11       //p2_org_h.1
+    AVC_LPF_P1_OR_Q1 vr24, vr25, vr23, vr26, vr9, vr18, vr27, vr28, vr29  //vr27: p1_h.0
+    AVC_LPF_P1_OR_Q1 vr4, vr6, vr3, vr0, vr2, vr19, vr28, vr29, vr30 //vr28: p1_h.1
+    vpickev.b       vr27,   vr28,  vr27
+    vbitsel.v       vr12,   vr12,  vr27,  vr7
+    vandi.b         vr7,    vr7,   1
+    vadd.b          vr1,    vr1,   vr7
+.END_LUMA_BETA:
+    vabsd.bu        vr26,   vr16,  vr14  //q2_asub_q0
+    vslt.bu         vr7,    vr26,  vr5
+    vand.v          vr7,    vr7,   vr8
+    vsllwil.hu.bu   vr27,   vr15,  0     //q1_org_h.0
+    vexth.hu.bu     vr26,   vr15         //q1_org_h.1
+    vsetnez.v       $fcc0,  vr7
+    bceqz           $fcc0,  .END_LUMA_BETA_SEC
+    vsllwil.hu.bu   vr28,   vr16,  0     //q2_org_h.0
+    vexth.hu.bu     vr0,    vr16         //q2_org_h.1
+    AVC_LPF_P1_OR_Q1 vr24, vr25, vr27, vr28, vr9, vr18, vr29, vr30, vr31  //vr29: q1_h.0
+    AVC_LPF_P1_OR_Q1 vr4, vr6, vr26, vr0, vr2, vr19, vr22, vr30, vr31  //vr22:q1_h.1
+    vpickev.b       vr29,   vr22,  vr29
+    vbitsel.v       vr15,   vr15,  vr29,  vr7
+    vandi.b         vr7,    vr7,   1
+    vadd.b          vr1,    vr1,   vr7
+.END_LUMA_BETA_SEC:
+    vneg.b          vr22,   vr1    //neg_thresh_h
+    vsllwil.h.b     vr28,   vr22,  0  //neg_thresh_h.0
+    vexth.h.b       vr29,   vr22     //neg_thresh_h.1
+    vsllwil.hu.bu   vr18,   vr1,   0  //tc_h.0
+    vexth.hu.bu     vr1,    vr1       //tc_h.1
+    AVC_LPF_P0Q0 vr25, vr24, vr23, vr27, vr28, vr18, vr30, vr31, vr0, vr2
+    AVC_LPF_P0Q0 vr6, vr4, vr3, vr26, vr29, vr1, vr20, vr21, vr0, vr2
+    vpickev.b       vr30,   vr20,  vr30  //p0_h
+    vpickev.b       vr31,   vr21,  vr31  //q0_h
+    vbitsel.v       vr13,   vr13,  vr30,  vr8  //p0_org
+    vbitsel.v       vr14,   vr14,  vr31,  vr8  //q0_org
+    //vr10: p3_org, vr11: p2_org, vr12: p1_org, vr13: p0_org
+    //vr14: q0_org, vr15: q1_org, vr16: q2_org, vr17: q3_org
+
+    vilvl.b         vr4,    vr12,  vr10   // row0.0
+    vilvl.b         vr5,    vr16,  vr14   // row0.1
+    vilvl.b         vr6,    vr13,  vr11   // row2.0
+    vilvl.b         vr7,    vr17,  vr15   // row2.1
+
+    vilvh.b         vr8,    vr12,  vr10   // row1.0
+    vilvh.b         vr9,    vr16,  vr14   // row1.1
+    vilvh.b         vr10,   vr13,  vr11   // row3.0
+    vilvh.b         vr11,   vr17,  vr15   // row3.1
+
+    vilvl.b         vr12,   vr6,   vr4    // row4.0
+    vilvl.b         vr13,   vr7,   vr5    // row4.1
+    vilvl.b         vr14,   vr10,  vr8    // row6.0
+    vilvl.b         vr15,   vr11,  vr9    // row6.1
+
+    vilvh.b         vr16,   vr6,   vr4    // row5.0
+    vilvh.b         vr17,   vr7,   vr5    // row5.1
+    vilvh.b         vr18,   vr10,  vr8    // row7.0
+    vilvh.b         vr19,   vr11,  vr9    // row7.1
+
+    vilvl.w         vr4,    vr13,  vr12   // row4: 0, 4, 1, 5
+    vilvh.w         vr5,    vr13,  vr12   // row4: 2, 6, 3, 7
+    vilvl.w         vr6,    vr17,  vr16   // row5: 0, 4, 1, 5
+    vilvh.w         vr7,    vr17,  vr16   // row5: 2, 6, 3, 7
+
+    vilvl.w         vr8,    vr15,  vr14   // row6: 0, 4, 1, 5
+    vilvh.w         vr9,    vr15,  vr14   // row6: 2, 6, 3, 7
+    vilvl.w         vr10,   vr19,  vr18   // row7: 0, 4, 1, 5
+    vilvh.w         vr11,   vr19,  vr18   // row7: 2, 6, 3, 7
+
+    vbsrl.v         vr20,   vr4,   8
+    vbsrl.v         vr21,   vr5,   8
+    vbsrl.v         vr22,   vr6,   8
+    vbsrl.v         vr23,   vr7,   8
+
+    vbsrl.v         vr24,   vr8,   8
+    vbsrl.v         vr25,   vr9,   8
+    vbsrl.v         vr26,   vr10,  8
+    vbsrl.v         vr27,   vr11,  8
+
+    fst.d           f4,     t4,    0
+    fstx.d          f20,    t4,    a1
+    fstx.d          f5,     t4,    t0
+    fstx.d          f21,    t4,    t3
+    add.d           t4,     t4,    t1
+    fst.d           f6,     t4,    0
+    fstx.d          f22,    t4,    a1
+    fstx.d          f7,     t4,    t0
+    fstx.d          f23,    t4,    t3
+    add.d           t4,     t4,    t1
+    fst.d           f8,     t4,    0
+    fstx.d          f24,    t4,    a1
+    fstx.d          f9,     t4,    t0
+    fstx.d          f25,    t4,    t3
+    add.d           t4,     t4,    t1
+    fst.d           f10,    t4,    0
+    fstx.d          f26,    t4,    a1
+    fstx.d          f11,    t4,    t0
+    fstx.d          f27,    t4,    t3
+
+.END_LUMA_8:
+    fld.d           f24,    sp,    0
+    fld.d           f25,    sp,    8
+    fld.d           f26,    sp,    16
+    fld.d           f27,    sp,    24
+    fld.d           f28,    sp,    32
+    fld.d           f29,    sp,    40
+    fld.d           f30,    sp,    48
+    fld.d           f31,    sp,    56
+    addi.d          sp,     sp,    64
+endfunc
+
+function ff_h264_v_lpf_luma_8_lsx
+    slli.d          t0,     a1,    1   //img_width_2x
+    la.local        t4,     vec_shuf
+    vldrepl.w       vr0,    a4,    0   //tmp_vec0
+    vld             vr1,    t4,    0  //tc_vec
+    add.d           t1,     t0,    a1  //img_width_3x
+    vshuf.b         vr1,    vr0,   vr0,   vr1   //tc_vec
+    addi.d          sp,     sp,    -24
+    fst.d           f24,    sp,    0
+    fst.d           f25,    sp,    8
+    fst.d           f26,    sp,    16
+    vslti.b         vr2,    vr1,   0
+    vxori.b         vr2,    vr2,   255
+    vandi.b         vr2,    vr2,   1    //bs_vec
+    vsetnez.v       $fcc0,  vr2
+    bceqz           $fcc0,  .END_V_LUMA_8
+    sub.d           t2,     a0,    t1   //data - img_width_3x
+    vreplgr2vr.b    vr4,    a2          //alpha
+    vreplgr2vr.b    vr5,    a3          //beta
+    vldi            vr0,    0           //zero
+    vld             vr10,   t2,    0    //p2_org
+    vldx            vr11,   t2,    a1   //p1_org
+    vldx            vr12,   t2,    t0   //p0_org
+    vld             vr13,   a0,    0    //q0_org
+    vldx            vr14,   a0,    a1   //q1_org
+
+    vslt.bu         vr0,    vr0,   vr2   //is_bs_greater_than0
+    vabsd.bu        vr16,   vr11,  vr12  //p1_asub_p0
+    vabsd.bu        vr15,   vr12,  vr13  //p0_asub_q0
+    vabsd.bu        vr17,   vr14,  vr13  //q1_asub_q0
+
+    vslt.bu         vr6,    vr15,  vr4   //is_less_than_alpha
+    vslt.bu         vr7,    vr16,  vr5   //is_less_than_beta
+    vand.v          vr8,    vr6,   vr7   //is_less_than
+    vslt.bu         vr7,    vr17,  vr5   //is_less_than_beta
+    vand.v          vr8,    vr7,   vr8
+    vand.v          vr8,    vr8,   vr0  //is_less_than
+
+    vsetnez.v       $fcc0,  vr8
+    bceqz           $fcc0,  .END_V_LUMA_8
+    vldx            vr15,   a0,    t0    //q2_org
+    vneg.b          vr0,    vr1          //neg_tc_h
+    vsllwil.h.b     vr18,   vr1,   0     //tc_h.0
+    vexth.h.b       vr19,   vr1          //tc_h.1
+    vsllwil.h.b     vr9,    vr0,   0     //neg_tc_h.0
+    vexth.h.b       vr2,    vr0          //neg_tc_h.1
+
+    vsllwil.hu.bu   vr16,   vr11,  0     //p1_org_h.0
+    vexth.hu.bu     vr17,   vr11         //p1_org_h.1
+    vsllwil.hu.bu   vr20,   vr12,  0     //p0_org_h.0
+    vexth.hu.bu     vr21,   vr12         //p0_org_h.1
+    vsllwil.hu.bu   vr22,   vr13,  0     //q0_org_h.0
+    vexth.hu.bu     vr23,   vr13         //q0_org_h.1
+
+    vabsd.bu        vr0,    vr10,  vr12  //p2_asub_p0
+    vslt.bu         vr7,    vr0,   vr5   //is_less_than_beta
+    vand.v          vr7,    vr7,   vr8   //is_less_than_beta
+
+    vsetnez.v       $fcc0,  vr7
+    bceqz           $fcc0,  .END_V_LESS_BETA
+    vsllwil.hu.bu   vr3,    vr10,  0   //p2_org_h.0
+    vexth.hu.bu     vr4,    vr10       //p2_org_h.1
+    AVC_LPF_P1_OR_Q1 vr20, vr22, vr16, vr3, vr9, vr18, vr24, vr0, vr26
+    AVC_LPF_P1_OR_Q1 vr21, vr23, vr17, vr4, vr2, vr19, vr25, vr0, vr26
+    vpickev.b       vr24,   vr25,  vr24
+    vbitsel.v       vr24,   vr11,  vr24,   vr7
+    addi.d          t3,     t2,    16
+    vstx            vr24,   t2,    a1
+    vandi.b         vr7,    vr7,   1
+    vadd.b          vr1,    vr7,   vr1
+.END_V_LESS_BETA:
+    vabsd.bu        vr0,    vr15,  vr13   //q2_asub_q0
+    vslt.bu         vr7,    vr0,   vr5    //is_less_than_beta
+    vand.v          vr7,    vr7,   vr8    //is_less_than_beta
+    vsllwil.hu.bu   vr3,    vr14,  0     //q1_org_h.0
+    vexth.hu.bu     vr4,    vr14         //q1_org_h.1
+
+    vsetnez.v       $fcc0,  vr7
+    bceqz           $fcc0,  .END_V_LESS_BETA_SEC
+    vsllwil.hu.bu   vr11,   vr15,  0     //q2_org_h.0
+    vexth.hu.bu     vr15,   vr15         //q2_org_h.1
+    AVC_LPF_P1_OR_Q1 vr20, vr22, vr3, vr11, vr9, vr18, vr24, vr0, vr26
+    AVC_LPF_P1_OR_Q1 vr21, vr23, vr4, vr15, vr2, vr19, vr25, vr0, vr26
+    vpickev.b       vr24,   vr25,  vr24
+    vbitsel.v       vr24,   vr14,  vr24,   vr7
+    vstx            vr24,   a0,    a1
+    vandi.b         vr7,    vr7,   1
+    vadd.b          vr1,    vr1,   vr7
+.END_V_LESS_BETA_SEC:
+    vneg.b          vr0,    vr1
+    vsllwil.h.b     vr9,    vr0,   0    //neg_thresh_h.0
+    vexth.h.b       vr2,    vr0         //neg_thresh_h.1
+    vsllwil.hu.bu   vr18,   vr1,   0    //tc_h.0
+    vexth.hu.bu     vr19,   vr1         //tc_h.1
+    AVC_LPF_P0Q0 vr22, vr20, vr16, vr3, vr9, vr18, vr11, vr15, vr0, vr26
+    AVC_LPF_P0Q0 vr23, vr21, vr17, vr4, vr2, vr19, vr10, vr14, vr0, vr26
+    vpickev.b       vr11,   vr10,  vr11  //p0_h
+    vpickev.b       vr15,   vr14,  vr15  //q0_h
+    vbitsel.v       vr11,   vr12,  vr11,   vr8  //p0_h
+    vbitsel.v       vr15,   vr13,  vr15,   vr8  //q0_h
+    vstx            vr11,   t2,    t0
+    vst             vr15,   a0,    0
+.END_V_LUMA_8:
+    fld.d           f24,    sp,    0
+    fld.d           f25,    sp,    8
+    fld.d           f26,    sp,    16
+    addi.d          sp,     sp,    24
+endfunc
+
+const chroma_shuf
+.byte 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3
+endconst
+
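+//Chroma deblocking in the normal (tc0-based) mode: 8 rows of 4 bytes are
+//loaded at pix - 2, transposed to p1/p0/q0/q1, filtered, and the two middle
+//bytes of each row are written back.
+//Assumed prototype: void ff_h264_h_lpf_chroma_8_lsx(uint8_t *pix,
+//    ptrdiff_t stride, int alpha, int beta, int8_t *tc0)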
+function ff_h264_h_lpf_chroma_8_lsx
+    slli.d          t0,     a1,    1   //img_width_2x
+    slli.d          t1,     a1,    2   //img_width_4x
+    la.local        t4,     chroma_shuf
+    add.d           t2,     t0,    a1  //img_width_3x
+    vldrepl.w       vr0,    a4,    0   //tmp_vec0
+    vld             vr1,    t4,    0  //tc_vec
+    vshuf.b         vr1,    vr0,   vr0,   vr1   //tc_vec
+    vslti.b         vr2,    vr1,   0
+    vxori.b         vr2,    vr2,   255
+    vandi.b         vr2,    vr2,   1    //bs_vec
+    vsetnez.v       $fcc0,  vr2
+    bceqz           $fcc0,  .END_CHROMA_8
+    vldi            vr0,    0
+    addi.d          t4,     a0,    -2
+    vslt.bu         vr3,    vr0,   vr2   //is_bs_greater_than0
+    add.d           t5,     t4,    t1
+    vld             vr4,    t4,    0    //row0
+    vldx            vr5,    t4,    a1   //row1
+    vldx            vr6,    t4,    t0   //row2
+    vldx            vr7,    t4,    t2   //row3
+    vld             vr8,    t5,    0    //row4
+    vldx            vr9,    t5,    a1   //row5
+    vldx            vr10,   t5,    t0   //row6
+    vldx            vr11,   t5,    t2   //row7
+    vilvl.b         vr12,   vr6,   vr4  //p1_org
+    vilvl.b         vr13,   vr7,   vr5  //p0_org
+    vilvl.b         vr14,   vr10,  vr8  //q0_org
+    vilvl.b         vr15,   vr11,  vr9  //q1_org
+    vilvl.b         vr4,    vr13,  vr12 //row0
+    vilvl.b         vr5,    vr15,  vr14 //row1
+    vilvl.w         vr6,    vr5,   vr4  //row2
+    vilvh.w         vr7,    vr5,   vr4  //row3
+    vilvl.d         vr12,   vr6,   vr6  //p1_org
+    vilvh.d         vr13,   vr6,   vr6  //p0_org
+    vilvl.d         vr14,   vr7,   vr7  //q0_org
+    vilvh.d         vr15,   vr7,   vr7  //q1_org
+
+    vabsd.bu        vr20,   vr13,  vr14  //p0_asub_q0
+    vabsd.bu        vr21,   vr12,  vr13  //p1_asub_p0
+    vabsd.bu        vr22,   vr15,  vr14  //q1_asub_q0
+
+    vreplgr2vr.b    vr4,    a2     //alpha
+    vreplgr2vr.b    vr5,    a3     //beta
+
+    vslt.bu         vr6,    vr20,  vr4  //is_less_than_alpha
+    vslt.bu         vr7,    vr21,  vr5  //is_less_than_beta
+    vand.v          vr8,    vr6,   vr7   //is_less_than
+    vslt.bu         vr7,    vr22,  vr5   //is_less_than_beta
+    vand.v          vr8,    vr7,   vr8   //is_less_than
+    vand.v          vr8,    vr8,   vr3   //is_less_than
+    vsetnez.v       $fcc0,  vr8
+    bceqz           $fcc0,  .END_CHROMA_8
+
+    vneg.b          vr9,    vr1          //neg_tc_h
+    vexth.hu.bu     vr3,    vr12         //p1_org_h
+    vexth.hu.bu     vr4,    vr13         //p0_org_h.1
+    vexth.hu.bu     vr5,    vr14         //q0_org_h.1
+    vexth.hu.bu     vr6,    vr15         //q1_org_h.1
+
+    vexth.hu.bu     vr18,   vr1          //tc_h.1
+    vexth.h.b       vr2,    vr9          //neg_tc_h.1
+
+    AVC_LPF_P0Q0 vr5, vr4, vr3, vr6, vr2, vr18, vr10, vr11, vr16, vr17
+    vpickev.b       vr10,   vr10,   vr10  //p0_h
+    vpickev.b       vr11,   vr11,   vr11  //q0_h
+    vbitsel.v       vr13,   vr13,   vr10,   vr8
+    vbitsel.v       vr14,   vr14,   vr11,   vr8
+    vilvl.b         vr15,   vr14,   vr13
+    addi.d          t4,     t4,     1
+    add.d           t5,     t4,     a1
+    add.d           t6,     t4,     t0
+    add.d           t7,     t4,     t2
+    vstelm.h        vr15,   t4,     0,    0
+    vstelm.h        vr15,   t5,     0,    1
+    vstelm.h        vr15,   t6,     0,    2
+    vstelm.h        vr15,   t7,     0,    3
+    add.d           t4,     t4,     t1
+    add.d           t5,     t4,     a1
+    add.d           t6,     t4,     t0
+    add.d           t7,     t4,     t2
+    vstelm.h        vr15,   t4,     0,    4
+    vstelm.h        vr15,   t5,     0,    5
+    vstelm.h        vr15,   t6,     0,    6
+    vstelm.h        vr15,   t7,     0,    7
+.END_CHROMA_8:
+endfunc
+
+function ff_h264_v_lpf_chroma_8_lsx
+    slli.d          t0,     a1,    1   //img_width_2x
+    la.local        t4,     chroma_shuf
+    vldrepl.w       vr0,    a4,    0   //tmp_vec0
+    vld             vr1,    t4,    0  //tc_vec
+    vshuf.b         vr1,    vr0,   vr0,   vr1   //tc_vec
+    vslti.b         vr2,    vr1,   0
+    vxori.b         vr2,    vr2,   255
+    vandi.b         vr2,    vr2,   1    //bs_vec
+    vsetnez.v       $fcc0,  vr2
+    bceqz           $fcc0,  .END_CHROMA_V_8
+    vldi            vr0,    0
+    sub.d           t4,     a0,    t0
+    vslt.bu         vr3,    vr0,   vr2   //is_bs_greater_than0
+    vld             vr12,   t4,    0    //p1_org
+    vldx            vr13,   t4,    a1   //p0_org
+    vld             vr14,   a0,    0    //q0_org
+    vldx            vr15,   a0,    a1   //q1_org
+
+    vabsd.bu        vr20,   vr13,  vr14  //p0_asub_q0
+    vabsd.bu        vr21,   vr12,  vr13  //p1_asub_p0
+    vabsd.bu        vr22,   vr15,  vr14  //q1_asub_q0
+
+    vreplgr2vr.b    vr4,    a2     //alpha
+    vreplgr2vr.b    vr5,    a3     //beta
+
+    vslt.bu         vr6,    vr20,  vr4  //is_less_than_alpha
+    vslt.bu         vr7,    vr21,  vr5  //is_less_than_beta
+    vand.v          vr8,    vr6,   vr7   //is_less_than
+    vslt.bu         vr7,    vr22,  vr5   //is_less_than_beta
+    vand.v          vr8,    vr7,   vr8   //is_less_than
+    vand.v          vr8,    vr8,   vr3   //is_less_than
+    vsetnez.v       $fcc0,  vr8
+    bceqz           $fcc0,  .END_CHROMA_V_8
+
+    vneg.b          vr9,    vr1          //neg_tc_h
+    vsllwil.hu.bu   vr3,    vr12,   0    //p1_org_h.0
+    vsllwil.hu.bu   vr4,    vr13,   0    //p0_org_h.0
+    vsllwil.hu.bu   vr5,    vr14,   0    //q0_org_h.0
+    vsllwil.hu.bu   vr6,    vr15,   0    //q1_org_h.0
+
+    vexth.hu.bu     vr18,   vr1          //tc_h.1
+    vexth.h.b       vr2,    vr9          //neg_tc_h.1
+
+    AVC_LPF_P0Q0 vr5, vr4, vr3, vr6, vr2, vr18, vr10, vr11, vr16, vr17
+    vpickev.b       vr10,   vr10,   vr10  //p0_h
+    vpickev.b       vr11,   vr11,   vr11  //q0_h
+    vbitsel.v       vr10,   vr13,   vr10,   vr8
+    vbitsel.v       vr11,   vr14,   vr11,   vr8
+    fstx.d          f10,    t4,     a1
+    fst.d           f11,    a0,     0
+.END_CHROMA_V_8:
+endfunc
+
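+//Strong (intra) filter for one side of the edge. With the operand order used
+//below (_in0=p3, _in1=p0, _in2=q0, _in3=p1, _in4=p2, _in5=q1, _const3=3):
+//  _out0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3    p0'
+//  _out1 = (p2 + p1 + p0 + q0 + 2) >> 2                p1'
+//  _out2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3       p2'
+//The q-side values follow by symmetry when called with the q registers.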
+.macro AVC_LPF_P0P1P2_OR_Q0Q1Q2 _in0, _in1, _in2, _in3, _in4, _in5, \
+                                _out0, _out1, _out2, _tmp0, _const3
+    vadd.h          \_tmp0,  \_in1,  \_in2
+    vadd.h          \_tmp0,  \_tmp0, \_in3
+    vslli.h         \_out2,  \_in0,  1
+    vslli.h         \_out0,  \_tmp0, 1
+    vadd.h          \_out0,  \_out0, \_in4
+    vadd.h          \_out1,  \_in4,  \_tmp0
+    vadd.h          \_out0,  \_out0, \_in5
+    vmadd.h         \_out2,  \_in4,  \_const3
+    vsrar.h         \_out0,  \_out0, \_const3
+    vadd.h          \_out2,  \_out2, \_tmp0
+    vsrari.h        \_out1,  \_out1, 2
+    vsrar.h         \_out2,  \_out2, \_const3
+.endm
+
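+//Weak intra filter for p0 (or q0): _out0 = (_in0 + _in1 + 2*_in2 + 2) >> 2,
+//i.e. p0' = (2*p1 + p0 + q1 + 2) >> 2 with the operand order used below.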
+.macro AVC_LPF_P0_OR_Q0 _in0, _in1, _in2, _out0, _tmp0
+    vslli.h         \_tmp0,  \_in2,  1
+    vadd.h          \_out0,  \_in0,  \_in1
+    vadd.h          \_out0,  \_out0, \_tmp0
+    vsrari.h        \_out0,  \_out0, 2
+.endm
+
+//LSX optimization is enough for this function.
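+//Assumed prototype: void ff_h264_h_lpf_luma_intra_8_lsx(uint8_t *pix,
+//    ptrdiff_t stride, int alpha, int beta)
+//Loads 16 rows starting at pix - 4, transposes them to p3..q3, filters and
+//transposes back.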
+function ff_h264_h_lpf_luma_intra_8_lsx
+    slli.d          t0,     a1,    1   //img_width_2x
+    slli.d          t1,     a1,    2   //img_width_4x
+    addi.d          t4,     a0,    -4   //src
+    addi.d          sp,     sp,    -64
+    fst.d           f24,    sp,    0
+    fst.d           f25,    sp,    8
+    fst.d           f26,    sp,    16
+    fst.d           f27,    sp,    24
+    fst.d           f28,    sp,    32
+    fst.d           f29,    sp,    40
+    fst.d           f30,    sp,    48
+    fst.d           f31,    sp,    56
+    add.d           t2,     t0,    a1   //img_width_3x
+    add.d           t5,     t4,    t1
+    vld             vr0,    t4,    0    //row0
+    vldx            vr1,    t4,    a1   //row1
+    vldx            vr2,    t4,    t0   //row2
+    vldx            vr3,    t4,    t2   //row3
+    add.d           t6,     t5,    t1
+    vld             vr4,    t5,    0    //row4
+    vldx            vr5,    t5,    a1   //row5
+    vldx            vr6,    t5,    t0   //row6
+    vldx            vr7,    t5,    t2   //row7
+    add.d           t7,     t6,    t1
+    vld             vr8,    t6,    0    //row8
+    vldx            vr9,    t6,    a1   //row9
+    vldx            vr10,   t6,    t0   //row10
+    vldx            vr11,   t6,    t2   //row11
+    vld             vr12,   t7,    0    //row12
+    vldx            vr13,   t7,    a1   //row13
+    vldx            vr14,   t7,    t0   //row14
+    vldx            vr15,   t7,    t2   //row15
+    LSX_TRANSPOSE16X8_B vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,       \
+                        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+                        vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,       \
+                        vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+    // vr0: p3_org, vr1: p2_org, vr2: p1_org, vr3: p0_org
+    // vr4: q0_org, vr5: q1_org, vr6: q2_org, vr7: q3_org
+
+    vreplgr2vr.b    vr16,   a2    //alpha_in
+    vreplgr2vr.b    vr17,   a3    //beta_in
+    vabsd.bu        vr10,   vr3,   vr4    //p0_asub_q0
+    vabsd.bu        vr11,   vr2,   vr3    //p1_asub_p0
+    vabsd.bu        vr12,   vr5,   vr4    //q1_asub_q0
+
+    vslt.bu         vr8,    vr10,  vr16  //is_less_than_alpha
+    vslt.bu         vr9,    vr11,  vr17  //is_less_than_beta
+    vand.v          vr18,   vr8,   vr9   //is_less_than
+    vslt.bu         vr9,    vr12,  vr17  //is_less_than_beta
+    vand.v          vr18,   vr18,  vr9   //is_less_than
+
+    vsetnez.v       $fcc0,  vr18
+    bceqz           $fcc0,  .END_H_INTRA_8
+    vsrli.b         vr16,   vr16,  2     //less_alpha_shift2_add2
+    vaddi.bu        vr16,   vr16,  2
+    vslt.bu         vr16,   vr10,  vr16
+    vsllwil.hu.bu   vr10,   vr2,   0   //p1_org_h.0
+    vexth.hu.bu     vr11,   vr2        //p1_org_h.1
+    vsllwil.hu.bu   vr12,   vr3,   0   //p0_org_h.0
+    vexth.hu.bu     vr13,   vr3        //p0_org_h.1
+
+    vsllwil.hu.bu   vr14,   vr4,   0   //q0_org_h.0
+    vexth.hu.bu     vr15,   vr4        //q0_org_h.1
+    vsllwil.hu.bu   vr19,   vr5,   0   //q1_org_h.0
+    vexth.hu.bu     vr20,   vr5        //q1_org_h.1
+
+    vabsd.bu        vr21,   vr1,   vr3  //p2_asub_p0
+    vslt.bu         vr9,    vr21,  vr17  //is_less_than_beta
+    vand.v          vr9,    vr9,   vr16
+    vxori.b         vr22,   vr9,   0xff  //negate_is_less_than_beta
+    vand.v          vr9,    vr9,   vr18
+    vand.v          vr22,   vr22,  vr18
+
+    vsetnez.v       $fcc0,  vr9
+    bceqz           $fcc0,  .END_H_INTRA_LESS_BETA
+    vsllwil.hu.bu   vr23,   vr1,   0   //p2_org_h.0
+    vexth.hu.bu     vr24,   vr1        //p2_org_h.1
+    vsllwil.hu.bu   vr25,   vr0,   0   //p3_org_h.0
+    vexth.hu.bu     vr26,   vr0        //p3_org_h.1
+    vldi            vr27,   0x403
+
+    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr25, vr12, vr14, vr10, vr23, vr19, vr28, vr29, vr30, vr31, vr27
+    //vr28: p0_h.0   vr29: p1_h.0  vr30: p2_h.0
+    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr26, vr13, vr15, vr11, vr24, vr20, vr23, vr25, vr21, vr31, vr27
+    //vr23: p0_h.1   vr25: p1_h.1  vr21: p2_h.1
+    vpickev.b       vr28,   vr23,  vr28  //p0_h
+    vpickev.b       vr29,   vr25,  vr29  //p1_h
+    vpickev.b       vr30,   vr21,  vr30  //p2_h
+    vbitsel.v       vr3,    vr3,   vr28,   vr9
+    vbitsel.v       vr2,    vr2,   vr29,   vr9
+    vbitsel.v       vr1,    vr1,   vr30,   vr9
+.END_H_INTRA_LESS_BETA:
+    AVC_LPF_P0_OR_Q0 vr12, vr19, vr10, vr23, vr25
+    AVC_LPF_P0_OR_Q0 vr13, vr20, vr11, vr24, vr25
+    //vr23: p0_h.0   vr24: p0_h.1
+    vpickev.b       vr23,   vr24,  vr23
+    vbitsel.v       vr3,    vr3,   vr23,   vr22
+
+    vabsd.bu        vr21,   vr6,   vr4   //q2_asub_q0
+    vslt.bu         vr9,    vr21,  vr17  //is_less_than_beta
+    vand.v          vr9,    vr9,   vr16
+    vxori.b         vr22,   vr9,   0xff   //negate_is_less_than_beta
+    vand.v          vr9,    vr9,   vr18
+    vand.v          vr22,   vr22,  vr18
+
+    vsetnez.v       $fcc0,  vr9
+    bceqz           $fcc0,  .END_H_INTRA_LESS_BETA_SEC
+    vsllwil.hu.bu   vr23,   vr6,   0   //q2_org_h.0
+    vexth.hu.bu     vr24,   vr6        //q2_org_h.1
+    vsllwil.hu.bu   vr25,   vr7,   0   //q3_org_h.0
+    vexth.hu.bu     vr26,   vr7        //q3_org_h.1
+    vldi            vr27,   0x403
+
+    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr25, vr14, vr12, vr19, vr23, vr10, vr28, vr29, vr30, vr31, vr27
+    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr26, vr15, vr13, vr20, vr24, vr11, vr23, vr25, vr21, vr31, vr27
+    vpickev.b       vr28,   vr23,  vr28  //q0_h
+    vpickev.b       vr29,   vr25,  vr29  //q1_h
+    vpickev.b       vr30,   vr21,  vr30  //q2_h
+    vbitsel.v       vr4,    vr4,   vr28,   vr9
+    vbitsel.v       vr5,    vr5,   vr29,   vr9
+    vbitsel.v       vr6,    vr6,   vr30,   vr9
+.END_H_INTRA_LESS_BETA_SEC:
+    AVC_LPF_P0_OR_Q0 vr14, vr10, vr19, vr23, vr25
+    AVC_LPF_P0_OR_Q0 vr15, vr11, vr20, vr24, vr25
+    vpickev.b       vr23,   vr24,  vr23
+    vbitsel.v       vr4,    vr4,   vr23,   vr22
+
+    vilvl.b         vr14,   vr2,   vr0   // row0.0
+    vilvl.b         vr15,   vr6,   vr4   // row0.1
+    vilvl.b         vr16,   vr3,   vr1   // row2.0
+    vilvl.b         vr17,   vr7,   vr5   // row2.1
+
+    vilvh.b         vr18,   vr2,   vr0   // row1.0
+    vilvh.b         vr19,   vr6,   vr4   // row1.1
+    vilvh.b         vr20,   vr3,   vr1   // row3.0
+    vilvh.b         vr21,   vr7,   vr5   // row3.1
+
+    vilvl.b         vr2,    vr16,   vr14    // row4.0
+    vilvl.b         vr3,    vr17,   vr15    // row4.1
+    vilvl.b         vr4,    vr20,   vr18    // row6.0
+    vilvl.b         vr5,    vr21,   vr19    // row6.1
+
+    vilvh.b         vr6,    vr16,   vr14    // row5.0
+    vilvh.b         vr7,    vr17,   vr15    // row5.1
+    vilvh.b         vr8,    vr20,   vr18    // row7.0
+    vilvh.b         vr9,    vr21,   vr19    // row7.1
+
+    vilvl.w         vr14,   vr3,    vr2   // row4: 0, 4, 1, 5
+    vilvh.w         vr15,   vr3,    vr2   // row4: 2, 6, 3, 7
+    vilvl.w         vr16,   vr7,    vr6   // row5: 0, 4, 1, 5
+    vilvh.w         vr17,   vr7,    vr6   // row5: 2, 6, 3, 7
+
+    vilvl.w         vr18,   vr5,    vr4   // row6: 0, 4, 1, 5
+    vilvh.w         vr19,   vr5,    vr4   // row6: 2, 6, 3, 7
+    vilvl.w         vr20,   vr9,    vr8   // row7: 0, 4, 1, 5
+    vilvh.w         vr21,   vr9,    vr8   // row7: 2, 6, 3, 7
+
+    vbsrl.v         vr0,    vr14,   8
+    vbsrl.v         vr1,    vr15,   8
+    vbsrl.v         vr2,    vr16,   8
+    vbsrl.v         vr3,    vr17,   8
+
+    vbsrl.v         vr4,    vr18,   8
+    vbsrl.v         vr5,    vr19,   8
+    vbsrl.v         vr6,    vr20,   8
+    vbsrl.v         vr7,    vr21,   8
+
+    fst.d           f14,    t4,    0
+    fstx.d          f0,     t4,    a1
+    fstx.d          f15,    t4,    t0
+    fstx.d          f1,     t4,    t2
+    fst.d           f16,    t5,    0
+    fstx.d          f2,     t5,    a1
+    fstx.d          f17,    t5,    t0
+    fstx.d          f3,     t5,    t2
+    fst.d           f18,    t6,    0
+    fstx.d          f4,     t6,    a1
+    fstx.d          f19,    t6,    t0
+    fstx.d          f5,     t6,    t2
+    fst.d           f20,    t7,    0
+    fstx.d          f6,     t7,    a1
+    fstx.d          f21,    t7,    t0
+    fstx.d          f7,     t7,    t2
+.END_H_INTRA_8:
+    fld.d           f24,    sp,    0
+    fld.d           f25,    sp,    8
+    fld.d           f26,    sp,    16
+    fld.d           f27,    sp,    24
+    fld.d           f28,    sp,    32
+    fld.d           f29,    sp,    40
+    fld.d           f30,    sp,    48
+    fld.d           f31,    sp,    56
+    addi.d          sp,     sp,    64
+endfunc
+
+//LSX optimization is enough for this function.
+function ff_h264_v_lpf_luma_intra_8_lsx
+    slli.d          t0,     a1,    1   //img_width_2x
+    add.d           t1,     t0,    a1  //img_width_3x
+    addi.d          sp,     sp,    -64
+    fst.d           f24,    sp,    0
+    fst.d           f25,    sp,    8
+    fst.d           f26,    sp,    16
+    fst.d           f27,    sp,    24
+    fst.d           f28,    sp,    32
+    fst.d           f29,    sp,    40
+    fst.d           f30,    sp,    48
+    fst.d           f31,    sp,    56
+    sub.d           t4,     a0,    t1  //src - img_width_3x
+
+    vld             vr0,    a0,    0   //q0_org
+    vldx            vr1,    a0,    a1  //q1_org
+    vldx            vr2,    t4,    a1  //p1_org
+    vldx            vr3,    t4,    t0  //p0_org
+
+    vreplgr2vr.b    vr4,    a2   //alpha
+    vreplgr2vr.b    vr5,    a3   //beta
+
+    vabsd.bu        vr6,    vr3,   vr0   //p0_asub_q0
+    vabsd.bu        vr7,    vr2,   vr3   //p1_asub_p0
+    vabsd.bu        vr8,    vr1,   vr0   //q1_asub_q0
+
+    vslt.bu         vr9,    vr6,   vr4  //is_less_than_alpha
+    vslt.bu         vr10,   vr7,   vr5  //is_less_than_beta
+    vand.v          vr11,   vr9,   vr10  //is_less_than
+    vslt.bu         vr10,   vr8,   vr5
+    vand.v          vr11,   vr10,  vr11
+
+    vsetnez.v       $fcc0,  vr11
+    bceqz           $fcc0,  .END_V_INTRA_8
+
+    vld             vr12,   t4,    0   //p2_org
+    vldx            vr13,   a0,    t0  //q2_org
+    vsrli.b         vr14,   vr4,   2   //is_alpha_shift2_add2
+    vsllwil.hu.bu   vr15,   vr2,   0  //p1_org_h.0
+    vexth.hu.bu     vr16,   vr2       //p1_org_h.1
+    vaddi.bu        vr14,   vr14,  2
+    vsllwil.hu.bu   vr17,   vr3,   0  //p0_org_h.0
+    vexth.hu.bu     vr18,   vr3       //p0_org_h.1
+    vslt.bu         vr14,   vr6,   vr14
+    vsllwil.hu.bu   vr19,   vr0,   0  //q0_org_h.0
+    vexth.hu.bu     vr20,   vr0       //q0_org_h.1
+    vsllwil.hu.bu   vr21,   vr1,   0  //q1_org_h.0
+    vexth.hu.bu     vr22,   vr1       //q1_org_h.1
+
+    vabsd.bu        vr23,   vr12,  vr3  //p2_asub_p0
+    vslt.bu         vr10,   vr23,  vr5  //is_less_than_beta
+    vand.v          vr10,   vr10,  vr14
+    vxori.b         vr23,   vr10,  0xff //negate_is_less_than_beta
+    vand.v          vr10,   vr10,  vr11
+    vand.v          vr23,   vr23,  vr11
+
+    vsetnez.v       $fcc0,  vr10
+    bceqz           $fcc0,  .END_V_INTRA_LESS_BETA
+    sub.d           t5,     t4,    a1
+    vld             vr24,   t5,    0  //p3_org
+    vsllwil.hu.bu   vr26,   vr12,  0  //p2_org_h.0
+    vexth.hu.bu     vr27,   vr12      //p2_org_h.1
+    vsllwil.hu.bu   vr28,   vr24,  0  //p3_org_h.0
+    vexth.hu.bu     vr29,   vr24      //p3_org_h.1
+    vldi            vr4,    0x403
+
+    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr28, vr17, vr19, vr15, vr26, vr21, vr25, vr30, vr31, vr24, vr4
+    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr29, vr18, vr20, vr16, vr27, vr22, vr6, vr7, vr8, vr24, vr4
+
+    vpickev.b       vr25,   vr6,   vr25  //p0_h
+    vpickev.b       vr30,   vr7,   vr30  //p1_h
+    vpickev.b       vr31,   vr8,   vr31  //p2_h
+
+    vbitsel.v       vr3,    vr3,   vr25,   vr10
+    vbitsel.v       vr2,    vr2,   vr30,   vr10
+    vbitsel.v       vr12,   vr12,  vr31,   vr10
+
+    vstx            vr2,    t4,    a1
+    vst             vr12,   t4,    0
+.END_V_INTRA_LESS_BETA:
+    AVC_LPF_P0_OR_Q0 vr17, vr21, vr15, vr24, vr30
+    AVC_LPF_P0_OR_Q0 vr18, vr22, vr16, vr25, vr30
+    vpickev.b       vr24,   vr25,  vr24
+    vbitsel.v       vr3,    vr3,   vr24,   vr23
+    vstx            vr3,    t4,    t0
+
+    vabsd.bu        vr23,   vr13,  vr0   //q2_asub_q0
+    vslt.bu         vr10,   vr23,  vr5   //is_less_than_beta
+    vand.v          vr10,   vr10,  vr14
+    vxori.b         vr23,   vr10,  0xff  //negate_is_less_than_beta
+    vand.v          vr10,   vr10,  vr11
+    vand.v          vr23,   vr23,  vr11
+
+    vsetnez.v       $fcc0,  vr10
+    bceqz           $fcc0,  .END_V_INTRA_LESS_BETA_SEC
+    vldx            vr24,   a0,    t1  //q3_org
+
+    vsllwil.hu.bu   vr26,   vr13,  0  //q2_org_h.0
+    vexth.hu.bu     vr27,   vr13      //q2_org_h.1
+    vsllwil.hu.bu   vr28,   vr24,  0  //q3_org_h.0
+    vexth.hu.bu     vr29,   vr24      //q3_org_h.1
+    vldi            vr4,    0x403
+
+    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr28, vr19, vr17, vr21, vr26, vr15, vr25, vr30, vr31, vr24, vr4
+    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr29, vr20, vr18, vr22, vr27, vr16, vr6, vr7, vr8, vr24, vr4
+
+    vpickev.b       vr25,   vr6,   vr25
+    vpickev.b       vr30,   vr7,   vr30
+    vpickev.b       vr31,   vr8,   vr31
+
+    vbitsel.v       vr0,    vr0,   vr25,   vr10
+    vbitsel.v       vr1,    vr1,   vr30,   vr10
+    vbitsel.v       vr13,   vr13,  vr31,   vr10
+    vstx            vr1,    a0,    a1
+    vstx            vr13,   a0,    t0
+.END_V_INTRA_LESS_BETA_SEC:
+    AVC_LPF_P0_OR_Q0 vr19, vr15, vr21, vr24, vr30
+    AVC_LPF_P0_OR_Q0 vr20, vr16, vr22, vr25, vr30
+    vpickev.b       vr24,   vr25,  vr24
+    vbitsel.v       vr0,    vr0,   vr24,   vr23
+    vst             vr0,    a0,    0
+.END_V_INTRA_8:
+    fld.d           f24,    sp,    0
+    fld.d           f25,    sp,    8
+    fld.d           f26,    sp,    16
+    fld.d           f27,    sp,    24
+    fld.d           f28,    sp,    32
+    fld.d           f29,    sp,    40
+    fld.d           f30,    sp,    48
+    fld.d           f31,    sp,    56
+    addi.d          sp,     sp,    64
+endfunc
+
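+//Assumed prototype: void ff_h264_h_lpf_chroma_intra_8_lsx(uint8_t *pix,
+//    ptrdiff_t stride, int alpha, int beta)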
+function ff_h264_h_lpf_chroma_intra_8_lsx
+    addi.d          t4,     a0,    -2
+    slli.d          t0,     a1,    1   //img_2x
+    slli.d          t2,     a1,    2   //img_4x
+    add.d           t1,     t0,    a1  //img_3x
+
+    add.d           t5,     t4,    t2
+    fld.s           f0,     t4,    0   //row0
+    fldx.s          f1,     t4,    a1  //row1
+    fldx.s          f2,     t4,    t0  //row2
+    fldx.s          f3,     t4,    t1  //row3
+    fld.s           f4,     t5,    0   //row4
+    fldx.s          f5,     t5,    a1  //row5
+    fldx.s          f6,     t5,    t0  //row6
+    fldx.s          f7,     t5,    t1  //row7
+
+    vilvl.b         vr8,    vr2,   vr0  //p1_org
+    vilvl.b         vr9,    vr3,   vr1  //p0_org
+    vilvl.b         vr10,   vr6,   vr4  //q0_org
+    vilvl.b         vr11,   vr7,   vr5  //q1_org
+
+    vilvl.b         vr0,    vr9,   vr8
+    vilvl.b         vr1,    vr11,  vr10
+    vilvl.w         vr2,    vr1,   vr0
+    vilvh.w         vr3,    vr1,   vr0
+
+    vilvl.d         vr8,    vr2,   vr2   //p1_org
+    vilvh.d         vr9,    vr2,   vr2   //p0_org
+    vilvl.d         vr10,   vr3,   vr3   //q0_org
+    vilvh.d         vr11,   vr3,   vr3   //q1_org
+
+    vreplgr2vr.b    vr0,    a2   //alpha
+    vreplgr2vr.b    vr1,    a3   //beta
+
+    vabsd.bu        vr2,    vr9,   vr10  //p0_asub_q0
+    vabsd.bu        vr3,    vr8,   vr9   //p1_asub_p0
+    vabsd.bu        vr4,    vr11,  vr10  //q1_asub_q0
+
+    vslt.bu         vr5,    vr2,   vr0  //is_less_than_alpha
+    vslt.bu         vr6,    vr3,   vr1  //is_less_than_beta
+    vand.v          vr7,    vr5,   vr6   //is_less_than
+    vslt.bu         vr6,    vr4,   vr1
+    vand.v          vr7,    vr7,   vr6
+
+    vsetnez.v       $fcc0,  vr7
+    bceqz           $fcc0,  .END_H_CHROMA_INTRA_8
+
+    vexth.hu.bu     vr12,   vr8   //p1_org_h
+    vexth.hu.bu     vr13,   vr9   //p0_org_h
+    vexth.hu.bu     vr14,   vr10  //q0_org_h
+    vexth.hu.bu     vr15,   vr11  //q1_org_h
+
+    AVC_LPF_P0_OR_Q0 vr13, vr15, vr12, vr16, vr18
+    AVC_LPF_P0_OR_Q0 vr14, vr12, vr15, vr17, vr18
+
+    vpickev.b       vr18,   vr16,   vr16
+    vpickev.b       vr19,   vr17,   vr17
+    vbitsel.v       vr9,    vr9,    vr18,   vr7
+    vbitsel.v       vr10,   vr10,   vr19,   vr7
+.END_H_CHROMA_INTRA_8:
+    vilvl.b         vr11,   vr10,   vr9
+    addi.d          t4,     t4,     1
+    vstelm.h        vr11,   t4,     0,      0
+    add.d           t4,     t4,     a1
+    vstelm.h        vr11,   t4,     0,      1
+    add.d           t4,     t4,     a1
+    vstelm.h        vr11,   t4,     0,      2
+    add.d           t4,     t4,     a1
+    vstelm.h        vr11,   t4,     0,      3
+    add.d           t4,     t4,     a1
+    vstelm.h        vr11,   t4,     0,      4
+    add.d           t4,     t4,     a1
+    vstelm.h        vr11,   t4,     0,      5
+    add.d           t4,     t4,     a1
+    vstelm.h        vr11,   t4,     0,      6
+    add.d           t4,     t4,     a1
+    vstelm.h        vr11,   t4,     0,      7
+endfunc
+
+function ff_h264_v_lpf_chroma_intra_8_lsx
+    slli.d          t0,     a1,     1    //img_width_2x
+    sub.d           t2,     a0,     a1
+    sub.d           t1,     a0,     t0   //data - img_width_2x
+
+    vreplgr2vr.b    vr0,    a2
+    vreplgr2vr.b    vr1,    a3
+
+    vld             vr2,    t1,     0   //p1_org
+    vldx            vr3,    t1,     a1  //p0_org
+    vld             vr4,    a0,     0   //q0_org
+    vldx            vr5,    a0,     a1  //q1_org
+
+    vabsd.bu        vr6,    vr3,    vr4  //p0_asub_q0
+    vabsd.bu        vr7,    vr2,    vr3  //p1_asub_p0
+    vabsd.bu        vr8,    vr5,    vr4  //q1_asub_q0
+
+    vslt.bu         vr9,    vr6,    vr0  //is_less_than_alpha
+    vslt.bu         vr10,   vr7,    vr1  //is_less_than_beta
+    vand.v          vr11,   vr9,    vr10  //is_less_than
+    vslt.bu         vr10,   vr8,    vr1
+    vand.v          vr11,   vr10,   vr11
+
+    vsetnez.v       $fcc0,  vr11
+    bceqz           $fcc0,  .END_V_CHROMA_INTRA_8
+
+    vsllwil.hu.bu   vr6,    vr2,    0  //p1_org_h.0
+    vsllwil.hu.bu   vr8,    vr3,    0  //p0_org_h.0
+    vsllwil.hu.bu   vr13,   vr4,    0  //q0_org_h.0
+    vsllwil.hu.bu   vr15,   vr5,    0  //q1_org_h.0
+
+    AVC_LPF_P0_OR_Q0 vr8, vr15, vr6, vr17, vr23
+    AVC_LPF_P0_OR_Q0 vr13, vr6, vr15, vr18, vr23
+
+    vpickev.b       vr19,   vr17,   vr17
+    vpickev.b       vr20,   vr18,   vr18
+    vbitsel.v       vr3,    vr3,    vr19,   vr11
+    vbitsel.v       vr4,    vr4,    vr20,   vr11
+
+    vstelm.d        vr3,    t2,     0,      0
+    vstelm.d        vr4,    a0,     0,      0
+.END_V_CHROMA_INTRA_8:
+endfunc
+
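+//Bi-directional weighted prediction for 16-pixel-wide blocks.
+//Assumed prototype (biweight_h264_pixels_tab entry):
+//  void ff_biweight_h264_pixels16_8_lsx(uint8_t *dst, uint8_t *src,
+//      ptrdiff_t stride, int height, int log2_denom, int weightd,
+//      int weights, int offset)
+//The a7/a4 setup below forms the rounding term ((offset + 1) | 1) << log2_denom
+//and the shift log2_denom + 1, so each output is
+//  dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + round) >> (log2_denom + 1))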
+function ff_biweight_h264_pixels16_8_lsx
+    slli.d          t0,     a2,     1
+    slli.d          t2,     a2,     2
+    add.d           t1,     t0,     a2
+    addi.d          a7,     a7,     1
+    ori             a7,     a7,     1
+    sll.d           a7,     a7,     a4
+    addi.d          a4,     a4,     1
+
+    vreplgr2vr.b    vr0,    a6    //tmp0
+    vreplgr2vr.b    vr1,    a5    //tmp1
+    vreplgr2vr.h    vr8,    a7    //offset
+    vreplgr2vr.h    vr9,    a4    //denom
+    vilvh.b         vr20,   vr1,    vr0    //wgt
+
+    add.d           t4,     a1,     t2
+    vld             vr0,    a1,     0
+    vldx            vr1,    a1,     a2
+    vldx            vr2,    a1,     t0
+    vldx            vr3,    a1,     t1
+    vld             vr4,    t4,     0
+    vldx            vr5,    t4,     a2
+    vldx            vr6,    t4,     t0
+    vldx            vr7,    t4,     t1
+
+    add.d           t5,     a0,     t2
+    vld             vr10,   a0,     0
+    vldx            vr11,   a0,     a2
+    vldx            vr12,   a0,     t0
+    vldx            vr13,   a0,     t1
+    vld             vr14,   t5,     0
+    vldx            vr15,   t5,     a2
+    vldx            vr16,   t5,     t0
+    vldx            vr17,   t5,     t1
+
+    vilvl.b         vr18,   vr10,   vr0
+    vilvl.b         vr19,   vr11,   vr1
+    vilvl.b         vr21,   vr12,   vr2
+    vilvl.b         vr22,   vr13,   vr3
+    vilvh.b         vr0,    vr10,   vr0
+    vilvh.b         vr1,    vr11,   vr1
+    vilvh.b         vr2,    vr12,   vr2
+    vilvh.b         vr3,    vr13,   vr3
+
+    vilvl.b         vr10,   vr14,   vr4
+    vilvl.b         vr11,   vr15,   vr5
+    vilvl.b         vr12,   vr16,   vr6
+    vilvl.b         vr13,   vr17,   vr7
+    vilvh.b         vr14,   vr14,   vr4
+    vilvh.b         vr15,   vr15,   vr5
+    vilvh.b         vr16,   vr16,   vr6
+    vilvh.b         vr17,   vr17,   vr7
+
+    vmov            vr4,    vr8
+    vmov            vr5,    vr8
+    vmov            vr6,    vr8
+    vmov            vr7,    vr8
+    vmaddwev.h.bu.b vr4,    vr18,   vr20
+    vmaddwev.h.bu.b vr5,    vr19,   vr20
+    vmaddwev.h.bu.b vr6,    vr21,   vr20
+    vmaddwev.h.bu.b vr7,    vr22,   vr20
+    vmaddwod.h.bu.b vr4,    vr18,   vr20
+    vmaddwod.h.bu.b vr5,    vr19,   vr20
+    vmaddwod.h.bu.b vr6,    vr21,   vr20
+    vmaddwod.h.bu.b vr7,    vr22,   vr20
+    vmov            vr18,   vr8
+    vmov            vr19,   vr8
+    vmov            vr21,   vr8
+    vmov            vr22,   vr8
+    vmaddwev.h.bu.b vr18,   vr0,    vr20
+    vmaddwev.h.bu.b vr19,   vr1,    vr20
+    vmaddwev.h.bu.b vr21,   vr2,    vr20
+    vmaddwev.h.bu.b vr22,   vr3,    vr20
+    vmaddwod.h.bu.b vr18,   vr0,    vr20
+    vmaddwod.h.bu.b vr19,   vr1,    vr20
+    vmaddwod.h.bu.b vr21,   vr2,    vr20
+    vmaddwod.h.bu.b vr22,   vr3,    vr20
+    vmov            vr0,    vr8
+    vmov            vr1,    vr8
+    vmov            vr2,    vr8
+    vmov            vr3,    vr8
+    vmaddwev.h.bu.b vr0,    vr10,   vr20
+    vmaddwev.h.bu.b vr1,    vr11,   vr20
+    vmaddwev.h.bu.b vr2,    vr12,   vr20
+    vmaddwev.h.bu.b vr3,    vr13,   vr20
+    vmaddwod.h.bu.b vr0,    vr10,   vr20
+    vmaddwod.h.bu.b vr1,    vr11,   vr20
+    vmaddwod.h.bu.b vr2,    vr12,   vr20
+    vmaddwod.h.bu.b vr3,    vr13,   vr20
+    vmov            vr10,   vr8
+    vmov            vr11,   vr8
+    vmov            vr12,   vr8
+    vmov            vr13,   vr8
+    vmaddwev.h.bu.b vr10,   vr14,   vr20
+    vmaddwev.h.bu.b vr11,   vr15,   vr20
+    vmaddwev.h.bu.b vr12,   vr16,   vr20
+    vmaddwev.h.bu.b vr13,   vr17,   vr20
+    vmaddwod.h.bu.b vr10,   vr14,   vr20
+    vmaddwod.h.bu.b vr11,   vr15,   vr20
+    vmaddwod.h.bu.b vr12,   vr16,   vr20
+    vmaddwod.h.bu.b vr13,   vr17,   vr20
+
+    vssran.bu.h     vr4,    vr4,    vr9
+    vssran.bu.h     vr5,    vr5,    vr9
+    vssran.bu.h     vr6,    vr6,    vr9
+    vssran.bu.h     vr7,    vr7,    vr9
+    vssran.bu.h     vr18,   vr18,   vr9
+    vssran.bu.h     vr19,   vr19,   vr9
+    vssran.bu.h     vr21,   vr21,   vr9
+    vssran.bu.h     vr22,   vr22,   vr9
+    vssran.bu.h     vr0,    vr0,    vr9
+    vssran.bu.h     vr1,    vr1,    vr9
+    vssran.bu.h     vr2,    vr2,    vr9
+    vssran.bu.h     vr3,    vr3,    vr9
+    vssran.bu.h     vr10,   vr10,   vr9
+    vssran.bu.h     vr11,   vr11,   vr9
+    vssran.bu.h     vr12,   vr12,   vr9
+    vssran.bu.h     vr13,   vr13,   vr9
+
+    vilvl.d         vr4,    vr18,   vr4
+    vilvl.d         vr5,    vr19,   vr5
+    vilvl.d         vr6,    vr21,   vr6
+    vilvl.d         vr7,    vr22,   vr7
+    vilvl.d         vr0,    vr10,   vr0
+    vilvl.d         vr1,    vr11,   vr1
+    vilvl.d         vr2,    vr12,   vr2
+    vilvl.d         vr3,    vr13,   vr3
+
+    vst             vr4,    a0,     0
+    vstx            vr5,    a0,     a2
+    vstx            vr6,    a0,     t0
+    vstx            vr7,    a0,     t1
+    vst             vr0,    t5,     0
+    vstx            vr1,    t5,     a2
+    vstx            vr2,    t5,     t0
+    vstx            vr3,    t5,     t1
+
+    addi.d          t6,     zero,   16
+    bne             a3,     t6,     .END_BIWEIGHT_PIXELS16
+    add.d           t4,     t4,     t2
+    add.d           t5,     t5,     t2
+    vld             vr0,    t4,     0
+    vldx            vr1,    t4,     a2
+    vldx            vr2,    t4,     t0
+    vldx            vr3,    t4,     t1
+    add.d           t6,     t4,     t2
+    add.d           t7,     t5,     t2
+    vld             vr4,    t6,     0
+    vldx            vr5,    t6,     a2
+    vldx            vr6,    t6,     t0
+    vldx            vr7,    t6,     t1
+
+    vld             vr10,   t5,     0
+    vldx            vr11,   t5,     a2
+    vldx            vr12,   t5,     t0
+    vldx            vr13,   t5,     t1
+    vld             vr14,   t7,     0
+    vldx            vr15,   t7,     a2
+    vldx            vr16,   t7,     t0
+    vldx            vr17,   t7,     t1
+
+    vilvl.b         vr18,   vr10,   vr0
+    vilvl.b         vr19,   vr11,   vr1
+    vilvl.b         vr21,   vr12,   vr2
+    vilvl.b         vr22,   vr13,   vr3
+    vilvh.b         vr0,    vr10,   vr0
+    vilvh.b         vr1,    vr11,   vr1
+    vilvh.b         vr2,    vr12,   vr2
+    vilvh.b         vr3,    vr13,   vr3
+
+    vilvl.b         vr10,   vr14,   vr4
+    vilvl.b         vr11,   vr15,   vr5
+    vilvl.b         vr12,   vr16,   vr6
+    vilvl.b         vr13,   vr17,   vr7
+    vilvh.b         vr14,   vr14,   vr4
+    vilvh.b         vr15,   vr15,   vr5
+    vilvh.b         vr16,   vr16,   vr6
+    vilvh.b         vr17,   vr17,   vr7
+
+    vmov            vr4,    vr8
+    vmov            vr5,    vr8
+    vmov            vr6,    vr8
+    vmov            vr7,    vr8
+    vmaddwev.h.bu.b vr4,    vr18,   vr20
+    vmaddwev.h.bu.b vr5,    vr19,   vr20
+    vmaddwev.h.bu.b vr6,    vr21,   vr20
+    vmaddwev.h.bu.b vr7,    vr22,   vr20
+    vmaddwod.h.bu.b vr4,    vr18,   vr20
+    vmaddwod.h.bu.b vr5,    vr19,   vr20
+    vmaddwod.h.bu.b vr6,    vr21,   vr20
+    vmaddwod.h.bu.b vr7,    vr22,   vr20
+    vmov            vr18,   vr8
+    vmov            vr19,   vr8
+    vmov            vr21,   vr8
+    vmov            vr22,   vr8
+    vmaddwev.h.bu.b vr18,   vr0,    vr20
+    vmaddwev.h.bu.b vr19,   vr1,    vr20
+    vmaddwev.h.bu.b vr21,   vr2,    vr20
+    vmaddwev.h.bu.b vr22,   vr3,    vr20
+    vmaddwod.h.bu.b vr18,   vr0,    vr20
+    vmaddwod.h.bu.b vr19,   vr1,    vr20
+    vmaddwod.h.bu.b vr21,   vr2,    vr20
+    vmaddwod.h.bu.b vr22,   vr3,    vr20
+    vmov            vr0,    vr8
+    vmov            vr1,    vr8
+    vmov            vr2,    vr8
+    vmov            vr3,    vr8
+    vmaddwev.h.bu.b vr0,    vr10,   vr20
+    vmaddwev.h.bu.b vr1,    vr11,   vr20
+    vmaddwev.h.bu.b vr2,    vr12,   vr20
+    vmaddwev.h.bu.b vr3,    vr13,   vr20
+    vmaddwod.h.bu.b vr0,    vr10,   vr20
+    vmaddwod.h.bu.b vr1,    vr11,   vr20
+    vmaddwod.h.bu.b vr2,    vr12,   vr20
+    vmaddwod.h.bu.b vr3,    vr13,   vr20
+    vmov            vr10,   vr8
+    vmov            vr11,   vr8
+    vmov            vr12,   vr8
+    vmov            vr13,   vr8
+    vmaddwev.h.bu.b vr10,   vr14,   vr20
+    vmaddwev.h.bu.b vr11,   vr15,   vr20
+    vmaddwev.h.bu.b vr12,   vr16,   vr20
+    vmaddwev.h.bu.b vr13,   vr17,   vr20
+    vmaddwod.h.bu.b vr10,   vr14,   vr20
+    vmaddwod.h.bu.b vr11,   vr15,   vr20
+    vmaddwod.h.bu.b vr12,   vr16,   vr20
+    vmaddwod.h.bu.b vr13,   vr17,   vr20
+
+    vssran.bu.h     vr4,    vr4,    vr9
+    vssran.bu.h     vr5,    vr5,    vr9
+    vssran.bu.h     vr6,    vr6,    vr9
+    vssran.bu.h     vr7,    vr7,    vr9
+    vssran.bu.h     vr18,   vr18,   vr9
+    vssran.bu.h     vr19,   vr19,   vr9
+    vssran.bu.h     vr21,   vr21,   vr9
+    vssran.bu.h     vr22,   vr22,   vr9
+    vssran.bu.h     vr0,    vr0,    vr9
+    vssran.bu.h     vr1,    vr1,    vr9
+    vssran.bu.h     vr2,    vr2,    vr9
+    vssran.bu.h     vr3,    vr3,    vr9
+    vssran.bu.h     vr10,   vr10,   vr9
+    vssran.bu.h     vr11,   vr11,   vr9
+    vssran.bu.h     vr12,   vr12,   vr9
+    vssran.bu.h     vr13,   vr13,   vr9
+
+    vilvl.d         vr4,    vr18,   vr4
+    vilvl.d         vr5,    vr19,   vr5
+    vilvl.d         vr6,    vr21,   vr6
+    vilvl.d         vr7,    vr22,   vr7
+    vilvl.d         vr0,    vr10,   vr0
+    vilvl.d         vr1,    vr11,   vr1
+    vilvl.d         vr2,    vr12,   vr2
+    vilvl.d         vr3,    vr13,   vr3
+
+    vst             vr4,    t5,     0
+    vstx            vr5,    t5,     a2
+    vstx            vr6,    t5,     t0
+    vstx            vr7,    t5,     t1
+    vst             vr0,    t7,     0
+    vstx            vr1,    t7,     a2
+    vstx            vr2,    t7,     t0
+    vstx            vr3,    t7,     t1
+.END_BIWEIGHT_PIXELS16:
+endfunc
+
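+//LASX variant of the 16-wide biweight: same per-pixel weighting, with row
+//pairs packed into 256-bit registers via xvpermi.q so each multiply-accumulate
+//covers two rows.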
+function ff_biweight_h264_pixels16_8_lasx
+    slli.d           t0,     a2,     1
+    slli.d           t2,     a2,     2
+    add.d            t1,     t0,     a2
+    addi.d           a7,     a7,     1
+    ori              a7,     a7,     1
+    sll.d            a7,     a7,     a4
+    addi.d           a4,     a4,     1
+
+    xvreplgr2vr.b    xr0,    a6    //tmp0
+    xvreplgr2vr.b    xr1,    a5    //tmp1
+    xvreplgr2vr.h    xr8,    a7    //offset
+    xvreplgr2vr.h    xr9,    a4    //denom
+    xvilvh.b         xr20,   xr1,    xr0    //wgt
+
+    add.d            t4,     a1,     t2
+    vld              vr0,    a1,     0
+    vldx             vr1,    a1,     a2
+    vldx             vr2,    a1,     t0
+    vldx             vr3,    a1,     t1
+    vld              vr4,    t4,     0
+    vldx             vr5,    t4,     a2
+    vldx             vr6,    t4,     t0
+    vldx             vr7,    t4,     t1
+
+    add.d            t5,     a0,     t2
+    vld              vr10,   a0,     0
+    vldx             vr11,   a0,     a2
+    vldx             vr12,   a0,     t0
+    vldx             vr13,   a0,     t1
+    vld              vr14,   t5,     0
+    vldx             vr15,   t5,     a2
+    vldx             vr16,   t5,     t0
+    vldx             vr17,   t5,     t1
+
+    xvpermi.q        xr1,    xr0,    0x20
+    xvpermi.q        xr3,    xr2,    0x20
+    xvpermi.q        xr5,    xr4,    0x20
+    xvpermi.q        xr7,    xr6,    0x20
+
+    xvpermi.q        xr11,   xr10,   0x20
+    xvpermi.q        xr13,   xr12,   0x20
+    xvpermi.q        xr15,   xr14,   0x20
+    xvpermi.q        xr17,   xr16,   0x20
+
+    xvilvl.b         xr0,    xr11,   xr1   //vec0
+    xvilvl.b         xr2,    xr13,   xr3   //vec2
+    xvilvl.b         xr4,    xr15,   xr5   //vec4
+    xvilvl.b         xr6,    xr17,   xr7   //vec6
+
+    xvilvh.b         xr10,   xr11,   xr1   //vec1
+    xvilvh.b         xr12,   xr13,   xr3   //vec3
+    xvilvh.b         xr14,   xr15,   xr5   //vec5
+    xvilvh.b         xr16,   xr17,   xr7   //vec7
+
+    xmov             xr1,    xr8
+    xmov             xr3,    xr8
+    xmov             xr5,    xr8
+    xmov             xr7,    xr8
+    xvmaddwev.h.bu.b xr1,    xr0,   xr20
+    xvmaddwev.h.bu.b xr3,    xr2,   xr20
+    xvmaddwev.h.bu.b xr5,    xr4,   xr20
+    xvmaddwev.h.bu.b xr7,    xr6,   xr20
+    xvmaddwod.h.bu.b xr1,    xr0,   xr20
+    xvmaddwod.h.bu.b xr3,    xr2,   xr20
+    xvmaddwod.h.bu.b xr5,    xr4,   xr20
+    xvmaddwod.h.bu.b xr7,    xr6,   xr20
+    xmov             xr11,   xr8
+    xmov             xr13,   xr8
+    xmov             xr15,   xr8
+    xmov             xr17,   xr8
+    xvmaddwev.h.bu.b xr11,   xr10,  xr20
+    xvmaddwev.h.bu.b xr13,   xr12,  xr20
+    xvmaddwev.h.bu.b xr15,   xr14,  xr20
+    xvmaddwev.h.bu.b xr17,   xr16,  xr20
+    xvmaddwod.h.bu.b xr11,   xr10,  xr20
+    xvmaddwod.h.bu.b xr13,   xr12,  xr20
+    xvmaddwod.h.bu.b xr15,   xr14,  xr20
+    xvmaddwod.h.bu.b xr17,   xr16,  xr20
+
+    xvssran.bu.h     xr1,    xr1,   xr9   //vec0
+    xvssran.bu.h     xr3,    xr3,   xr9   //vec2
+    xvssran.bu.h     xr5,    xr5,   xr9   //vec4
+    xvssran.bu.h     xr7,    xr7,   xr9   //vec6
+    xvssran.bu.h     xr11,   xr11,  xr9   //vec1
+    xvssran.bu.h     xr13,   xr13,  xr9   //vec3
+    xvssran.bu.h     xr15,   xr15,  xr9   //vec5
+    xvssran.bu.h     xr17,   xr17,  xr9   //vec7
+
+    xvilvl.d         xr0,    xr11,   xr1
+    xvilvl.d         xr2,    xr13,   xr3
+    xvilvl.d         xr4,    xr15,   xr5
+    xvilvl.d         xr6,    xr17,   xr7
+
+    xvpermi.d        xr1,    xr0,    0x4E
+    xvpermi.d        xr3,    xr2,    0x4E
+    xvpermi.d        xr5,    xr4,    0x4E
+    xvpermi.d        xr7,    xr6,    0x4E
+    vst              vr0,    a0,     0
+    vstx             vr1,    a0,     a2
+    vstx             vr2,    a0,     t0
+    vstx             vr3,    a0,     t1
+    vst              vr4,    t5,     0
+    vstx             vr5,    t5,     a2
+    vstx             vr6,    t5,     t0
+    vstx             vr7,    t5,     t1
+
+    addi.d          t6,     zero,   16
+    bne             a3,     t6,     .END_BIWEIGHT_PIXELS16_LASX
+    add.d           t4,     t4,     t2
+    add.d           t5,     t5,     t2
+    vld             vr0,    t4,     0
+    vldx            vr1,    t4,     a2
+    vldx            vr2,    t4,     t0
+    vldx            vr3,    t4,     t1
+    add.d           t6,     t4,     t2
+    add.d           t7,     t5,     t2
+    vld             vr4,    t6,     0
+    vldx            vr5,    t6,     a2
+    vldx            vr6,    t6,     t0
+    vldx            vr7,    t6,     t1
+
+    vld             vr10,   t5,     0
+    vldx            vr11,   t5,     a2
+    vldx            vr12,   t5,     t0
+    vldx            vr13,   t5,     t1
+    vld             vr14,   t7,     0
+    vldx            vr15,   t7,     a2
+    vldx            vr16,   t7,     t0
+    vldx            vr17,   t7,     t1
+
+    xvpermi.q        xr1,    xr0,    0x20
+    xvpermi.q        xr3,    xr2,    0x20
+    xvpermi.q        xr5,    xr4,    0x20
+    xvpermi.q        xr7,    xr6,    0x20
+
+    xvpermi.q        xr11,   xr10,   0x20
+    xvpermi.q        xr13,   xr12,   0x20
+    xvpermi.q        xr15,   xr14,   0x20
+    xvpermi.q        xr17,   xr16,   0x20
+
+    xvilvl.b         xr0,    xr11,   xr1   //vec0
+    xvilvl.b         xr2,    xr13,   xr3   //vec2
+    xvilvl.b         xr4,    xr15,   xr5   //vec4
+    xvilvl.b         xr6,    xr17,   xr7   //vec6
+
+    xvilvh.b         xr10,   xr11,   xr1   //vec1
+    xvilvh.b         xr12,   xr13,   xr3   //vec3
+    xvilvh.b         xr14,   xr15,   xr5   //vec5
+    xvilvh.b         xr16,   xr17,   xr7   //vec7
+
+    xmov             xr1,    xr8
+    xmov             xr3,    xr8
+    xmov             xr5,    xr8
+    xmov             xr7,    xr8
+    xvmaddwev.h.bu.b xr1,    xr0,   xr20
+    xvmaddwev.h.bu.b xr3,    xr2,   xr20
+    xvmaddwev.h.bu.b xr5,    xr4,   xr20
+    xvmaddwev.h.bu.b xr7,    xr6,   xr20
+    xvmaddwod.h.bu.b xr1,    xr0,   xr20
+    xvmaddwod.h.bu.b xr3,    xr2,   xr20
+    xvmaddwod.h.bu.b xr5,    xr4,   xr20
+    xvmaddwod.h.bu.b xr7,    xr6,   xr20
+    xmov             xr11,   xr8
+    xmov             xr13,   xr8
+    xmov             xr15,   xr8
+    xmov             xr17,   xr8
+    xvmaddwev.h.bu.b xr11,   xr10,  xr20
+    xvmaddwev.h.bu.b xr13,   xr12,  xr20
+    xvmaddwev.h.bu.b xr15,   xr14,  xr20
+    xvmaddwev.h.bu.b xr17,   xr16,  xr20
+    xvmaddwod.h.bu.b xr11,   xr10,  xr20
+    xvmaddwod.h.bu.b xr13,   xr12,  xr20
+    xvmaddwod.h.bu.b xr15,   xr14,  xr20
+    xvmaddwod.h.bu.b xr17,   xr16,  xr20
+
+    xvssran.bu.h     xr1,    xr1,   xr9   //vec0
+    xvssran.bu.h     xr3,    xr3,   xr9   //vec2
+    xvssran.bu.h     xr5,    xr5,   xr9   //vec4
+    xvssran.bu.h     xr7,    xr7,   xr9   //vec6
+    xvssran.bu.h     xr11,   xr11,  xr9   //vec1
+    xvssran.bu.h     xr13,   xr13,  xr9   //vec3
+    xvssran.bu.h     xr15,   xr15,  xr9   //vec5
+    xvssran.bu.h     xr17,   xr17,  xr9   //vec7
+
+    xvilvl.d         xr0,    xr11,   xr1
+    xvilvl.d         xr2,    xr13,   xr3
+    xvilvl.d         xr4,    xr15,   xr5
+    xvilvl.d         xr6,    xr17,   xr7
+
+    xvpermi.d        xr1,    xr0,    0x4E
+    xvpermi.d        xr3,    xr2,    0x4E
+    xvpermi.d        xr5,    xr4,    0x4E
+    xvpermi.d        xr7,    xr6,    0x4E
+
+    vst              vr0,    t5,     0
+    vstx             vr1,    t5,     a2
+    vstx             vr2,    t5,     t0
+    vstx             vr3,    t5,     t1
+    vst              vr4,    t7,     0
+    vstx             vr5,    t7,     a2
+    vstx             vr6,    t7,     t0
+    vstx             vr7,    t7,     t1
+.END_BIWEIGHT_PIXELS16_LASX:
+endfunc
+
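+//8-pixel-wide biweight: processes 4 rows, 4 more when height >= 8, and the
+//remaining 8 when height == 16, with the same weighting as the 16-wide
+//version above.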
+function ff_biweight_h264_pixels8_8_lsx
+    slli.d           t0,     a2,     1
+    slli.d           t2,     a2,     2
+    add.d            t1,     t0,     a2
+    addi.d           a7,     a7,     1
+    ori              a7,     a7,     1
+    sll.d            a7,     a7,     a4
+    addi.d           a4,     a4,     1
+    addi.d           t3,     zero,   8
+
+    vreplgr2vr.b     vr0,    a6    //tmp0
+    vreplgr2vr.b     vr1,    a5    //tmp1
+    vreplgr2vr.h     vr8,    a7    //offset
+    vreplgr2vr.h     vr9,    a4    //denom
+    vilvh.b          vr20,   vr1,    vr0    //wgt
+
+    fld.d            f0,     a1,     0   //src0
+    fldx.d           f1,     a1,     a2  //src1
+    fldx.d           f2,     a1,     t0  //src2
+    fldx.d           f3,     a1,     t1  //src3
+    fld.d            f10,    a0,     0   //dst0
+    fldx.d           f11,    a0,     a2  //dst1
+    fldx.d           f12,    a0,     t0  //dst2
+    fldx.d           f13,    a0,     t1  //dst3
+
+    vilvl.d          vr4,    vr1,    vr0  //src0
+    vilvl.d          vr5,    vr3,    vr2  //src2
+    vilvl.d          vr6,    vr11,   vr10 //dst0
+    vilvl.d          vr7,    vr13,   vr12 //dst2
+
+    vilvl.b          vr0,    vr6,    vr4  //vec0.0
+    vilvh.b          vr1,    vr6,    vr4  //vec0.1
+    vilvl.b          vr2,    vr7,    vr5  //vec1.0
+    vilvh.b          vr3,    vr7,    vr5  //vec1.1
+
+    vmov             vr4,    vr8
+    vmov             vr5,    vr8
+    vmov             vr6,    vr8
+    vmov             vr7,    vr8
+    vmaddwev.h.bu.b  vr4,    vr0,    vr20
+    vmaddwev.h.bu.b  vr5,    vr1,    vr20
+    vmaddwev.h.bu.b  vr6,    vr2,    vr20
+    vmaddwev.h.bu.b  vr7,    vr3,    vr20
+    vmaddwod.h.bu.b  vr4,    vr0,    vr20
+    vmaddwod.h.bu.b  vr5,    vr1,    vr20
+    vmaddwod.h.bu.b  vr6,    vr2,    vr20
+    vmaddwod.h.bu.b  vr7,    vr3,    vr20
+
+    vssran.bu.h      vr0,    vr4,   vr9   //vec0
+    vssran.bu.h      vr1,    vr5,   vr9   //vec0
+    vssran.bu.h      vr2,    vr6,   vr9   //vec1
+    vssran.bu.h      vr3,    vr7,   vr9   //vec1
+
+    vilvl.d          vr4,    vr1,   vr0
+    vilvl.d          vr6,    vr3,   vr2
+
+    vbsrl.v          vr5,    vr4,   8
+    vbsrl.v          vr7,    vr6,   8
+
+    fst.d            f4,     a0,    0
+    fstx.d           f5,     a0,    a2
+    fstx.d           f6,     a0,    t0
+    fstx.d           f7,     a0,    t1
+
+    blt              a3,     t3,    .END_BIWEIGHT_H264_PIXELS8
+    addi.d           t3,     zero,   16
+    add.d            a1,     a1,     t2
+    add.d            a0,     a0,     t2
+
+    fld.d            f0,     a1,     0   //src0
+    fldx.d           f1,     a1,     a2  //src1
+    fldx.d           f2,     a1,     t0  //src2
+    fldx.d           f3,     a1,     t1  //src3
+    fld.d            f10,    a0,     0   //dst0
+    fldx.d           f11,    a0,     a2  //dst1
+    fldx.d           f12,    a0,     t0  //dst2
+    fldx.d           f13,    a0,     t1  //dst3
+
+    vilvl.d          vr4,    vr1,    vr0  //src0
+    vilvl.d          vr5,    vr3,    vr2  //src2
+    vilvl.d          vr6,    vr11,   vr10 //dst0
+    vilvl.d          vr7,    vr13,   vr12 //dst2
+
+    vilvl.b          vr0,    vr6,    vr4  //vec0.0
+    vilvh.b          vr1,    vr6,    vr4  //vec0.1
+    vilvl.b          vr2,    vr7,    vr5  //vec1.0
+    vilvh.b          vr3,    vr7,    vr5  //vec1.1
+
+    vmov             vr4,    vr8
+    vmov             vr5,    vr8
+    vmov             vr6,    vr8
+    vmov             vr7,    vr8
+    vmaddwev.h.bu.b  vr4,    vr0,    vr20
+    vmaddwev.h.bu.b  vr5,    vr1,    vr20
+    vmaddwev.h.bu.b  vr6,    vr2,    vr20
+    vmaddwev.h.bu.b  vr7,    vr3,    vr20
+    vmaddwod.h.bu.b  vr4,    vr0,    vr20
+    vmaddwod.h.bu.b  vr5,    vr1,    vr20
+    vmaddwod.h.bu.b  vr6,    vr2,    vr20
+    vmaddwod.h.bu.b  vr7,    vr3,    vr20
+
+    vssran.bu.h      vr0,    vr4,   vr9   //vec0
+    vssran.bu.h      vr1,    vr5,   vr9   //vec0
+    vssran.bu.h      vr2,    vr6,   vr9   //vec1
+    vssran.bu.h      vr3,    vr7,   vr9   //vec1
+
+    vilvl.d          vr4,    vr1,   vr0
+    vilvl.d          vr6,    vr3,   vr2
+
+    vbsrl.v          vr5,    vr4,   8
+    vbsrl.v          vr7,    vr6,   8
+
+    fst.d            f4,     a0,    0
+    fstx.d           f5,     a0,    a2
+    fstx.d           f6,     a0,    t0
+    fstx.d           f7,     a0,    t1
+    blt              a3,     t3,    .END_BIWEIGHT_H264_PIXELS8
+    add.d            a1,     a1,     t2
+    add.d            a0,     a0,     t2
+    add.d            t4,     a1,     t2
+    add.d            t5,     a0,     t2
+
+    fld.d            f0,     a1,     0   //src0
+    fldx.d           f1,     a1,     a2  //src1
+    fldx.d           f2,     a1,     t0  //src2
+    fldx.d           f3,     a1,     t1  //src3
+    fld.d            f4,     t4,     0   //src4
+    fldx.d           f5,     t4,     a2  //src5
+    fldx.d           f6,     t4,     t0  //src6
+    fldx.d           f7,     t4,     t1  //src7
+    fld.d            f10,    a0,     0   //dst0
+    fldx.d           f11,    a0,     a2  //dst1
+    fldx.d           f12,    a0,     t0  //dst2
+    fldx.d           f13,    a0,     t1  //dst3
+    fld.d            f14,    t5,     0   //dst4
+    fldx.d           f15,    t5,     a2  //dst5
+    fldx.d           f16,    t5,     t0  //dst6
+    fldx.d           f17,    t5,     t1  //dst7
+
+    vilvl.d          vr0,    vr1,    vr0  //src0
+    vilvl.d          vr2,    vr3,    vr2  //src2
+    vilvl.d          vr4,    vr5,    vr4  //src4
+    vilvl.d          vr6,    vr7,    vr6  //src6
+    vilvl.d          vr10,   vr11,   vr10 //dst0
+    vilvl.d          vr12,   vr13,   vr12 //dst2
+    vilvl.d          vr14,   vr15,   vr14 //dst4
+    vilvl.d          vr16,   vr17,   vr16 //dst6
+
+    vilvl.b          vr1,    vr10,   vr0  //vec0.0
+    vilvh.b          vr3,    vr10,   vr0  //vec0.1
+    vilvl.b          vr5,    vr12,   vr2  //vec2.0
+    vilvh.b          vr7,    vr12,   vr2  //vec2.1
+    vilvl.b          vr11,   vr14,   vr4  //vec4.0
+    vilvh.b          vr13,   vr14,   vr4  //vec4.1
+    vilvl.b          vr15,   vr16,   vr6  //vec6.0
+    vilvh.b          vr17,   vr16,   vr6  //vec6.1
+
+    vmov             vr0,    vr8
+    vmov             vr2,    vr8
+    vmov             vr4,    vr8
+    vmov             vr6,    vr8
+    vmaddwev.h.bu.b  vr0,    vr1,    vr20
+    vmaddwev.h.bu.b  vr2,    vr3,    vr20
+    vmaddwev.h.bu.b  vr4,    vr5,    vr20
+    vmaddwev.h.bu.b  vr6,    vr7,    vr20
+    vmaddwod.h.bu.b  vr0,    vr1,    vr20
+    vmaddwod.h.bu.b  vr2,    vr3,    vr20
+    vmaddwod.h.bu.b  vr4,    vr5,    vr20
+    vmaddwod.h.bu.b  vr6,    vr7,    vr20
+
+    vmov             vr10,   vr8
+    vmov             vr12,   vr8
+    vmov             vr14,   vr8
+    vmov             vr16,   vr8
+
+    vmaddwev.h.bu.b  vr10,   vr11,   vr20
+    vmaddwev.h.bu.b  vr12,   vr13,   vr20
+    vmaddwev.h.bu.b  vr14,   vr15,   vr20
+    vmaddwev.h.bu.b  vr16,   vr17,   vr20
+    vmaddwod.h.bu.b  vr10,   vr11,   vr20
+    vmaddwod.h.bu.b  vr12,   vr13,   vr20
+    vmaddwod.h.bu.b  vr14,   vr15,   vr20
+    vmaddwod.h.bu.b  vr16,   vr17,   vr20
+
+    vssran.bu.h      vr1,    vr0,   vr9   //vec0
+    vssran.bu.h      vr3,    vr2,   vr9   //vec0
+    vssran.bu.h      vr5,    vr4,   vr9   //vec2
+    vssran.bu.h      vr7,    vr6,   vr9   //vec2
+
+    vssran.bu.h      vr11,   vr10,  vr9   //vec4
+    vssran.bu.h      vr13,   vr12,  vr9   //vec4
+    vssran.bu.h      vr15,   vr14,  vr9   //vec6
+    vssran.bu.h      vr17,   vr16,  vr9   //vec6
+
+    vilvl.d          vr0,    vr3,   vr1
+    vilvl.d          vr2,    vr7,   vr5
+    vilvl.d          vr10,   vr13,  vr11
+    vilvl.d          vr12,   vr17,  vr15
+
+    vbsrl.v          vr1,    vr0,   8
+    vbsrl.v          vr3,    vr2,   8
+    vbsrl.v          vr11,   vr10,  8
+    vbsrl.v          vr13,   vr12,  8
+
+    fst.d            f0,     a0,    0
+    fstx.d           f1,     a0,    a2
+    fstx.d           f2,     a0,    t0
+    fstx.d           f3,     a0,    t1
+    fst.d            f10,    t5,    0
+    fstx.d           f11,    t5,    a2
+    fstx.d           f12,    t5,    t0
+    fstx.d           f13,    t5,    t1
+.END_BIWEIGHT_H264_PIXELS8:
+endfunc
+
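+//LASX variant of the 8-wide biweight, packing two 8-byte row pairs into each
+//256-bit register.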
+function ff_biweight_h264_pixels8_8_lasx
+    slli.d           t0,     a2,     1
+    slli.d           t2,     a2,     2
+    add.d            t1,     t0,     a2
+    addi.d           a7,     a7,     1
+    ori              a7,     a7,     1
+    sll.d            a7,     a7,     a4
+    addi.d           a4,     a4,     1
+    addi.d           t3,     zero,   8
+
+    xvreplgr2vr.b    xr0,    a6    //tmp0
+    xvreplgr2vr.b    xr1,    a5    //tmp1
+    xvreplgr2vr.h    xr8,    a7    //offset
+    xvreplgr2vr.h    xr9,    a4    //denom
+    xvilvh.b         xr20,   xr1,    xr0    //wgt
+
+    fld.d            f0,     a1,     0   //src0
+    fldx.d           f1,     a1,     a2  //src1
+    fldx.d           f2,     a1,     t0  //src2
+    fldx.d           f3,     a1,     t1  //src3
+    fld.d            f10,    a0,     0   //dst0
+    fldx.d           f11,    a0,     a2  //dst1
+    fldx.d           f12,    a0,     t0  //dst2
+    fldx.d           f13,    a0,     t1  //dst3
+
+    vilvl.d          vr4,    vr1,    vr0  //src0
+    vilvl.d          vr5,    vr3,    vr2  //src2
+    vilvl.d          vr6,    vr11,   vr10 //dst0
+    vilvl.d          vr7,    vr13,   vr12 //dst2
+
+    xvpermi.q        xr5,    xr4,    0x20
+    xvpermi.q        xr7,    xr6,    0x20
+
+    xvilvl.b         xr0,    xr7,    xr5
+    xvilvh.b         xr1,    xr7,    xr5
+
+    xmov             xr2,    xr8
+    xmov             xr3,    xr8
+    xvmaddwev.h.bu.b xr2,    xr0,    xr20
+    xvmaddwev.h.bu.b xr3,    xr1,    xr20
+    xvmaddwod.h.bu.b xr2,    xr0,    xr20
+    xvmaddwod.h.bu.b xr3,    xr1,    xr20
+
+    xvssran.bu.h     xr4,    xr2,   xr9   //vec0
+    xvssran.bu.h     xr5,    xr3,   xr9   //vec2
+
+    xvilvl.d         xr0,    xr5,   xr4
+    xvpermi.d        xr2,    xr0,   0x4E
+    vbsrl.v          vr1,    vr0,   8
+    vbsrl.v          vr3,    vr2,   8
+
+    fst.d            f0,     a0,    0
+    fstx.d           f1,     a0,    a2
+    fstx.d           f2,     a0,    t0
+    fstx.d           f3,     a0,    t1
+
+    blt              a3,     t3,    .END_BIWEIGHT_H264_PIXELS8_LASX
+    addi.d           t3,     zero,   16
+    add.d            a1,     a1,     t2
+    add.d            a0,     a0,     t2
+
+    fld.d            f0,     a1,     0   //src0
+    fldx.d           f1,     a1,     a2  //src1
+    fldx.d           f2,     a1,     t0  //src2
+    fldx.d           f3,     a1,     t1  //src3
+    fld.d            f10,    a0,     0   //dst0
+    fldx.d           f11,    a0,     a2  //dst1
+    fldx.d           f12,    a0,     t0  //dst2
+    fldx.d           f13,    a0,     t1  //dst3
+
+    vilvl.d          vr4,    vr1,    vr0  //src0
+    vilvl.d          vr5,    vr3,    vr2  //src2
+    vilvl.d          vr6,    vr11,   vr10 //dst0
+    vilvl.d          vr7,    vr13,   vr12 //dst2
+
+    xvpermi.q        xr5,    xr4,    0x20
+    xvpermi.q        xr7,    xr6,    0x20
+
+    xvilvl.b         xr0,    xr7,    xr5
+    xvilvh.b         xr1,    xr7,    xr5
+
+    xmov             xr2,    xr8
+    xmov             xr3,    xr8
+    xvmaddwev.h.bu.b xr2,    xr0,    xr20
+    xvmaddwev.h.bu.b xr3,    xr1,    xr20
+    xvmaddwod.h.bu.b xr2,    xr0,    xr20
+    xvmaddwod.h.bu.b xr3,    xr1,    xr20
+
+    xvssran.bu.h     xr4,    xr2,   xr9   //vec0
+    xvssran.bu.h     xr5,    xr3,   xr9   //vec2
+
+    xvilvl.d         xr0,    xr5,   xr4
+    xvpermi.d        xr2,    xr0,   0x4E
+    vbsrl.v          vr1,    vr0,   8
+    vbsrl.v          vr3,    vr2,   8
+
+    fst.d            f0,     a0,    0
+    fstx.d           f1,     a0,    a2
+    fstx.d           f2,     a0,    t0
+    fstx.d           f3,     a0,    t1
+    blt              a3,     t3,    .END_BIWEIGHT_H264_PIXELS8_LASX
+    add.d            a1,     a1,     t2
+    add.d            a0,     a0,     t2
+    add.d            t4,     a1,     t2
+    add.d            t5,     a0,     t2
+
+    fld.d            f0,     a1,     0   //src0
+    fldx.d           f1,     a1,     a2  //src1
+    fldx.d           f2,     a1,     t0  //src2
+    fldx.d           f3,     a1,     t1  //src3
+    fld.d            f4,     t4,     0   //src4
+    fldx.d           f5,     t4,     a2  //src5
+    fldx.d           f6,     t4,     t0  //src6
+    fldx.d           f7,     t4,     t1  //src7
+    fld.d            f10,    a0,     0   //dst0
+    fldx.d           f11,    a0,     a2  //dst1
+    fldx.d           f12,    a0,     t0  //dst2
+    fldx.d           f13,    a0,     t1  //dst3
+    fld.d            f14,    t5,     0   //dst4
+    fldx.d           f15,    t5,     a2  //dst5
+    fldx.d           f16,    t5,     t0  //dst6
+    fldx.d           f17,    t5,     t1  //dst7
+
+    vilvl.d          vr0,    vr1,    vr0  //src0
+    vilvl.d          vr2,    vr3,    vr2  //src2
+    vilvl.d          vr4,    vr5,    vr4  //src4
+    vilvl.d          vr6,    vr7,    vr6  //src6
+    vilvl.d          vr10,   vr11,   vr10 //dst0
+    vilvl.d          vr12,   vr13,   vr12 //dst2
+    vilvl.d          vr14,   vr15,   vr14 //dst4
+    vilvl.d          vr16,   vr17,   vr16 //dst6
+
+    xvpermi.q        xr2,    xr0,    0x20
+    xvpermi.q        xr6,    xr4,    0x20
+    xvpermi.q        xr12,   xr10,   0x20
+    xvpermi.q        xr16,   xr14,   0x20
+
+    xvilvl.b         xr0,    xr12,   xr2
+    xvilvh.b         xr1,    xr12,   xr2
+    xvilvl.b         xr10,   xr16,   xr6
+    xvilvh.b         xr11,   xr16,   xr6
+
+    xmov             xr2,    xr8
+    xmov             xr3,    xr8
+    xmov             xr12,   xr8
+    xmov             xr13,   xr8
+    xvmaddwev.h.bu.b xr2,    xr0,    xr20
+    xvmaddwev.h.bu.b xr3,    xr1,    xr20
+    xvmaddwev.h.bu.b xr12,   xr10,   xr20
+    xvmaddwev.h.bu.b xr13,   xr11,   xr20
+    xvmaddwod.h.bu.b xr2,    xr0,    xr20
+    xvmaddwod.h.bu.b xr3,    xr1,    xr20
+    xvmaddwod.h.bu.b xr12,   xr10,   xr20
+    xvmaddwod.h.bu.b xr13,   xr11,   xr20
+
+    xvssran.bu.h     xr4,    xr2,   xr9   //vec0
+    xvssran.bu.h     xr5,    xr3,   xr9   //vec2
+    xvssran.bu.h     xr14,   xr12,  xr9   //vec0
+    xvssran.bu.h     xr15,   xr13,  xr9   //vec2
+
+    xvilvl.d         xr0,    xr5,   xr4
+    xvilvl.d         xr10,   xr15,  xr14
+    xvpermi.d        xr2,    xr0,   0x4E
+    xvpermi.d        xr12,   xr10,  0x4E
+    vbsrl.v          vr1,    vr0,   8
+    vbsrl.v          vr3,    vr2,   8
+    vbsrl.v          vr11,   vr10,  8
+    vbsrl.v          vr13,   vr12,  8
+
+    fst.d            f0,     a0,    0
+    fstx.d           f1,     a0,    a2
+    fstx.d           f2,     a0,    t0
+    fstx.d           f3,     a0,    t1
+    fst.d            f10,    t5,    0
+    fstx.d           f11,    t5,    a2
+    fstx.d           f12,    t5,    t0
+    fstx.d           f13,    t5,    t1
+.END_BIWEIGHT_H264_PIXELS8_LASX:
+endfunc
+
+//The LSX implementation is sufficient for this function; no LASX version is needed.
+function ff_biweight_h264_pixels4_8_lsx
+    slli.d           t0,     a2,     1
+    slli.d           t2,     a2,     2
+    add.d            t1,     t0,     a2
+    addi.d           a7,     a7,     1
+    ori              a7,     a7,     1
+    sll.d            a7,     a7,     a4
+    addi.d           a4,     a4,     1
+    addi.d           t3,     zero,   4
+
+    vreplgr2vr.b     vr0,    a6    //tmp0
+    vreplgr2vr.b     vr1,    a5    //tmp1
+    vreplgr2vr.h     vr8,    a7    //offset
+    vreplgr2vr.h     vr9,    a4    //denom
+    vilvh.b          vr20,   vr1,    vr0    //wgt
+
+    fld.s            f0,     a1,     0
+    fldx.s           f1,     a1,     a2
+    fld.s            f10,    a0,     0
+    fldx.s           f11,    a0,     a2
+    vilvl.w          vr2,    vr1,    vr0
+    vilvl.w          vr12,   vr11,   vr10
+    vilvl.b          vr0,    vr12,   vr2
+
+    vmov             vr1,    vr8
+    vmaddwev.h.bu.b  vr1,    vr0,    vr20
+    vmaddwod.h.bu.b  vr1,    vr0,    vr20
+
+    vssran.bu.h      vr1,    vr1,    vr9   //vec0
+    vbsrl.v          vr2,    vr1,    4
+    fst.s            f1,     a0,     0
+    fstx.s           f2,     a0,     a2
+
+    blt              a3,     t3,    .END_BIWEIGHT_H264_PIXELS4
+    addi.d           t3,     zero,   8
+    fldx.s           f0,     a1,     t0
+    fldx.s           f1,     a1,     t1
+    fldx.s           f10,    a0,     t0
+    fldx.s           f11,    a0,     t1
+    vilvl.w          vr2,    vr1,    vr0
+    vilvl.w          vr12,   vr11,   vr10
+    vilvl.b          vr0,    vr12,   vr2
+
+    vmov             vr1,    vr8
+    vmaddwev.h.bu.b  vr1,    vr0,    vr20
+    vmaddwod.h.bu.b  vr1,    vr0,    vr20
+
+    vssran.bu.h      vr1,    vr1,    vr9   //vec0
+    vbsrl.v          vr2,    vr1,    4
+    fstx.s           f1,     a0,     t0
+    fstx.s           f2,     a0,     t1
+
+    blt              a3,     t3,    .END_BIWEIGHT_H264_PIXELS4
+    add.d            a1,     a1,     t2
+    add.d            a0,     a0,     t2
+    fld.s            f0,     a1,     0
+    fldx.s           f1,     a1,     a2
+    fldx.s           f2,     a1,     t0
+    fldx.s           f3,     a1,     t1
+    fld.s            f10,    a0,     0
+    fldx.s           f11,    a0,     a2
+    fldx.s           f12,    a0,     t0
+    fldx.s           f13,    a0,     t1
+    vilvl.w          vr4,    vr1,    vr0
+    vilvl.w          vr5,    vr3,    vr2
+    vilvl.w          vr14,   vr11,   vr10
+    vilvl.w          vr15,   vr13,   vr12
+
+    vilvl.b          vr0,    vr14,   vr4
+    vilvl.b          vr10,   vr15,   vr5
+
+    vmov             vr1,    vr8
+    vmov             vr11,   vr8
+    vmaddwev.h.bu.b  vr1,    vr0,    vr20
+    vmaddwev.h.bu.b  vr11,   vr10,   vr20
+    vmaddwod.h.bu.b  vr1,    vr0,    vr20
+    vmaddwod.h.bu.b  vr11,   vr10,   vr20
+
+    vssran.bu.h      vr0,    vr1,    vr9   //vec0
+    vssran.bu.h      vr10,   vr11,   vr9   //vec0
+    vbsrl.v          vr2,    vr0,    4
+    vbsrl.v          vr12,   vr10,   4
+
+    fst.s            f0,     a0,     0
+    fstx.s           f2,     a0,     a2
+    fstx.s           f10,    a0,     t0
+    fstx.s           f12,    a0,     t1
+.END_BIWEIGHT_H264_PIXELS4:
+endfunc
+
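+//Note: the weight functions assume the standard H264DSPContext
+//weight_h264_pixels_tab prototype, so a0 = block, a1 = stride, a2 = height,
+//a3 = log2_denom, a4 = weight, a5 = offset.  Rows are processed in groups,
+//and the compares of a2 against t3 exit early when fewer rows remain.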
+function ff_weight_h264_pixels16_8_lsx
+    slli.d           t0,     a1,     1
+    slli.d           t2,     a1,     2
+    add.d            t1,     t0,     a1
+    addi.d           t3,     zero,   16
+
+    sll.d            a5,     a5,     a3
+    vreplgr2vr.h     vr20,   a4      //weight
+    vreplgr2vr.h     vr8,    a5      //offset
+    vreplgr2vr.h     vr9,    a3      //log2_denom
+    vldi             vr23,   0
+
+    add.d            t4,     a0,     t2
+    vld              vr0,    a0,     0
+    vldx             vr1,    a0,     a1
+    vldx             vr2,    a0,     t0
+    vldx             vr3,    a0,     t1
+    vld              vr4,    t4,     0
+    vldx             vr5,    t4,     a1
+    vldx             vr6,    t4,     t0
+    vldx             vr7,    t4,     t1
+
+    vilvl.b          vr10,   vr23,   vr0
+    vilvl.b          vr11,   vr23,   vr1
+    vilvl.b          vr12,   vr23,   vr2
+    vilvl.b          vr13,   vr23,   vr3
+    vilvl.b          vr14,   vr23,   vr4
+    vilvl.b          vr15,   vr23,   vr5
+    vilvl.b          vr16,   vr23,   vr6
+    vilvl.b          vr17,   vr23,   vr7
+
+    vmul.h           vr10,   vr10,   vr20
+    vmul.h           vr11,   vr11,   vr20
+    vmul.h           vr12,   vr12,   vr20
+    vmul.h           vr13,   vr13,   vr20
+    vmul.h           vr14,   vr14,   vr20
+    vmul.h           vr15,   vr15,   vr20
+    vmul.h           vr16,   vr16,   vr20
+    vmul.h           vr17,   vr17,   vr20
+    vsadd.h          vr10,   vr8,    vr10
+    vsadd.h          vr11,   vr8,    vr11
+    vsadd.h          vr12,   vr8,    vr12
+    vsadd.h          vr13,   vr8,    vr13
+    vsadd.h          vr14,   vr8,    vr14
+    vsadd.h          vr15,   vr8,    vr15
+    vsadd.h          vr16,   vr8,    vr16
+    vsadd.h          vr17,   vr8,    vr17
+
+    vilvh.b          vr18,   vr23,   vr0
+    vilvh.b          vr19,   vr23,   vr1
+    vilvh.b          vr21,   vr23,   vr2
+    vilvh.b          vr22,   vr23,   vr3
+    vilvh.b          vr0,    vr23,   vr4
+    vilvh.b          vr1,    vr23,   vr5
+    vilvh.b          vr2,    vr23,   vr6
+    vilvh.b          vr3,    vr23,   vr7
+    vmul.h           vr18,   vr18,   vr20
+    vmul.h           vr19,   vr19,   vr20
+    vmul.h           vr21,   vr21,   vr20
+    vmul.h           vr22,   vr22,   vr20
+    vmul.h           vr0,    vr0,    vr20
+    vmul.h           vr1,    vr1,    vr20
+    vmul.h           vr2,    vr2,    vr20
+    vmul.h           vr3,    vr3,    vr20
+    vsadd.h          vr18,   vr8,    vr18
+    vsadd.h          vr19,   vr8,    vr19
+    vsadd.h          vr21,   vr8,    vr21
+    vsadd.h          vr22,   vr8,    vr22
+    vsadd.h          vr0,    vr8,    vr0
+    vsadd.h          vr1,    vr8,    vr1
+    vsadd.h          vr2,    vr8,    vr2
+    vsadd.h          vr3,    vr8,    vr3
+
+    vssrarn.bu.h     vr10,   vr10,   vr9
+    vssrarn.bu.h     vr11,   vr11,   vr9
+    vssrarn.bu.h     vr12,   vr12,   vr9
+    vssrarn.bu.h     vr13,   vr13,   vr9
+    vssrarn.bu.h     vr14,   vr14,   vr9
+    vssrarn.bu.h     vr15,   vr15,   vr9
+    vssrarn.bu.h     vr16,   vr16,   vr9
+    vssrarn.bu.h     vr17,   vr17,   vr9
+    vssrarn.bu.h     vr4,    vr18,   vr9
+    vssrarn.bu.h     vr5,    vr19,   vr9
+    vssrarn.bu.h     vr6,    vr21,   vr9
+    vssrarn.bu.h     vr7,    vr22,   vr9
+    vssrarn.bu.h     vr0,    vr0,    vr9
+    vssrarn.bu.h     vr1,    vr1,    vr9
+    vssrarn.bu.h     vr2,    vr2,    vr9
+    vssrarn.bu.h     vr3,    vr3,    vr9
+
+    vilvl.d          vr10,   vr4,    vr10
+    vilvl.d          vr11,   vr5,    vr11
+    vilvl.d          vr12,   vr6,    vr12
+    vilvl.d          vr13,   vr7,    vr13
+    vilvl.d          vr14,   vr0,    vr14
+    vilvl.d          vr15,   vr1,    vr15
+    vilvl.d          vr16,   vr2,    vr16
+    vilvl.d          vr17,   vr3,    vr17
+
+    vst              vr10,   a0,     0
+    vstx             vr11,   a0,     a1
+    vstx             vr12,   a0,     t0
+    vstx             vr13,   a0,     t1
+    vst              vr14,   t4,     0
+    vstx             vr15,   t4,     a1
+    vstx             vr16,   t4,     t0
+    vstx             vr17,   t4,     t1
+
+    bne              a2,     t3,     .END_WEIGHT_H264_PIXELS16_8
+    add.d            a0,     t4,     t2
+    add.d            t4,     a0,     t2
+    vld              vr0,    a0,     0
+    vldx             vr1,    a0,     a1
+    vldx             vr2,    a0,     t0
+    vldx             vr3,    a0,     t1
+    vld              vr4,    t4,     0
+    vldx             vr5,    t4,     a1
+    vldx             vr6,    t4,     t0
+    vldx             vr7,    t4,     t1
+
+    vilvl.b          vr10,   vr23,   vr0
+    vilvl.b          vr11,   vr23,   vr1
+    vilvl.b          vr12,   vr23,   vr2
+    vilvl.b          vr13,   vr23,   vr3
+    vilvl.b          vr14,   vr23,   vr4
+    vilvl.b          vr15,   vr23,   vr5
+    vilvl.b          vr16,   vr23,   vr6
+    vilvl.b          vr17,   vr23,   vr7
+
+    vmul.h           vr10,   vr10,   vr20
+    vmul.h           vr11,   vr11,   vr20
+    vmul.h           vr12,   vr12,   vr20
+    vmul.h           vr13,   vr13,   vr20
+    vmul.h           vr14,   vr14,   vr20
+    vmul.h           vr15,   vr15,   vr20
+    vmul.h           vr16,   vr16,   vr20
+    vmul.h           vr17,   vr17,   vr20
+    vsadd.h          vr10,   vr8,    vr10
+    vsadd.h          vr11,   vr8,    vr11
+    vsadd.h          vr12,   vr8,    vr12
+    vsadd.h          vr13,   vr8,    vr13
+    vsadd.h          vr14,   vr8,    vr14
+    vsadd.h          vr15,   vr8,    vr15
+    vsadd.h          vr16,   vr8,    vr16
+    vsadd.h          vr17,   vr8,    vr17
+
+    vilvh.b          vr18,   vr23,   vr0
+    vilvh.b          vr19,   vr23,   vr1
+    vilvh.b          vr21,   vr23,   vr2
+    vilvh.b          vr22,   vr23,   vr3
+    vilvh.b          vr0,    vr23,   vr4
+    vilvh.b          vr1,    vr23,   vr5
+    vilvh.b          vr2,    vr23,   vr6
+    vilvh.b          vr3,    vr23,   vr7
+    vmul.h           vr18,   vr18,   vr20
+    vmul.h           vr19,   vr19,   vr20
+    vmul.h           vr21,   vr21,   vr20
+    vmul.h           vr22,   vr22,   vr20
+    vmul.h           vr0,    vr0,    vr20
+    vmul.h           vr1,    vr1,    vr20
+    vmul.h           vr2,    vr2,    vr20
+    vmul.h           vr3,    vr3,    vr20
+    vsadd.h          vr18,   vr8,    vr18
+    vsadd.h          vr19,   vr8,    vr19
+    vsadd.h          vr21,   vr8,    vr21
+    vsadd.h          vr22,   vr8,    vr22
+    vsadd.h          vr0,    vr8,    vr0
+    vsadd.h          vr1,    vr8,    vr1
+    vsadd.h          vr2,    vr8,    vr2
+    vsadd.h          vr3,    vr8,    vr3
+
+    vssrarn.bu.h     vr10,   vr10,   vr9
+    vssrarn.bu.h     vr11,   vr11,   vr9
+    vssrarn.bu.h     vr12,   vr12,   vr9
+    vssrarn.bu.h     vr13,   vr13,   vr9
+    vssrarn.bu.h     vr14,   vr14,   vr9
+    vssrarn.bu.h     vr15,   vr15,   vr9
+    vssrarn.bu.h     vr16,   vr16,   vr9
+    vssrarn.bu.h     vr17,   vr17,   vr9
+    vssrarn.bu.h     vr4,    vr18,   vr9
+    vssrarn.bu.h     vr5,    vr19,   vr9
+    vssrarn.bu.h     vr6,    vr21,   vr9
+    vssrarn.bu.h     vr7,    vr22,   vr9
+    vssrarn.bu.h     vr0,    vr0,    vr9
+    vssrarn.bu.h     vr1,    vr1,    vr9
+    vssrarn.bu.h     vr2,    vr2,    vr9
+    vssrarn.bu.h     vr3,    vr3,    vr9
+
+    vilvl.d          vr10,   vr4,    vr10
+    vilvl.d          vr11,   vr5,    vr11
+    vilvl.d          vr12,   vr6,    vr12
+    vilvl.d          vr13,   vr7,    vr13
+    vilvl.d          vr14,   vr0,    vr14
+    vilvl.d          vr15,   vr1,    vr15
+    vilvl.d          vr16,   vr2,    vr16
+    vilvl.d          vr17,   vr3,    vr17
+
+    vst              vr10,   a0,     0
+    vstx             vr11,   a0,     a1
+    vstx             vr12,   a0,     t0
+    vstx             vr13,   a0,     t1
+    vst              vr14,   t4,     0
+    vstx             vr15,   t4,     a1
+    vstx             vr16,   t4,     t0
+    vstx             vr17,   t4,     t1
+.END_WEIGHT_H264_PIXELS16_8:
+endfunc
+
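+//LASX variant of the function above: each 16-byte row is widened to 16
+//halfwords in one 256-bit register with vext2xv.hu.bu instead of the
+//vilvl.b/vilvh.b pair, and xvpermi.d 0xD8 regroups the narrowed result
+//before the 128-bit store.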
+function ff_weight_h264_pixels16_8_lasx
+    slli.d           t0,     a1,     1
+    slli.d           t2,     a1,     2
+    add.d            t1,     t0,     a1
+    addi.d           t3,     zero,   16
+
+    sll.d            a5,     a5,     a3
+    xvreplgr2vr.h    xr20,   a4      //weight
+    xvreplgr2vr.h    xr8,    a5      //offset
+    xvreplgr2vr.h    xr9,    a3      //log2_denom
+
+    add.d            t4,     a0,     t2
+    vld              vr0,    a0,     0
+    vldx             vr1,    a0,     a1
+    vldx             vr2,    a0,     t0
+    vldx             vr3,    a0,     t1
+    vld              vr4,    t4,     0
+    vldx             vr5,    t4,     a1
+    vldx             vr6,    t4,     t0
+    vldx             vr7,    t4,     t1
+
+    vext2xv.hu.bu    xr0,    xr0
+    vext2xv.hu.bu    xr1,    xr1
+    vext2xv.hu.bu    xr2,    xr2
+    vext2xv.hu.bu    xr3,    xr3
+    vext2xv.hu.bu    xr4,    xr4
+    vext2xv.hu.bu    xr5,    xr5
+    vext2xv.hu.bu    xr6,    xr6
+    vext2xv.hu.bu    xr7,    xr7
+    xvmul.h          xr10,   xr0,    xr20
+    xvmul.h          xr11,   xr1,    xr20
+    xvmul.h          xr12,   xr2,    xr20
+    xvmul.h          xr13,   xr3,    xr20
+    xvmul.h          xr14,   xr4,    xr20
+    xvmul.h          xr15,   xr5,    xr20
+    xvmul.h          xr16,   xr6,    xr20
+    xvmul.h          xr17,   xr7,    xr20
+    xvsadd.h         xr10,   xr8,    xr10
+    xvsadd.h         xr11,   xr8,    xr11
+    xvsadd.h         xr12,   xr8,    xr12
+    xvsadd.h         xr13,   xr8,    xr13
+    xvsadd.h         xr14,   xr8,    xr14
+    xvsadd.h         xr15,   xr8,    xr15
+    xvsadd.h         xr16,   xr8,    xr16
+    xvsadd.h         xr17,   xr8,    xr17
+
+    xvssrarn.bu.h    xr10,   xr10,   xr9
+    xvssrarn.bu.h    xr11,   xr11,   xr9
+    xvssrarn.bu.h    xr12,   xr12,   xr9
+    xvssrarn.bu.h    xr13,   xr13,   xr9
+    xvssrarn.bu.h    xr14,   xr14,   xr9
+    xvssrarn.bu.h    xr15,   xr15,   xr9
+    xvssrarn.bu.h    xr16,   xr16,   xr9
+    xvssrarn.bu.h    xr17,   xr17,   xr9
+
+    xvpermi.d        xr10,   xr10,   0xD8
+    xvpermi.d        xr11,   xr11,   0xD8
+    xvpermi.d        xr12,   xr12,   0xD8
+    xvpermi.d        xr13,   xr13,   0xD8
+    xvpermi.d        xr14,   xr14,   0xD8
+    xvpermi.d        xr15,   xr15,   0xD8
+    xvpermi.d        xr16,   xr16,   0xD8
+    xvpermi.d        xr17,   xr17,   0xD8
+
+    vst              vr10,   a0,     0
+    vstx             vr11,   a0,     a1
+    vstx             vr12,   a0,     t0
+    vstx             vr13,   a0,     t1
+    vst              vr14,   t4,     0
+    vstx             vr15,   t4,     a1
+    vstx             vr16,   t4,     t0
+    vstx             vr17,   t4,     t1
+
+    bne              a2,     t3,     .END_WEIGHT_H264_PIXELS16_8_LASX
+    add.d            a0,     t4,     t2
+    add.d            t4,     a0,     t2
+    vld              vr0,    a0,     0
+    vldx             vr1,    a0,     a1
+    vldx             vr2,    a0,     t0
+    vldx             vr3,    a0,     t1
+    vld              vr4,    t4,     0
+    vldx             vr5,    t4,     a1
+    vldx             vr6,    t4,     t0
+    vldx             vr7,    t4,     t1
+
+    vext2xv.hu.bu    xr0,    xr0
+    vext2xv.hu.bu    xr1,    xr1
+    vext2xv.hu.bu    xr2,    xr2
+    vext2xv.hu.bu    xr3,    xr3
+    vext2xv.hu.bu    xr4,    xr4
+    vext2xv.hu.bu    xr5,    xr5
+    vext2xv.hu.bu    xr6,    xr6
+    vext2xv.hu.bu    xr7,    xr7
+    xvmul.h          xr10,   xr0,    xr20
+    xvmul.h          xr11,   xr1,    xr20
+    xvmul.h          xr12,   xr2,    xr20
+    xvmul.h          xr13,   xr3,    xr20
+    xvmul.h          xr14,   xr4,    xr20
+    xvmul.h          xr15,   xr5,    xr20
+    xvmul.h          xr16,   xr6,    xr20
+    xvmul.h          xr17,   xr7,    xr20
+    xvsadd.h         xr10,   xr8,    xr10
+    xvsadd.h         xr11,   xr8,    xr11
+    xvsadd.h         xr12,   xr8,    xr12
+    xvsadd.h         xr13,   xr8,    xr13
+    xvsadd.h         xr14,   xr8,    xr14
+    xvsadd.h         xr15,   xr8,    xr15
+    xvsadd.h         xr16,   xr8,    xr16
+    xvsadd.h         xr17,   xr8,    xr17
+
+    xvssrarn.bu.h    xr10,   xr10,   xr9
+    xvssrarn.bu.h    xr11,   xr11,   xr9
+    xvssrarn.bu.h    xr12,   xr12,   xr9
+    xvssrarn.bu.h    xr13,   xr13,   xr9
+    xvssrarn.bu.h    xr14,   xr14,   xr9
+    xvssrarn.bu.h    xr15,   xr15,   xr9
+    xvssrarn.bu.h    xr16,   xr16,   xr9
+    xvssrarn.bu.h    xr17,   xr17,   xr9
+
+    xvpermi.d        xr10,   xr10,   0xD8
+    xvpermi.d        xr11,   xr11,   0xD8
+    xvpermi.d        xr12,   xr12,   0xD8
+    xvpermi.d        xr13,   xr13,   0xD8
+    xvpermi.d        xr14,   xr14,   0xD8
+    xvpermi.d        xr15,   xr15,   0xD8
+    xvpermi.d        xr16,   xr16,   0xD8
+    xvpermi.d        xr17,   xr17,   0xD8
+
+    vst              vr10,   a0,     0
+    vstx             vr11,   a0,     a1
+    vstx             vr12,   a0,     t0
+    vstx             vr13,   a0,     t1
+    vst              vr14,   t4,     0
+    vstx             vr15,   t4,     a1
+    vstx             vr16,   t4,     t0
+    vstx             vr17,   t4,     t1
+.END_WEIGHT_H264_PIXELS16_8_LASX:
+endfunc
+
+function ff_weight_h264_pixels8_8_lsx
+    slli.d           t0,     a1,     1
+    slli.d           t2,     a1,     2
+    add.d            t1,     t0,     a1
+    addi.d           t3,     zero,   8
+
+    sll.d            a5,     a5,     a3
+    vreplgr2vr.h     vr20,   a4      //weight
+    vreplgr2vr.h     vr8,    a5      //offset
+    vreplgr2vr.h     vr9,    a3      //log2_denom
+    vldi             vr21,   0
+
+    fld.d            f0,     a0,     0
+    fldx.d           f1,     a0,     a1
+    fldx.d           f2,     a0,     t0
+    fldx.d           f3,     a0,     t1
+
+    vilvl.b          vr10,   vr21,   vr0
+    vilvl.b          vr11,   vr21,   vr1
+    vilvl.b          vr12,   vr21,   vr2
+    vilvl.b          vr13,   vr21,   vr3
+
+    vmul.h           vr10,   vr10,   vr20
+    vmul.h           vr11,   vr11,   vr20
+    vmul.h           vr12,   vr12,   vr20
+    vmul.h           vr13,   vr13,   vr20
+    vsadd.h          vr0,    vr8,    vr10
+    vsadd.h          vr1,    vr8,    vr11
+    vsadd.h          vr2,    vr8,    vr12
+    vsadd.h          vr3,    vr8,    vr13
+
+    vssrarn.bu.h     vr0,    vr0,    vr9
+    vssrarn.bu.h     vr1,    vr1,    vr9
+    vssrarn.bu.h     vr2,    vr2,    vr9
+    vssrarn.bu.h     vr3,    vr3,    vr9
+
+    fst.d            f0,     a0,     0
+    fstx.d           f1,     a0,     a1
+    fstx.d           f2,     a0,     t0
+    fstx.d           f3,     a0,     t1
+
+    blt              a2,     t3,    .END_WEIGHT_H264_PIXELS8
+    add.d            a0,     a0,     t2
+    addi.d           t3,     zero,   16
+    fld.d            f0,     a0,     0
+    fldx.d           f1,     a0,     a1
+    fldx.d           f2,     a0,     t0
+    fldx.d           f3,     a0,     t1
+
+    vilvl.b          vr10,   vr21,   vr0
+    vilvl.b          vr11,   vr21,   vr1
+    vilvl.b          vr12,   vr21,   vr2
+    vilvl.b          vr13,   vr21,   vr3
+
+    vmul.h           vr10,   vr10,   vr20
+    vmul.h           vr11,   vr11,   vr20
+    vmul.h           vr12,   vr12,   vr20
+    vmul.h           vr13,   vr13,   vr20
+    vsadd.h          vr0,    vr8,    vr10
+    vsadd.h          vr1,    vr8,    vr11
+    vsadd.h          vr2,    vr8,    vr12
+    vsadd.h          vr3,    vr8,    vr13
+
+    vssrarn.bu.h     vr0,    vr0,    vr9
+    vssrarn.bu.h     vr1,    vr1,    vr9
+    vssrarn.bu.h     vr2,    vr2,    vr9
+    vssrarn.bu.h     vr3,    vr3,    vr9
+
+    fst.d            f0,     a0,     0
+    fstx.d           f1,     a0,     a1
+    fstx.d           f2,     a0,     t0
+    fstx.d           f3,     a0,     t1
+    blt              a2,     t3,    .END_WEIGHT_H264_PIXELS8
+    add.d            a0,     a0,     t2
+    add.d            t4,     a0,     t2
+
+    fld.d            f0,     a0,     0
+    fldx.d           f1,     a0,     a1
+    fldx.d           f2,     a0,     t0
+    fldx.d           f3,     a0,     t1
+    fld.d            f4,     t4,     0
+    fldx.d           f5,     t4,     a1
+    fldx.d           f6,     t4,     t0
+    fldx.d           f7,     t4,     t1
+
+    vilvl.b          vr10,   vr21,   vr0
+    vilvl.b          vr11,   vr21,   vr1
+    vilvl.b          vr12,   vr21,   vr2
+    vilvl.b          vr13,   vr21,   vr3
+    vilvl.b          vr14,   vr21,   vr4
+    vilvl.b          vr15,   vr21,   vr5
+    vilvl.b          vr16,   vr21,   vr6
+    vilvl.b          vr17,   vr21,   vr7
+
+    vmul.h           vr0,    vr10,   vr20
+    vmul.h           vr1,    vr11,   vr20
+    vmul.h           vr2,    vr12,   vr20
+    vmul.h           vr3,    vr13,   vr20
+    vmul.h           vr4,    vr14,   vr20
+    vmul.h           vr5,    vr15,   vr20
+    vmul.h           vr6,    vr16,   vr20
+    vmul.h           vr7,    vr17,   vr20
+
+    vsadd.h          vr0,    vr8,    vr0
+    vsadd.h          vr1,    vr8,    vr1
+    vsadd.h          vr2,    vr8,    vr2
+    vsadd.h          vr3,    vr8,    vr3
+    vsadd.h          vr4,    vr8,    vr4
+    vsadd.h          vr5,    vr8,    vr5
+    vsadd.h          vr6,    vr8,    vr6
+    vsadd.h          vr7,    vr8,    vr7
+
+    vssrarn.bu.h     vr10,   vr0,    vr9
+    vssrarn.bu.h     vr11,   vr1,    vr9
+    vssrarn.bu.h     vr12,   vr2,    vr9
+    vssrarn.bu.h     vr13,   vr3,    vr9
+    vssrarn.bu.h     vr14,   vr4,    vr9
+    vssrarn.bu.h     vr15,   vr5,    vr9
+    vssrarn.bu.h     vr16,   vr6,    vr9
+    vssrarn.bu.h     vr17,   vr7,    vr9
+
+    fst.d            f10,    a0,     0
+    fstx.d           f11,    a0,     a1
+    fstx.d           f12,    a0,     t0
+    fstx.d           f13,    a0,     t1
+    fst.d            f14,    t4,     0
+    fstx.d           f15,    t4,     a1
+    fstx.d           f16,    t4,     t0
+    fstx.d           f17,    t4,     t1
+.END_WEIGHT_H264_PIXELS8:
+endfunc
+
+function ff_weight_h264_pixels8_8_lasx
+    slli.d           t0,     a1,     1
+    slli.d           t2,     a1,     2
+    add.d            t1,     t0,     a1
+    addi.d           t3,     zero,   8
+
+    sll.d            a5,     a5,     a3
+    xvreplgr2vr.h    xr20,   a4      //weight
+    xvreplgr2vr.h    xr8,    a5      //offset
+    xvreplgr2vr.h    xr9,    a3      //log2_denom
+
+    fld.d            f0,     a0,     0
+    fldx.d           f1,     a0,     a1
+    fldx.d           f2,     a0,     t0
+    fldx.d           f3,     a0,     t1
+    vilvl.d          vr4,    vr1,    vr0
+    vilvl.d          vr5,    vr3,    vr2
+    vext2xv.hu.bu    xr6,    xr4
+    vext2xv.hu.bu    xr7,    xr5
+
+    xvmul.h          xr11,   xr6,    xr20
+    xvmul.h          xr13,   xr7,    xr20
+    xvsadd.h         xr1,    xr8,    xr11
+    xvsadd.h         xr3,    xr8,    xr13
+
+    xvssrarn.bu.h    xr1,    xr1,    xr9
+    xvssrarn.bu.h    xr3,    xr3,    xr9
+    xvpermi.d        xr2,    xr1,    0x2
+    xvpermi.d        xr4,    xr3,    0x2
+
+    fst.d            f1,     a0,     0
+    fstx.d           f2,     a0,     a1
+    fstx.d           f3,     a0,     t0
+    fstx.d           f4,     a0,     t1
+
+    blt              a2,     t3,    .END_WEIGHT_H264_PIXELS8_LASX
+    add.d            a0,     a0,     t2
+    addi.d           t3,     zero,   16
+    fld.d            f0,     a0,     0
+    fldx.d           f1,     a0,     a1
+    fldx.d           f2,     a0,     t0
+    fldx.d           f3,     a0,     t1
+    vilvl.d          vr4,    vr1,    vr0
+    vilvl.d          vr5,    vr3,    vr2
+    vext2xv.hu.bu    xr6,    xr4
+    vext2xv.hu.bu    xr7,    xr5
+
+    xvmul.h          xr11,   xr6,    xr20
+    xvmul.h          xr13,   xr7,    xr20
+    xvsadd.h         xr1,    xr8,    xr11
+    xvsadd.h         xr3,    xr8,    xr13
+
+    xvssrarn.bu.h    xr1,    xr1,    xr9
+    xvssrarn.bu.h    xr3,    xr3,    xr9
+    xvpermi.d        xr2,    xr1,    0x2
+    xvpermi.d        xr4,    xr3,    0x2
+
+    fst.d            f1,     a0,     0
+    fstx.d           f2,     a0,     a1
+    fstx.d           f3,     a0,     t0
+    fstx.d           f4,     a0,     t1
+
+    blt              a2,     t3,    .END_WEIGHT_H264_PIXELS8_LASX
+    add.d            a0,     a0,     t2
+    add.d            t4,     a0,     t2
+
+    fld.d            f0,     a0,     0
+    fldx.d           f1,     a0,     a1
+    fldx.d           f2,     a0,     t0
+    fldx.d           f3,     a0,     t1
+    fld.d            f4,     t4,     0
+    fldx.d           f5,     t4,     a1
+    fldx.d           f6,     t4,     t0
+    fldx.d           f7,     t4,     t1
+
+    vilvl.d          vr10,   vr1,    vr0
+    vilvl.d          vr11,   vr3,    vr2
+    vilvl.d          vr12,   vr5,    vr4
+    vilvl.d          vr13,   vr7,    vr6
+    vext2xv.hu.bu    xr10,   xr10
+    vext2xv.hu.bu    xr11,   xr11
+    vext2xv.hu.bu    xr12,   xr12
+    vext2xv.hu.bu    xr13,   xr13
+
+    xvmul.h          xr0,    xr10,   xr20
+    xvmul.h          xr1,    xr11,   xr20
+    xvmul.h          xr2,    xr12,   xr20
+    xvmul.h          xr3,    xr13,   xr20
+
+    xvsadd.h         xr0,    xr8,    xr0
+    xvsadd.h         xr1,    xr8,    xr1
+    xvsadd.h         xr2,    xr8,    xr2
+    xvsadd.h         xr3,    xr8,    xr3
+
+    xvssrarn.bu.h    xr10,   xr0,    xr9
+    xvssrarn.bu.h    xr12,   xr1,    xr9
+    xvssrarn.bu.h    xr14,   xr2,    xr9
+    xvssrarn.bu.h    xr16,   xr3,    xr9
+    xvpermi.d        xr11,   xr10,   0x2
+    xvpermi.d        xr13,   xr12,   0x2
+    xvpermi.d        xr15,   xr14,   0x2
+    xvpermi.d        xr17,   xr16,   0x2
+
+    fst.d            f10,    a0,     0
+    fstx.d           f11,    a0,     a1
+    fstx.d           f12,    a0,     t0
+    fstx.d           f13,    a0,     t1
+    fst.d            f14,    t4,     0
+    fstx.d           f15,    t4,     a1
+    fstx.d           f16,    t4,     t0
+    fstx.d           f17,    t4,     t1
+.END_WEIGHT_H264_PIXELS8_LASX:
+endfunc
+
+//The LSX implementation is sufficient for this function; no LASX version is needed.
+function ff_weight_h264_pixels4_8_lsx
+    add.d            t0,     a0,     a1
+    addi.d           t3,     zero,   4
+
+    sll.d            a5,     a5,     a3
+    vreplgr2vr.h     vr20,   a4      //weight
+    vreplgr2vr.h     vr8,    a5      //offset
+    vreplgr2vr.h     vr9,    a3      //log2_denom
+    vldi             vr21,   0
+
+    fld.s            f0,     a0,     0
+    fldx.s           f1,     a0,     a1
+    vilvl.w          vr4,    vr1,    vr0
+    vilvl.b          vr5,    vr21,   vr4
+    vmul.h           vr10,   vr5,    vr20
+    vsadd.h          vr0,    vr8,    vr10
+    vssrarn.bu.h     vr0,    vr0,    vr9
+
+    fst.s            f0,     a0,     0
+    vstelm.w         vr0,    t0,     0,    1
+    blt              a2,     t3,    .END_WEIGHT_H264_PIXELS4
+    add.d            a0,     t0,     a1
+    addi.d           t3,     zero,   8
+    fld.s            f0,     a0,     0
+    fldx.s           f1,     a0,     a1
+    add.d            t0,     a0,     a1
+    vilvl.w          vr4,    vr1,    vr0
+    vilvl.b          vr5,    vr21,   vr4
+
+    vmul.h           vr10,   vr5,    vr20
+    vsadd.h          vr0,    vr8,    vr10
+    vssrarn.bu.h     vr0,    vr0,    vr9
+
+    fst.s            f0,     a0,     0
+    vstelm.w         vr0,    t0,     0,    1
+    blt              a2,     t3,    .END_WEIGHT_H264_PIXELS4
+    add.d            a0,     t0,     a1
+    add.d            t0,     a0,     a1
+    add.d            t1,     t0,     a1
+    add.d            t2,     t1,     a1
+
+    fld.s            f0,     a0,     0
+    fld.s            f1,     t0,     0
+    fld.s            f2,     t1,     0
+    fld.s            f3,     t2,     0
+
+    vilvl.w          vr4,    vr1,    vr0
+    vilvl.w          vr5,    vr3,    vr2
+    vilvl.b          vr6,    vr21,   vr4
+    vilvl.b          vr7,    vr21,   vr5
+
+    vmul.h           vr10,   vr6,    vr20
+    vmul.h           vr11,   vr7,    vr20
+    vsadd.h          vr0,    vr8,    vr10
+    vsadd.h          vr1,    vr8,    vr11
+    vssrarn.bu.h     vr10,   vr0,    vr9
+    vssrarn.bu.h     vr11,   vr1,    vr9
+
+    fst.s            f10,    a0,     0
+    vstelm.w         vr10,   t0,     0,    1
+    fst.s            f11,    t1,     0
+    vstelm.w         vr11,   t2,     0,    1
+.END_WEIGHT_H264_PIXELS4:
+endfunc
+
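+//Note: assuming the h264_add_pixels*_clear prototype (a0 = dst,
+//a1 = int16_t coefficient block, a2 = stride), the two functions below add
+//the 16-bit residual to the destination pixels and then zero the
+//coefficient block, which is the *_clear behaviour.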
+function ff_h264_add_pixels4_8_lsx
+    slli.d           t0,     a2,     1
+    add.d            t1,     t0,     a2
+    vld              vr0,    a1,     0
+    vld              vr1,    a1,     16
+    vldi             vr2,    0
+    fld.s            f3,     a0,     0
+    fldx.s           f4,     a0,     a2
+    fldx.s           f5,     a0,     t0
+    fldx.s           f6,     a0,     t1
+    vilvl.w          vr7,    vr4,    vr3
+    vilvl.w          vr8,    vr6,    vr5
+    vilvl.b          vr9,    vr2,    vr7
+    vilvl.b          vr10,   vr2,    vr8
+    vadd.h           vr11,   vr0,    vr9
+    vadd.h           vr12,   vr1,    vr10
+    vpickev.b        vr0,    vr12,   vr11
+    vbsrl.v          vr3,    vr0,    4
+    vbsrl.v          vr4,    vr0,    8
+    vbsrl.v          vr5,    vr0,    12
+    fst.s            f0,     a0,     0
+    fstx.s           f3,     a0,     a2
+    fstx.s           f4,     a0,     t0
+    fstx.s           f5,     a0,     t1
+    vst              vr2,    a1,     0
+    vst              vr2,    a1,     16
+endfunc
+
+function ff_h264_add_pixels8_8_lsx
+    slli.d           t0,     a2,     1
+    slli.d           t2,     a2,     2
+    add.d            t1,     t0,     a2
+    add.d            t3,     a0,     t2
+    vldi             vr0,    0
+    vld              vr1,    a1,     0
+    vld              vr2,    a1,     16
+    vld              vr3,    a1,     32
+    vld              vr4,    a1,     48
+    vld              vr5,    a1,     64
+    vld              vr6,    a1,     80
+    vld              vr7,    a1,     96
+    vld              vr8,    a1,     112
+    fld.d            f10,    a0,     0
+    fldx.d           f11,    a0,     a2
+    fldx.d           f12,    a0,     t0
+    fldx.d           f13,    a0,     t1
+    fld.d            f14,    t3,     0
+    fldx.d           f15,    t3,     a2
+    fldx.d           f16,    t3,     t0
+    fldx.d           f17,    t3,     t1
+    vilvl.b          vr10,   vr0,    vr10
+    vilvl.b          vr11,   vr0,    vr11
+    vilvl.b          vr12,   vr0,    vr12
+    vilvl.b          vr13,   vr0,    vr13
+    vilvl.b          vr14,   vr0,    vr14
+    vilvl.b          vr15,   vr0,    vr15
+    vilvl.b          vr16,   vr0,    vr16
+    vilvl.b          vr17,   vr0,    vr17
+    vadd.h           vr1,    vr1,    vr10
+    vadd.h           vr2,    vr2,    vr11
+    vadd.h           vr3,    vr3,    vr12
+    vadd.h           vr4,    vr4,    vr13
+    vadd.h           vr5,    vr5,    vr14
+    vadd.h           vr6,    vr6,    vr15
+    vadd.h           vr7,    vr7,    vr16
+    vadd.h           vr8,    vr8,    vr17
+    vpickev.b        vr10,   vr2,    vr1
+    vpickev.b        vr12,   vr4,    vr3
+    vpickev.b        vr14,   vr6,    vr5
+    vpickev.b        vr16,   vr8,    vr7
+    vbsrl.v          vr11,   vr10,   8
+    vbsrl.v          vr13,   vr12,   8
+    vbsrl.v          vr15,   vr14,   8
+    vbsrl.v          vr17,   vr16,   8
+    vst              vr0,    a1,     0
+    vst              vr0,    a1,     16
+    vst              vr0,    a1,     32
+    vst              vr0,    a1,     48
+    vst              vr0,    a1,     64
+    vst              vr0,    a1,     80
+    vst              vr0,    a1,     96
+    vst              vr0,    a1,     112
+    fst.d            f10,    a0,     0
+    fstx.d           f11,    a0,     a2
+    fstx.d           f12,    a0,     t0
+    fstx.d           f13,    a0,     t1
+    fst.d            f14,    t3,     0
+    fstx.d           f15,    t3,     a2
+    fstx.d           f16,    t3,     t0
+    fstx.d           f17,    t3,     t1
+endfunc
+
+const cnst_value
+.byte 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2
+.byte 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1
+endconst
+
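+//Note: assuming the standard h264_loop_filter_strength prototype, the
+//register mapping is a0 = bS, a1 = nnz, a2 = ref, a3 = mv, a4 = bidir,
+//a5 = edges, a6 = step, a7 = mask_mv0, with mask_mv1 and field read from
+//the stack (sp + 0 and sp + 8).  The first loop walks the edges selected
+//by edges/step and stores at offset 32 from the walking bS pointer, the
+//second loop covers the first 32 bytes of bS, and the trailing
+//LSX_TRANSPOSE4x4_H reorders those entries.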
+function ff_h264_loop_filter_strength_lsx
+    vldi             vr0,    0
+    ldptr.w          t0,     sp,     0   //mask_mv1
+    ldptr.w          t1,     sp,     8   //field
+    beqz             t1,     .FIELD
+    la.local         t2,     cnst_value
+    vld              vr1,    t2,     0
+    vld              vr2,    t2,     16
+    b                .END_FIELD
+.FIELD:
+    vldi             vr1,    0x06
+    vldi             vr2,    0x03
+.END_FIELD:
+    vldi             vr3,    0x01
+    slli.d           a6,     a6,     3  //step <<= 3
+    slli.d           a5,     a5,     3  //edges <<= 3
+    move             t3,     zero
+    slli.d           t4,     a6,     2
+    move             t5,     a2
+    move             t6,     a3
+    move             t7,     a1
+    move             t8,     a0
+    slli.d           t0,     t0,     3
+.ITERATION_FIR:
+    bge              t3,     a5,     .END_ITERATION_FIR
+    vand.v           vr20,   vr20,   vr0
+    and              t2,     t0,     t3
+    bnez             t2,     .MASK_MV_FIR
+    beqz             a4,     .BIDIR_FIR
+    vld              vr4,    t5,     4
+    vld              vr5,    t5,     44
+    vld              vr6,    t5,     12
+    vld              vr7,    t5,     52
+    vilvl.w          vr4,    vr5,    vr4
+    vilvl.w          vr6,    vr6,    vr6
+    vilvl.w          vr7,    vr7,    vr7
+    vshuf4i.h        vr5,    vr4,    0x4e
+    vsub.b           vr6,    vr6,    vr4
+    vsub.b           vr7,    vr7,    vr5
+    vor.v            vr6,    vr6,    vr7
+    vld              vr10,   t6,     16
+    vld              vr11,   t6,     48
+    vld              vr12,   t6,     208
+    vld              vr8,    t6,     176
+    vsub.h           vr13,   vr10,   vr11
+    vsub.h           vr14,   vr10,   vr12
+    vsub.h           vr15,   vr8,    vr11
+    vsub.h           vr16,   vr8,    vr12
+    vssrarni.b.h     vr14,   vr13,   0
+    vssrarni.b.h     vr16,   vr15,   0
+    vadd.b           vr14,   vr2,    vr14
+    vadd.b           vr16,   vr2,    vr16
+    vssub.bu         vr14,   vr14,   vr1
+    vssub.bu         vr16,   vr16,   vr1
+    vssrarni.b.h     vr14,   vr14,   0
+    vssrarni.b.h     vr16,   vr16,   0
+    vor.v            vr20,   vr6,    vr14
+    vshuf4i.h        vr16,   vr16,   0x4e
+    vor.v            vr20,   vr20,   vr16
+    vshuf4i.h        vr21,   vr20,   0x4e
+    vmin.bu          vr20,   vr20,   vr21
+    b                .MASK_MV_FIR
+.BIDIR_FIR:
+    vld              vr4,    t5,     4
+    vld              vr5,    t5,     12
+    vld              vr10,   t6,     16
+    vld              vr11,   t6,     48
+    vsub.h           vr12,   vr11,   vr10
+    vssrarni.b.h     vr12,   vr12,   0
+    vadd.b           vr13,   vr12,   vr2
+    vssub.bu         vr14,   vr13,   vr1
+    vsat.h           vr15,   vr14,   7
+    vpickev.b        vr20,   vr15,   vr15
+    vsub.b           vr6,    vr5,    vr4
+    vor.v            vr20,   vr20,   vr6
+.MASK_MV_FIR:
+    vld              vr4,    t7,     12
+    vld              vr5,    t7,     4
+    vor.v            vr6,    vr4,    vr5
+    vmin.bu          vr6,    vr6,    vr3
+    vmin.bu          vr20,   vr20,   vr3
+    vslli.h          vr6,    vr6,    1
+    vmax.bu          vr6,    vr20,   vr6
+    vilvl.b          vr7,    vr0,    vr6
+    add.d            t3,     t3,     a6
+    fst.d            f7,     t8,     32
+    add.d            t5,     t5,     a6
+    add.d            t6,     t6,     t4
+    add.d            t7,     t7,     a6
+    add.d            t8,     t8,     a6
+    b                .ITERATION_FIR
+.END_ITERATION_FIR:
+    move             t3,     zero
+    addi.d           a5,     zero,   32
+    vldi             vr21,   0xff
+    move             t5,     a2
+    move             t6,     a3
+    move             t7,     a1
+    move             t8,     a0
+    slli.d           a7,     a7,     3
+.ITERATION_SEC:
+    bge              t3,     a5,     .END_ITERATION_SEC
+    vand.v           vr20,   vr20,   vr21
+    and              t2,     a7,     t3
+    bnez             t2,     .MASK_MV_SEC
+    beqz             a4,     .BIDIR_SEC
+    vld              vr4,    t5,     11
+    vld              vr5,    t5,     51
+    vld              vr6,    t5,     12
+    vld              vr7,    t5,     52
+    vilvl.w          vr4,    vr5,    vr4
+    vilvl.w          vr6,    vr6,    vr6
+    vilvl.w          vr7,    vr7,    vr7
+    vshuf4i.h        vr5,    vr4,    0x4e
+    vsub.b           vr6,    vr6,    vr4
+    vsub.b           vr7,    vr7,    vr5
+    vor.v            vr6,    vr6,    vr7
+    vld              vr10,   t6,     44
+    vld              vr11,   t6,     48
+    vld              vr12,   t6,     208
+    vld              vr8,    t6,     204
+    vsub.h           vr13,   vr10,   vr11
+    vsub.h           vr14,   vr10,   vr12
+    vsub.h           vr15,   vr8,    vr11
+    vsub.h           vr16,   vr8,    vr12
+    vssrarni.b.h     vr14,   vr13,   0
+    vssrarni.b.h     vr16,   vr15,   0
+    vadd.b           vr14,   vr2,    vr14
+    vadd.b           vr16,   vr2,    vr16
+    vssub.bu         vr14,   vr14,   vr1
+    vssub.bu         vr16,   vr16,   vr1
+    vssrarni.b.h     vr14,   vr14,   0
+    vssrarni.b.h     vr16,   vr16,   0
+    vor.v            vr20,   vr6,    vr14
+    vshuf4i.h        vr16,   vr16,   0x4e
+    vor.v            vr20,   vr20,   vr16
+    vshuf4i.h        vr22,   vr20,   0x4e
+    vmin.bu          vr20,   vr20,   vr22
+    b                .MASK_MV_SEC
+.BIDIR_SEC:
+    vld              vr4,    t5,     11
+    vld              vr5,    t5,     12
+    vld              vr10,   t6,     44
+    vld              vr11,   t6,     48
+    vsub.h           vr12,   vr11,   vr10
+    vssrarni.b.h     vr12,   vr12,   0
+    vadd.b           vr13,   vr12,   vr2
+    vssub.bu         vr14,   vr13,   vr1
+    vssrarni.b.h     vr14,   vr14,   0
+    vsub.b           vr6,    vr5,    vr4
+    vor.v            vr20,   vr14,   vr6
+.MASK_MV_SEC:
+    vld              vr4,    t7,     12
+    vld              vr5,    t7,     11
+    vor.v            vr6,    vr4,    vr5
+    vmin.bu          vr6,    vr6,    vr3
+    vmin.bu          vr20,   vr20,   vr3
+    vslli.h          vr6,    vr6,    1
+    vmax.bu          vr6,    vr20,   vr6
+    vilvl.b          vr7,    vr0,    vr6
+    addi.d           t3,     t3,     8
+    fst.d            f7,     t8,     0
+    addi.d           t5,     t5,     8
+    addi.d           t6,     t6,     32
+    addi.d           t7,     t7,     8
+    addi.d           t8,     t8,     8
+    b                .ITERATION_SEC
+.END_ITERATION_SEC:
+    vld              vr4,    a0,     0
+    vld              vr5,    a0,     16
+    vilvh.d          vr6,    vr4,    vr4
+    vilvh.d          vr7,    vr5,    vr5
+    LSX_TRANSPOSE4x4_H vr4, vr6, vr5, vr7, vr6, vr7, vr8, vr9, vr10, vr11
+    vilvl.d          vr4,    vr7,    vr6
+    vilvl.d          vr5,    vr9,    vr8
+    vst              vr4,    a0,     0
+    vst              vr5,    a0,     16
+endfunc
diff --git a/libavcodec/loongarch/h264dsp_init_loongarch.c b/libavcodec/loongarch/h264dsp_init_loongarch.c
index f8616a7db5..d97c3a86eb 100644
--- a/libavcodec/loongarch/h264dsp_init_loongarch.c
+++ b/libavcodec/loongarch/h264dsp_init_loongarch.c
@@ -29,21 +29,44 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
     int cpu_flags = av_get_cpu_flags();
 
     if (have_lsx(cpu_flags)) {
+        if (chroma_format_idc <= 1)
+            c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lsx;
         if (bit_depth == 8) {
             c->h264_idct_add     = ff_h264_idct_add_8_lsx;
             c->h264_idct8_add    = ff_h264_idct8_add_8_lsx;
             c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_lsx;
             c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
 
-            if (chroma_format_idc <= 1)
+            if (chroma_format_idc <= 1) {
                 c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
-            else
+                c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_8_lsx;
+                c->h264_h_loop_filter_chroma_intra = ff_h264_h_lpf_chroma_intra_8_lsx;
+            } else
                 c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;
 
             c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
             c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
             c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
             c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
+
+            c->h264_add_pixels4_clear = ff_h264_add_pixels4_8_lsx;
+            c->h264_add_pixels8_clear = ff_h264_add_pixels8_8_lsx;
+            c->h264_v_loop_filter_luma = ff_h264_v_lpf_luma_8_lsx;
+            c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_8_lsx;
+            c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_8_lsx;
+            c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_8_lsx;
+            c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_8_lsx;
+
+            c->h264_v_loop_filter_chroma_intra = ff_h264_v_lpf_chroma_intra_8_lsx;
+
+            c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_lsx;
+            c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lsx;
+            c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lsx;
+            c->weight_h264_pixels_tab[0]   = ff_weight_h264_pixels16_8_lsx;
+            c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_lsx;
+            c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_lsx;
+            c->h264_idct8_add    = ff_h264_idct8_add_8_lsx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
         }
     }
 #if HAVE_LASX
@@ -57,23 +80,13 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
             c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_8_lasx;
             c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_8_lasx;
             c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_8_lasx;
-            c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_8_lasx;
-
-            if (chroma_format_idc <= 1)
-                c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_8_lasx;
-            c->h264_v_loop_filter_chroma_intra = ff_h264_v_lpf_chroma_intra_8_lasx;
-
-            if (chroma_format_idc <= 1)
-                c->h264_h_loop_filter_chroma_intra = ff_h264_h_lpf_chroma_intra_8_lasx;
 
             /* Weighted MC */
             c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_lasx;
             c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_lasx;
-            c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_lasx;
 
             c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_lasx;
             c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
-            c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lasx;
 
             c->h264_idct8_add    = ff_h264_idct8_add_8_lasx;
             c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
diff --git a/libavcodec/loongarch/h264dsp_lasx.c b/libavcodec/loongarch/h264dsp_lasx.c
index 7b2b8ff0f0..5205cc849f 100644
--- a/libavcodec/loongarch/h264dsp_lasx.c
+++ b/libavcodec/loongarch/h264dsp_lasx.c
@@ -67,10 +67,10 @@
 void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                int alpha_in, int beta_in, int8_t *tc)
 {
-    ptrdiff_t img_width_2x = img_width << 1;
-    ptrdiff_t img_width_4x = img_width << 2;
-    ptrdiff_t img_width_8x = img_width << 3;
-    ptrdiff_t img_width_3x = img_width_2x + img_width;
+    int img_width_2x = img_width << 1;
+    int img_width_4x = img_width << 2;
+    int img_width_8x = img_width << 3;
+    int img_width_3x = img_width_2x + img_width;
     __m256i tmp_vec0, bs_vec;
     __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                       0x0101010100000000, 0x0303030302020202};
@@ -244,8 +244,8 @@ void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
 void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                    int alpha_in, int beta_in, int8_t *tc)
 {
-    ptrdiff_t img_width_2x = img_width << 1;
-    ptrdiff_t img_width_3x = img_width + img_width_2x;
+    int img_width_2x = img_width << 1;
+    int img_width_3x = img_width + img_width_2x;
     __m256i tmp_vec0, bs_vec;
     __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                       0x0101010100000000, 0x0303030302020202};
@@ -363,184 +363,6 @@ void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
     }
 }
 
-void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
-                                 int alpha_in, int beta_in, int8_t *tc)
-{
-    __m256i tmp_vec0, bs_vec;
-    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
-    __m256i zero = __lasx_xvldi(0);
-    ptrdiff_t img_width_2x = img_width << 1;
-    ptrdiff_t img_width_4x = img_width << 2;
-    ptrdiff_t img_width_3x = img_width_2x + img_width;
-
-    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
-    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
-    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
-    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
-    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
-    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);
-
-    if (__lasx_xbnz_v(bs_vec)) {
-        uint8_t *src = data - 2;
-        __m256i p1_org, p0_org, q0_org, q1_org;
-        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
-        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
-        __m256i is_bs_greater_than0;
-
-        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
-
-        {
-            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
-
-            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
-                      src, img_width_3x, row0, row1, row2, row3);
-            src += img_width_4x;
-            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
-                      src, img_width_3x, row4, row5, row6, row7);
-            src -= img_width_4x;
-            /* LASX_TRANSPOSE8x4_B */
-            DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
-                      row7, row5, p1_org, p0_org, q0_org, q1_org);
-            row0 = __lasx_xvilvl_b(p0_org, p1_org);
-            row1 = __lasx_xvilvl_b(q1_org, q0_org);
-            row3 = __lasx_xvilvh_w(row1, row0);
-            row2 = __lasx_xvilvl_w(row1, row0);
-            p1_org = __lasx_xvpermi_d(row2, 0x00);
-            p0_org = __lasx_xvpermi_d(row2, 0x55);
-            q0_org = __lasx_xvpermi_d(row3, 0x00);
-            q1_org = __lasx_xvpermi_d(row3, 0x55);
-        }
-
-        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
-        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
-        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
-        alpha = __lasx_xvreplgr2vr_b(alpha_in);
-        beta  = __lasx_xvreplgr2vr_b(beta_in);
-
-        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
-        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
-        is_less_than       = is_less_than_alpha & is_less_than_beta;
-        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
-        is_less_than       = is_less_than_beta & is_less_than;
-        is_less_than       = is_less_than & is_bs_greater_than0;
-
-        if (__lasx_xbnz_v(is_less_than)) {
-            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
-            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
-            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
-            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
-            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
-            {
-                __m256i tc_h, neg_thresh_h, p0_h, q0_h;
-
-                neg_thresh_h = __lasx_xvneg_b(tc_vec);
-                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
-                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);
-
-                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
-                             neg_thresh_h, tc_h, p0_h, q0_h);
-                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
-                          p0_h, q0_h);
-                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
-                          p0_h, q0_h);
-                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
-                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
-            }
-
-            p0_org = __lasx_xvilvl_b(q0_org, p0_org);
-            src = data - 1;
-            __lasx_xvstelm_h(p0_org, src, 0, 0);
-            src += img_width;
-            __lasx_xvstelm_h(p0_org, src, 0, 1);
-            src += img_width;
-            __lasx_xvstelm_h(p0_org, src, 0, 2);
-            src += img_width;
-            __lasx_xvstelm_h(p0_org, src, 0, 3);
-            src += img_width;
-            __lasx_xvstelm_h(p0_org, src, 0, 4);
-            src += img_width;
-            __lasx_xvstelm_h(p0_org, src, 0, 5);
-            src += img_width;
-            __lasx_xvstelm_h(p0_org, src, 0, 6);
-            src += img_width;
-            __lasx_xvstelm_h(p0_org, src, 0, 7);
-        }
-    }
-}
-
-void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
-                                 int alpha_in, int beta_in, int8_t *tc)
-{
-    int img_width_2x = img_width << 1;
-    __m256i tmp_vec0, bs_vec;
-    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
-    __m256i zero = __lasx_xvldi(0);
-
-    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
-    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
-    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
-    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
-    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
-    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);
-
-    if (__lasx_xbnz_v(bs_vec)) {
-        __m256i p1_org, p0_org, q0_org, q1_org;
-        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
-        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
-        __m256i is_bs_greater_than0;
-
-        alpha = __lasx_xvreplgr2vr_b(alpha_in);
-        beta  = __lasx_xvreplgr2vr_b(beta_in);
-
-        DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
-                  p1_org, p0_org);
-        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
-
-        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
-        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
-        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
-        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
-        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
-        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
-        is_less_than       = is_less_than_alpha & is_less_than_beta;
-        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
-        is_less_than       = is_less_than_beta & is_less_than;
-        is_less_than       = is_less_than & is_bs_greater_than0;
-
-        if (__lasx_xbnz_v(is_less_than)) {
-            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
-            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
-            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
-            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
-            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
-            {
-                __m256i neg_thresh_h, tc_h, p0_h, q0_h;
-
-                neg_thresh_h = __lasx_xvneg_b(tc_vec);
-                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
-                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);
-
-                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
-                             neg_thresh_h, tc_h, p0_h, q0_h);
-                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
-                          p0_h, q0_h);
-                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
-                          p0_h, q0_h);
-                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
-                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
-                __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
-                __lasx_xvstelm_d(q0_h, data, 0, 0);
-            }
-        }
-    }
-}
-
 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                  q3_or_p3_org_in, p1_or_q1_org_in,          \
                                  p2_or_q2_org_in, q1_or_p1_org_in,          \
@@ -584,9 +406,9 @@ void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
 void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                      int alpha_in, int beta_in)
 {
-    ptrdiff_t img_width_2x = img_width << 1;
-    ptrdiff_t img_width_4x = img_width << 2;
-    ptrdiff_t img_width_3x = img_width_2x + img_width;
+    int img_width_2x = img_width << 1;
+    int img_width_4x = img_width << 2;
+    int img_width_3x = img_width_2x + img_width;
     uint8_t *src = data - 4;
     __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
     __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
@@ -760,8 +582,8 @@ void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
 void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                      int alpha_in, int beta_in)
 {
-    ptrdiff_t img_width_2x = img_width << 1;
-    ptrdiff_t img_width_3x = img_width_2x + img_width;
+    int img_width_2x = img_width << 1;
+    int img_width_3x = img_width_2x + img_width;
     uint8_t *src = data - img_width_2x;
     __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
     __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
@@ -877,1160 +699,6 @@ void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
     }
 }
 
-void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
-                                       int alpha_in, int beta_in)
-{
-    uint8_t *src = data - 2;
-    ptrdiff_t img_width_2x = img_width << 1;
-    ptrdiff_t img_width_4x = img_width << 2;
-    ptrdiff_t img_width_3x = img_width_2x + img_width;
-    __m256i p1_org, p0_org, q0_org, q1_org;
-    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
-    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
-
-    {
-        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
-
-        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
-                  img_width_3x, row0, row1, row2, row3);
-        src += img_width_4x;
-        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
-                  img_width_3x, row4, row5, row6, row7);
-
-        /* LASX_TRANSPOSE8x4_B */
-        DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
-                  p1_org, p0_org, q0_org, q1_org);
-        row0 = __lasx_xvilvl_b(p0_org, p1_org);
-        row1 = __lasx_xvilvl_b(q1_org, q0_org);
-        row3 = __lasx_xvilvh_w(row1, row0);
-        row2 = __lasx_xvilvl_w(row1, row0);
-        p1_org = __lasx_xvpermi_d(row2, 0x00);
-        p0_org = __lasx_xvpermi_d(row2, 0x55);
-        q0_org = __lasx_xvpermi_d(row3, 0x00);
-        q1_org = __lasx_xvpermi_d(row3, 0x55);
-    }
-
-    alpha = __lasx_xvreplgr2vr_b(alpha_in);
-    beta  = __lasx_xvreplgr2vr_b(beta_in);
-
-    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
-    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
-    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
-    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
-    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
-    is_less_than       = is_less_than_alpha & is_less_than_beta;
-    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
-    is_less_than       = is_less_than_beta & is_less_than;
-
-    if (__lasx_xbnz_v(is_less_than)) {
-        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
-        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
-        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
-        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
-        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
-        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
-        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
-        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
-        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
-        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
-        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
-    }
-    p0_org = __lasx_xvilvl_b(q0_org, p0_org);
-    src = data - 1;
-    __lasx_xvstelm_h(p0_org, src, 0, 0);
-    src += img_width;
-    __lasx_xvstelm_h(p0_org, src, 0, 1);
-    src += img_width;
-    __lasx_xvstelm_h(p0_org, src, 0, 2);
-    src += img_width;
-    __lasx_xvstelm_h(p0_org, src, 0, 3);
-    src += img_width;
-    __lasx_xvstelm_h(p0_org, src, 0, 4);
-    src += img_width;
-    __lasx_xvstelm_h(p0_org, src, 0, 5);
-    src += img_width;
-    __lasx_xvstelm_h(p0_org, src, 0, 6);
-    src += img_width;
-    __lasx_xvstelm_h(p0_org, src, 0, 7);
-}
-
-void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
-                                       int alpha_in, int beta_in)
-{
-    ptrdiff_t img_width_2x = img_width << 1;
-    __m256i p1_org, p0_org, q0_org, q1_org;
-    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
-    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
-
-    alpha = __lasx_xvreplgr2vr_b(alpha_in);
-    beta  = __lasx_xvreplgr2vr_b(beta_in);
-
-    p1_org = __lasx_xvldx(data, -img_width_2x);
-    p0_org = __lasx_xvldx(data, -img_width);
-    DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
-
-    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
-    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
-    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
-    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
-    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
-    is_less_than       = is_less_than_alpha & is_less_than_beta;
-    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
-    is_less_than       = is_less_than_beta & is_less_than;
-
-    if (__lasx_xbnz_v(is_less_than)) {
-        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
-        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
-        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
-        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
-        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
-        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
-        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
-        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
-        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
-        p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
-        q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
-        __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
-        __lasx_xvstelm_d(q0_h, data, 0, 0);
-    }
-}
-
-void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
-                                      ptrdiff_t stride, int height,
-                                      int log2_denom, int weight_dst,
-                                      int weight_src, int offset_in)
-{
-    __m256i wgt;
-    __m256i src0, src1, src2, src3;
-    __m256i dst0, dst1, dst2, dst3;
-    __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    __m256i denom, offset;
-    int stride_2x = stride << 1;
-    int stride_4x = stride << 2;
-    int stride_3x = stride_2x + stride;
-
-    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
-    offset_in  += ((weight_src + weight_dst) << 7);
-    log2_denom += 1;
-
-    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
-    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
-    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
-    offset = __lasx_xvreplgr2vr_h(offset_in);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
-    src += stride_4x;
-    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
-              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
-              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    dst += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
-              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
-    dst -= stride_4x;
-    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
-              0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
-
-    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
-              src0, src1, src2, src3);
-    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
-              dst0, dst1, dst2, dst3);
-    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
-              dst3, src3, vec0, vec2, vec4, vec6);
-    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
-              dst3, src3, vec1, vec3, vec5, vec7);
-
-    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
-              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
-    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
-              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
-
-    tmp0 = __lasx_xvsra_h(tmp0, denom);
-    tmp1 = __lasx_xvsra_h(tmp1, denom);
-    tmp2 = __lasx_xvsra_h(tmp2, denom);
-    tmp3 = __lasx_xvsra_h(tmp3, denom);
-    tmp4 = __lasx_xvsra_h(tmp4, denom);
-    tmp5 = __lasx_xvsra_h(tmp5, denom);
-    tmp6 = __lasx_xvsra_h(tmp6, denom);
-    tmp7 = __lasx_xvsra_h(tmp7, denom);
-
-    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
-                                  tmp0, tmp1, tmp2, tmp3);
-    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
-                                  tmp4, tmp5, tmp6, tmp7);
-    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
-              dst0, dst1, dst2, dst3);
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst, 8, 1);
-    dst += stride;
-    __lasx_xvstelm_d(dst0, dst, 0, 2);
-    __lasx_xvstelm_d(dst0, dst, 8, 3);
-    dst += stride;
-    __lasx_xvstelm_d(dst1, dst, 0, 0);
-    __lasx_xvstelm_d(dst1, dst, 8, 1);
-    dst += stride;
-    __lasx_xvstelm_d(dst1, dst, 0, 2);
-    __lasx_xvstelm_d(dst1, dst, 8, 3);
-    dst += stride;
-    __lasx_xvstelm_d(dst2, dst, 0, 0);
-    __lasx_xvstelm_d(dst2, dst, 8, 1);
-    dst += stride;
-    __lasx_xvstelm_d(dst2, dst, 0, 2);
-    __lasx_xvstelm_d(dst2, dst, 8, 3);
-    dst += stride;
-    __lasx_xvstelm_d(dst3, dst, 0, 0);
-    __lasx_xvstelm_d(dst3, dst, 8, 1);
-    dst += stride;
-    __lasx_xvstelm_d(dst3, dst, 0, 2);
-    __lasx_xvstelm_d(dst3, dst, 8, 3);
-    dst += stride;
-
-    if (16 == height) {
-        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-        src += stride_4x;
-        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
-        src += stride_4x;
-        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
-                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
-        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
-                  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
-        dst += stride_4x;
-        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
-                  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
-        dst -= stride_4x;
-        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
-                  tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
-
-        DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
-                  src0, src1, src2, src3);
-        DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
-                  dst0, dst1, dst2, dst3);
-        DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
-                  dst3, src3, vec0, vec2, vec4, vec6);
-        DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
-                  dst3, src3, vec1, vec3, vec5, vec7);
-
-        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
-                  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
-        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
-                  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
-
-        tmp0 = __lasx_xvsra_h(tmp0, denom);
-        tmp1 = __lasx_xvsra_h(tmp1, denom);
-        tmp2 = __lasx_xvsra_h(tmp2, denom);
-        tmp3 = __lasx_xvsra_h(tmp3, denom);
-        tmp4 = __lasx_xvsra_h(tmp4, denom);
-        tmp5 = __lasx_xvsra_h(tmp5, denom);
-        tmp6 = __lasx_xvsra_h(tmp6, denom);
-        tmp7 = __lasx_xvsra_h(tmp7, denom);
-
-        DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
-                                      tmp0, tmp1, tmp2, tmp3);
-        DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
-                                      tmp4, tmp5, tmp6, tmp7);
-        DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
-                  tmp6, dst0, dst1, dst2, dst3);
-        __lasx_xvstelm_d(dst0, dst, 0, 0);
-        __lasx_xvstelm_d(dst0, dst, 8, 1);
-        dst += stride;
-        __lasx_xvstelm_d(dst0, dst, 0, 2);
-        __lasx_xvstelm_d(dst0, dst, 8, 3);
-        dst += stride;
-        __lasx_xvstelm_d(dst1, dst, 0, 0);
-        __lasx_xvstelm_d(dst1, dst, 8, 1);
-        dst += stride;
-        __lasx_xvstelm_d(dst1, dst, 0, 2);
-        __lasx_xvstelm_d(dst1, dst, 8, 3);
-        dst += stride;
-        __lasx_xvstelm_d(dst2, dst, 0, 0);
-        __lasx_xvstelm_d(dst2, dst, 8, 1);
-        dst += stride;
-        __lasx_xvstelm_d(dst2, dst, 0, 2);
-        __lasx_xvstelm_d(dst2, dst, 8, 3);
-        dst += stride;
-        __lasx_xvstelm_d(dst3, dst, 0, 0);
-        __lasx_xvstelm_d(dst3, dst, 8, 1);
-        dst += stride;
-        __lasx_xvstelm_d(dst3, dst, 0, 2);
-        __lasx_xvstelm_d(dst3, dst, 8, 3);
-    }
-}
-
-static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                               int32_t log2_denom, int32_t weight_src,
-                               int32_t weight_dst, int32_t offset_in)
-{
-    __m256i wgt, vec0, vec1;
-    __m256i src0, dst0;
-    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-
-    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
-    offset_in  += ((weight_src + weight_dst) << 7);
-    log2_denom += 1;
-
-    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
-    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
-    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
-    offset = __lasx_xvreplgr2vr_h(offset_in);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
-              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
-    vec0 = __lasx_xvilvl_b(dst0, src0);
-    vec1 = __lasx_xvilvh_b(dst0, src0);
-    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
-              tmp0, tmp1);
-    tmp0 = __lasx_xvsra_h(tmp0, denom);
-    tmp1 = __lasx_xvsra_h(tmp1, denom);
-    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
-    dst0 = __lasx_xvpickev_b(tmp1, tmp0);
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                               int32_t log2_denom, int32_t weight_src,
-                               int32_t weight_dst, int32_t offset_in)
-{
-    __m256i wgt, vec0, vec1, vec2, vec3;
-    __m256i src0, src1, dst0, dst1;
-    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_4x = stride << 2;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    uint8_t* dst_tmp = dst;
-
-    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
-    offset_in  += ((weight_src + weight_dst) << 7);
-    log2_denom += 1;
-
-    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
-    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
-    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
-    offset = __lasx_xvreplgr2vr_h(offset_in);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    tmp0 = __lasx_xvld(dst_tmp, 0);
-    DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
-    tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
-    dst_tmp += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
-              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
-    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
-              src0, src1, dst0, dst1);
-    DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
-    DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
-    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
-              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
-    tmp0 = __lasx_xvsra_h(tmp0, denom);
-    tmp1 = __lasx_xvsra_h(tmp1, denom);
-    tmp2 = __lasx_xvsra_h(tmp2, denom);
-    tmp3 = __lasx_xvsra_h(tmp3, denom);
-    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
-                                  tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(dst1, dst, 0, 0);
-    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
-}
-
-static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                int32_t log2_denom, int32_t weight_src,
-                                int32_t weight_dst, int32_t offset_in)
-{
-    __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-    __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
-    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_4x = stride << 2;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    uint8_t* dst_tmp = dst;
-
-    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
-    offset_in  += ((weight_src + weight_dst) << 7);
-    log2_denom += 1;
-
-    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
-    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
-    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
-    offset = __lasx_xvreplgr2vr_h(offset_in);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
-    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
-              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    dst_tmp += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
-              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    dst_tmp += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
-              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    dst_tmp += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
-              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
-    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
-              src0, src1, src2, src3);
-    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
-              dst0, dst1, dst2, dst3);
-    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
-              dst3, src3, vec0, vec2, vec4, vec6);
-    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
-              dst3, src3, vec1, vec3, vec5, vec7);
-    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
-              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
-    DUP4_ARG3(__lasx_xvdp2add_h_b,offset, wgt, vec4, offset, wgt, vec5,
-              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
-    tmp0 = __lasx_xvsra_h(tmp0, denom);
-    tmp1 = __lasx_xvsra_h(tmp1, denom);
-    tmp2 = __lasx_xvsra_h(tmp2, denom);
-    tmp3 = __lasx_xvsra_h(tmp3, denom);
-    tmp4 = __lasx_xvsra_h(tmp4, denom);
-    tmp5 = __lasx_xvsra_h(tmp5, denom);
-    tmp6 = __lasx_xvsra_h(tmp6, denom);
-    tmp7 = __lasx_xvsra_h(tmp7, denom);
-    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
-                                  tmp0, tmp1, tmp2, tmp3);
-    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
-                                  tmp4, tmp5, tmp6, tmp7);
-    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
-                   dst0, dst1, dst2, dst3)
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(dst1, dst, 0, 0);
-    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(dst2, dst, 0, 0);
-    __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(dst3, dst, 0, 0);
-    __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
-}
-
-void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
-                                     ptrdiff_t stride, int height,
-                                     int log2_denom, int weight_dst,
-                                     int weight_src, int offset)
-{
-    if (4 == height) {
-        avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
-                           offset);
-    } else if (8 == height) {
-        avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
-                           offset);
-    } else {
-        avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
-                            offset);
-    }
-}
-
-static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                               int32_t log2_denom, int32_t weight_src,
-                               int32_t weight_dst, int32_t offset_in)
-{
-    __m256i wgt, vec0;
-    __m256i src0, dst0;
-    __m256i tmp0, tmp1, denom, offset;
-
-    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
-    offset_in  += ((weight_src + weight_dst) << 7);
-    log2_denom += 1;
-
-    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
-    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
-    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
-    offset = __lasx_xvreplgr2vr_h(offset_in);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
-    src0 = __lasx_xvilvl_w(tmp1, tmp0);
-    DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
-    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
-    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
-    vec0 = __lasx_xvilvl_b(dst0, src0);
-    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
-    tmp0 = __lasx_xvsra_h(tmp0, denom);
-    tmp0 = __lasx_xvclip255_h(tmp0);
-    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
-    __lasx_xvstelm_w(tmp0, dst, 0, 0);
-    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
-}
-
-static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                               int32_t log2_denom, int32_t weight_src,
-                               int32_t weight_dst, int32_t offset_in)
-{
-    __m256i wgt, vec0;
-    __m256i src0, dst0;
-    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-
-    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
-    offset_in  += ((weight_src + weight_dst) << 7);
-    log2_denom += 1;
-
-    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
-    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
-    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
-    offset = __lasx_xvreplgr2vr_h(offset_in);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
-    src0 = __lasx_xvilvl_w(tmp1, tmp0);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
-              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
-    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
-    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
-    vec0 = __lasx_xvilvl_b(dst0, src0);
-    dst0 = __lasx_xvilvh_b(dst0, src0);
-    vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
-    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
-    tmp0 = __lasx_xvsra_h(tmp0, denom);
-    tmp0 = __lasx_xvclip255_h(tmp0);
-    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
-    __lasx_xvstelm_w(tmp0, dst, 0, 0);
-    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
-    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
-    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
-}
-
-static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                               int32_t log2_denom, int32_t weight_src,
-                               int32_t weight_dst, int32_t offset_in)
-{
-    __m256i wgt, vec0, vec1;
-    __m256i src0, dst0;
-    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_4x = stride << 2;
-    ptrdiff_t stride_3x = stride_2x + stride;
-
-    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
-    offset_in  += ((weight_src + weight_dst) << 7);
-    log2_denom += 1;
-
-    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
-    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
-    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
-    offset = __lasx_xvreplgr2vr_h(offset_in);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
-    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
-              tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
-              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    dst += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
-              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
-    dst -= stride_4x;
-    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
-              tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
-    vec0 = __lasx_xvilvl_b(dst0, src0);
-    vec1 = __lasx_xvilvh_b(dst0, src0);
-    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
-              tmp0, tmp1);
-    tmp0 = __lasx_xvsra_h(tmp0, denom);
-    tmp1 = __lasx_xvsra_h(tmp1, denom);
-    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
-    tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
-    __lasx_xvstelm_w(tmp0, dst, 0, 0);
-    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
-    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_w(tmp0, dst, 0, 4);
-    __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
-    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
-    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
-}
-
-void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
-                                     ptrdiff_t stride, int height,
-                                     int log2_denom, int weight_dst,
-                                     int weight_src, int offset)
-{
-    if (2 == height) {
-        avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
-                           weight_dst, offset);
-    } else if (4 == height) {
-        avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
-                           weight_dst, offset);
-    } else {
-        avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
-                           weight_dst, offset);
-    }
-}
-
-void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
-                                    int height, int log2_denom,
-                                    int weight_src, int offset_in)
-{
-    uint32_t offset_val;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_4x = stride << 2;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    __m256i zero = __lasx_xvldi(0);
-    __m256i src0, src1, src2, src3;
-    __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
-    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    __m256i wgt, denom, offset;
-
-    offset_val = (unsigned) offset_in << log2_denom;
-
-    wgt    = __lasx_xvreplgr2vr_h(weight_src);
-    offset = __lasx_xvreplgr2vr_h(offset_val);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
-    src -= stride_4x;
-    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
-              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
-    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
-              zero, src3, src0_l, src1_l, src2_l, src3_l);
-    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
-              zero, src3, src0_h, src1_h, src2_h, src3_h);
-    src0_l = __lasx_xvmul_h(wgt, src0_l);
-    src0_h = __lasx_xvmul_h(wgt, src0_h);
-    src1_l = __lasx_xvmul_h(wgt, src1_l);
-    src1_h = __lasx_xvmul_h(wgt, src1_h);
-    src2_l = __lasx_xvmul_h(wgt, src2_l);
-    src2_h = __lasx_xvmul_h(wgt, src2_h);
-    src3_l = __lasx_xvmul_h(wgt, src3_l);
-    src3_h = __lasx_xvmul_h(wgt, src3_h);
-    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
-              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
-    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
-              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
-    src0_l = __lasx_xvmaxi_h(src0_l, 0);
-    src0_h = __lasx_xvmaxi_h(src0_h, 0);
-    src1_l = __lasx_xvmaxi_h(src1_l, 0);
-    src1_h = __lasx_xvmaxi_h(src1_h, 0);
-    src2_l = __lasx_xvmaxi_h(src2_l, 0);
-    src2_h = __lasx_xvmaxi_h(src2_h, 0);
-    src3_l = __lasx_xvmaxi_h(src3_l, 0);
-    src3_h = __lasx_xvmaxi_h(src3_h, 0);
-    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
-    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
-    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
-    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
-    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
-    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
-    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
-    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
-    __lasx_xvstelm_d(src0_l, src, 0, 0);
-    __lasx_xvstelm_d(src0_h, src, 8, 0);
-    src += stride;
-    __lasx_xvstelm_d(src0_l, src, 0, 2);
-    __lasx_xvstelm_d(src0_h, src, 8, 2);
-    src += stride;
-    __lasx_xvstelm_d(src1_l, src, 0, 0);
-    __lasx_xvstelm_d(src1_h, src, 8, 0);
-    src += stride;
-    __lasx_xvstelm_d(src1_l, src, 0, 2);
-    __lasx_xvstelm_d(src1_h, src, 8, 2);
-    src += stride;
-    __lasx_xvstelm_d(src2_l, src, 0, 0);
-    __lasx_xvstelm_d(src2_h, src, 8, 0);
-    src += stride;
-    __lasx_xvstelm_d(src2_l, src, 0, 2);
-    __lasx_xvstelm_d(src2_h, src, 8, 2);
-    src += stride;
-    __lasx_xvstelm_d(src3_l, src, 0, 0);
-    __lasx_xvstelm_d(src3_h, src, 8, 0);
-    src += stride;
-    __lasx_xvstelm_d(src3_l, src, 0, 2);
-    __lasx_xvstelm_d(src3_h, src, 8, 2);
-    src += stride;
-
-    if (16 == height) {
-        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-        src += stride_4x;
-        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
-        src -= stride_4x;
-        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
-                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
-        DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
-                  zero, src3, src0_l, src1_l, src2_l, src3_l);
-        DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
-                  zero, src3, src0_h, src1_h, src2_h, src3_h);
-        src0_l = __lasx_xvmul_h(wgt, src0_l);
-        src0_h = __lasx_xvmul_h(wgt, src0_h);
-        src1_l = __lasx_xvmul_h(wgt, src1_l);
-        src1_h = __lasx_xvmul_h(wgt, src1_h);
-        src2_l = __lasx_xvmul_h(wgt, src2_l);
-        src2_h = __lasx_xvmul_h(wgt, src2_h);
-        src3_l = __lasx_xvmul_h(wgt, src3_l);
-        src3_h = __lasx_xvmul_h(wgt, src3_h);
-        DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
-                  offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
-        DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
-                  offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
-        src0_l = __lasx_xvmaxi_h(src0_l, 0);
-        src0_h = __lasx_xvmaxi_h(src0_h, 0);
-        src1_l = __lasx_xvmaxi_h(src1_l, 0);
-        src1_h = __lasx_xvmaxi_h(src1_h, 0);
-        src2_l = __lasx_xvmaxi_h(src2_l, 0);
-        src2_h = __lasx_xvmaxi_h(src2_h, 0);
-        src3_l = __lasx_xvmaxi_h(src3_l, 0);
-        src3_h = __lasx_xvmaxi_h(src3_h, 0);
-        src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
-        src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
-        src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
-        src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
-        src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
-        src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
-        src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
-        src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
-        __lasx_xvstelm_d(src0_l, src, 0, 0);
-        __lasx_xvstelm_d(src0_h, src, 8, 0);
-        src += stride;
-        __lasx_xvstelm_d(src0_l, src, 0, 2);
-        __lasx_xvstelm_d(src0_h, src, 8, 2);
-        src += stride;
-        __lasx_xvstelm_d(src1_l, src, 0, 0);
-        __lasx_xvstelm_d(src1_h, src, 8, 0);
-        src += stride;
-        __lasx_xvstelm_d(src1_l, src, 0, 2);
-        __lasx_xvstelm_d(src1_h, src, 8, 2);
-        src += stride;
-        __lasx_xvstelm_d(src2_l, src, 0, 0);
-        __lasx_xvstelm_d(src2_h, src, 8, 0);
-        src += stride;
-        __lasx_xvstelm_d(src2_l, src, 0, 2);
-        __lasx_xvstelm_d(src2_h, src, 8, 2);
-        src += stride;
-        __lasx_xvstelm_d(src3_l, src, 0, 0);
-        __lasx_xvstelm_d(src3_h, src, 8, 0);
-        src += stride;
-        __lasx_xvstelm_d(src3_l, src, 0, 2);
-        __lasx_xvstelm_d(src3_h, src, 8, 2);
-    }
-}
-
-static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
-                             int32_t log2_denom, int32_t weight_src,
-                             int32_t offset_in)
-{
-    uint32_t offset_val;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    __m256i wgt, zero = __lasx_xvldi(0);
-    __m256i src0, src0_h, src0_l;
-    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
-
-    offset_val = (unsigned) offset_in << log2_denom;
-
-    wgt    = __lasx_xvreplgr2vr_h(weight_src);
-    offset = __lasx_xvreplgr2vr_h(offset_val);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    src0_l = __lasx_xvilvl_b(zero, src0);
-    src0_h = __lasx_xvilvh_b(zero, src0);
-    src0_l = __lasx_xvmul_h(wgt, src0_l);
-    src0_h = __lasx_xvmul_h(wgt, src0_h);
-    src0_l = __lasx_xvsadd_h(src0_l, offset);
-    src0_h = __lasx_xvsadd_h(src0_h, offset);
-    src0_l = __lasx_xvmaxi_h(src0_l, 0);
-    src0_h = __lasx_xvmaxi_h(src0_h, 0);
-    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
-    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
-
-    src0 = __lasx_xvpickev_d(src0_h, src0_l);
-    __lasx_xvstelm_d(src0, src, 0, 0);
-    __lasx_xvstelm_d(src0, src + stride, 0, 1);
-    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
-    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
-}
-
-static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
-                             int32_t src_weight, int32_t offset_in)
-{
-    __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
-    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
-    uint32_t offset_val;
-    uint8_t* src_tmp = src;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_4x = stride << 2;
-    ptrdiff_t stride_3x = stride_2x + stride;
-
-    offset_val = (unsigned) offset_in << log2_denom;
-
-    wgt    = __lasx_xvreplgr2vr_h(src_weight);
-    offset = __lasx_xvreplgr2vr_h(offset_val);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
-              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src_tmp += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
-              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
-    DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
-    src0_l = __lasx_xvmul_h(wgt, src0_l);
-    src0_h = __lasx_xvmul_h(wgt, src0_h);
-    src1_l = __lasx_xvmul_h(wgt, src1_l);
-    src1_h = __lasx_xvmul_h(wgt, src1_h);
-    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
-              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
-    src0_l = __lasx_xvmaxi_h(src0_l, 0);
-    src0_h = __lasx_xvmaxi_h(src0_h, 0);
-    src1_l = __lasx_xvmaxi_h(src1_l, 0);
-    src1_h = __lasx_xvmaxi_h(src1_h, 0);
-    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
-    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
-    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
-    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
-
-    DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
-    __lasx_xvstelm_d(src0, src, 0, 0);
-    __lasx_xvstelm_d(src0, src + stride, 0, 1);
-    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
-    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
-    src += stride_4x;
-    __lasx_xvstelm_d(src1, src, 0, 0);
-    __lasx_xvstelm_d(src1, src + stride, 0, 1);
-    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
-    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
-}
-
-static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
-                              int32_t log2_denom, int32_t src_weight,
-                              int32_t offset_in)
-{
-    __m256i src0, src1, src2, src3;
-    __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
-    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
-    __m256i zero = __lasx_xvldi(0);
-    uint32_t offset_val;
-    uint8_t* src_tmp = src;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_4x = stride << 2;
-    ptrdiff_t stride_3x = stride_2x + stride;
-
-    offset_val = (unsigned) offset_in << log2_denom;
-
-    wgt    = __lasx_xvreplgr2vr_h(src_weight);
-    offset = __lasx_xvreplgr2vr_h(offset_val);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
-              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src_tmp += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
-              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src_tmp += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
-              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src_tmp += stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
-              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
-    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
-              src0_l, src1_l, src2_l, src3_l);
-    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
-              src0_h, src1_h, src2_h, src3_h);
-    src0_l = __lasx_xvmul_h(wgt, src0_l);
-    src0_h = __lasx_xvmul_h(wgt, src0_h);
-    src1_l = __lasx_xvmul_h(wgt, src1_l);
-    src1_h = __lasx_xvmul_h(wgt, src1_h);
-    src2_l = __lasx_xvmul_h(wgt, src2_l);
-    src2_h = __lasx_xvmul_h(wgt, src2_h);
-    src3_l = __lasx_xvmul_h(wgt, src3_l);
-    src3_h = __lasx_xvmul_h(wgt, src3_h);
-
-    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
-              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
-    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
-              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
-
-    src0_l = __lasx_xvmaxi_h(src0_l, 0);
-    src0_h = __lasx_xvmaxi_h(src0_h, 0);
-    src1_l = __lasx_xvmaxi_h(src1_l, 0);
-    src1_h = __lasx_xvmaxi_h(src1_h, 0);
-    src2_l = __lasx_xvmaxi_h(src2_l, 0);
-    src2_h = __lasx_xvmaxi_h(src2_h, 0);
-    src3_l = __lasx_xvmaxi_h(src3_l, 0);
-    src3_h = __lasx_xvmaxi_h(src3_h, 0);
-    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
-    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
-    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
-    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
-    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
-    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
-    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
-    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
-    DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
-              src3_h, src3_l, src0, src1, src2, src3);
-
-    __lasx_xvstelm_d(src0, src, 0, 0);
-    __lasx_xvstelm_d(src0, src + stride, 0, 1);
-    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
-    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
-    src += stride_4x;
-    __lasx_xvstelm_d(src1, src, 0, 0);
-    __lasx_xvstelm_d(src1, src + stride, 0, 1);
-    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
-    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
-    src += stride_4x;
-    __lasx_xvstelm_d(src2, src, 0, 0);
-    __lasx_xvstelm_d(src2, src + stride, 0, 1);
-    __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
-    __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
-    src += stride_4x;
-    __lasx_xvstelm_d(src3, src, 0, 0);
-    __lasx_xvstelm_d(src3, src + stride, 0, 1);
-    __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
-    __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
-}
-
-void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
-                                   int height, int log2_denom,
-                                   int weight_src, int offset)
-{
-    if (4 == height) {
-        avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
-    } else if (8 == height) {
-        avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
-    } else {
-        avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
-    }
-}
-
-static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
-                             int32_t log2_denom, int32_t weight_src,
-                             int32_t offset_in)
-{
-    uint32_t offset_val;
-    __m256i wgt, zero = __lasx_xvldi(0);
-    __m256i src0, tmp0, tmp1, denom, offset;
-
-    offset_val = (unsigned) offset_in << log2_denom;
-
-    wgt    = __lasx_xvreplgr2vr_h(weight_src);
-    offset = __lasx_xvreplgr2vr_h(offset_val);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
-    src0 = __lasx_xvilvl_w(tmp1, tmp0);
-    src0 = __lasx_xvilvl_b(zero, src0);
-    src0 = __lasx_xvmul_h(wgt, src0);
-    src0 = __lasx_xvsadd_h(src0, offset);
-    src0 = __lasx_xvmaxi_h(src0, 0);
-    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
-    __lasx_xvstelm_w(src0, src, 0, 0);
-    __lasx_xvstelm_w(src0, src + stride, 0, 1);
-}
-
-static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
-                             int32_t log2_denom, int32_t weight_src,
-                             int32_t offset_in)
-{
-    __m256i wgt;
-    __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
-    uint32_t offset_val;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-
-    offset_val = (unsigned) offset_in << log2_denom;
-
-    wgt    = __lasx_xvreplgr2vr_h(weight_src);
-    offset = __lasx_xvreplgr2vr_h(offset_val);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
-    src0 = __lasx_xvilvl_w(tmp1, tmp0);
-    src0 = __lasx_vext2xv_hu_bu(src0);
-    src0 = __lasx_xvmul_h(wgt, src0);
-    src0 = __lasx_xvsadd_h(src0, offset);
-    src0 = __lasx_xvmaxi_h(src0, 0);
-    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
-    __lasx_xvstelm_w(src0, src, 0, 0);
-    __lasx_xvstelm_w(src0, src + stride, 0, 1);
-    __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
-    __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
-}
-
-static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
-                             int32_t log2_denom, int32_t weight_src,
-                             int32_t offset_in)
-{
-    __m256i src0, src0_h, src0_l;
-    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
-    __m256i wgt, zero = __lasx_xvldi(0);
-    uint32_t offset_val;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_4x = stride << 2;
-    ptrdiff_t stride_3x = stride_2x + stride;
-
-    offset_val = (unsigned) offset_in << log2_denom;
-
-    wgt    = __lasx_xvreplgr2vr_h(weight_src);
-    offset = __lasx_xvreplgr2vr_h(offset_val);
-    denom  = __lasx_xvreplgr2vr_h(log2_denom);
-
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
-              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
-    src -= stride_4x;
-    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
-              tmp5, tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
-    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-    src0_l = __lasx_xvilvl_b(zero, src0);
-    src0_h = __lasx_xvilvh_b(zero, src0);
-    src0_l = __lasx_xvmul_h(wgt, src0_l);
-    src0_h = __lasx_xvmul_h(wgt, src0_h);
-    src0_l = __lasx_xvsadd_h(src0_l, offset);
-    src0_h = __lasx_xvsadd_h(src0_h, offset);
-    src0_l = __lasx_xvmaxi_h(src0_l, 0);
-    src0_h = __lasx_xvmaxi_h(src0_h, 0);
-    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
-    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
-    __lasx_xvstelm_w(src0_l, src, 0, 0);
-    __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
-    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
-    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
-    src += stride_4x;
-    __lasx_xvstelm_w(src0_l, src, 0, 4);
-    __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
-    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
-    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
-}
-
-void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
-                                   int height, int log2_denom,
-                                   int weight_src, int offset)
-{
-    if (2 == height) {
-        avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
-    } else if (4 == height) {
-        avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
-    } else {
-        avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
-    }
-}
-
 void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
 {
     __m256i src0, dst0, dst1, dst2, dst3, zero;
diff --git a/libavcodec/loongarch/h264dsp_loongarch.h b/libavcodec/loongarch/h264dsp_loongarch.h
index 28dca2b537..e17522dfe0 100644
--- a/libavcodec/loongarch/h264dsp_loongarch.h
+++ b/libavcodec/loongarch/h264dsp_loongarch.h
@@ -47,6 +47,50 @@ void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
                                     int16_t *block, int32_t dst_stride,
                                     const uint8_t nzc[15 * 8]);
 
+void ff_h264_h_lpf_luma_8_lsx(uint8_t *src, ptrdiff_t stride,
+                              int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_8_lsx(uint8_t *src, ptrdiff_t stride,
+                              int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_luma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                    int alpha, int beta);
+void ff_h264_v_lpf_luma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                    int alpha, int beta);
+void ff_h264_h_lpf_chroma_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_chroma_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_chroma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int alpha, int beta);
+void ff_h264_v_lpf_chroma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int alpha, int beta);
+void ff_biweight_h264_pixels16_8_lsx(uint8_t *dst, uint8_t *src,
+                                     ptrdiff_t stride, int height,
+                                     int log2_denom, int weight_dst,
+                                     int weight_src, int offset_in);
+void ff_biweight_h264_pixels8_8_lsx(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, int height,
+                                    int log2_denom, int weight_dst,
+                                    int weight_src, int offset);
+void ff_biweight_h264_pixels4_8_lsx(uint8_t *dst, uint8_t *src,
+                                    ptrdiff_t stride, int height,
+                                    int log2_denom, int weight_dst,
+                                    int weight_src, int offset);
+void ff_weight_h264_pixels16_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                   int height, int log2_denom,
+                                   int weight_src, int offset_in);
+void ff_weight_h264_pixels8_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                  int height, int log2_denom,
+                                  int weight_src, int offset);
+void ff_weight_h264_pixels4_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                  int height, int log2_denom,
+                                  int weight_src, int offset);
+void ff_h264_add_pixels4_8_lsx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_add_pixels8_8_lsx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_loop_filter_strength_lsx(int16_t bS[2][4][4], uint8_t nnz[40],
+                                      int8_t ref[2][40], int16_t mv[2][40][2],
+                                      int bidir, int edges, int step,
+                                      int mask_mv0, int mask_mv1, int field);
+
 #if HAVE_LASX
 void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
                                int alpha, int beta, int8_t *tc0);
@@ -56,24 +100,12 @@ void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
                                      int alpha, int beta);
 void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
                                      int alpha, int beta);
-void ff_h264_h_lpf_chroma_8_lasx(uint8_t *src, ptrdiff_t stride,
-                                 int alpha, int beta, int8_t *tc0);
-void ff_h264_v_lpf_chroma_8_lasx(uint8_t *src, ptrdiff_t stride,
-                                 int alpha, int beta, int8_t *tc0);
-void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
-                                       int alpha, int beta);
-void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
-                                       int alpha, int beta);
-void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
-                                      ptrdiff_t stride, int height,
+void ff_biweight_h264_pixels16_8_lasx(unsigned char *dst, unsigned char *src,
+                                      long int stride, int height,
                                       int log2_denom, int weight_dst,
                                       int weight_src, int offset_in);
-void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
-                                     ptrdiff_t stride, int height,
-                                     int log2_denom, int weight_dst,
-                                     int weight_src, int offset);
-void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
-                                     ptrdiff_t stride, int height,
+void ff_biweight_h264_pixels8_8_lasx(unsigned char *dst, unsigned char *src,
+                                     long int stride, int height,
                                      int log2_denom, int weight_dst,
                                      int weight_src, int offset);
 void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
@@ -82,9 +114,6 @@ void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
 void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
                                    int height, int log2_denom,
                                    int weight_src, int offset);
-void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
-                                   int height, int log2_denom,
-                                   int weight_src, int offset);
 void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
 
 void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
-- 
2.20.1

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] [PATCH v1 3/6] avcodec/la: Add LSX optimization for h264 chroma and intrapred.
  2023-05-04  8:49 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct Hao Chen
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 2/6] avcodec/la: Add LSX optimization for loop filter Hao Chen
@ 2023-05-04  8:49 ` Hao Chen
  2023-05-11  7:19   ` Shiyou Yin
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 4/6] avcodec/la: Add LSX optimization for h264 qpel Hao Chen
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Hao Chen @ 2023-05-04  8:49 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lu Wang

From: Lu Wang <wanglu@loongson.cn>

Rewrite the h264 chroma MC and pred16x16 plane functions in LoongArch
assembly, adding LSX versions so that LSX-only CPUs are also covered.

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 199fps
after:  214fps
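
For context: the three code paths in the new put/avg chroma routines
(.LOOP_D, .LOOP_E and the plain loop) follow the usual bilinear chroma
interpolation, and the A/B/C/D/E register comments match the generic C
template. A rough scalar sketch of what the mc8 put path computes
(illustration only, not code from this patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustration only: scalar reference for put_h264_chroma_mc8.
     * x and y are the 1/8-pel fractional offsets (0..7), h is the row count. */
    static void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B = x * (8 - y);
        const int C = (8 - x) * y;
        const int D = x * y;

        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 8; j++) {
                if (D) {                  /* .LOOP_D: full 2-D bilinear filter */
                    dst[j] = (A * src[j] + B * src[j + 1] +
                              C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
                } else if (B + C) {       /* .LOOP_E: 1-D filter, E = B + C */
                    const ptrdiff_t step = C ? stride : 1;
                    dst[j] = (A * src[j] + (B + C) * src[j + step] + 32) >> 6;
                } else {                  /* .LOOP: rounded copy, A == 64 */
                    dst[j] = (A * src[j] + 32) >> 6;
                }
            }
            dst += stride;
            src += stride;
        }
    }

The assembly selects one of these three cases up front (branching on D and
then on E = B + C), so each inner loop stays branch-free.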
---
 libavcodec/loongarch/Makefile                 |    4 +-
 .../loongarch/h264_intrapred_init_loongarch.c |   18 +-
 libavcodec/loongarch/h264_intrapred_lasx.c    |  121 --
 ...pred_lasx.h => h264_intrapred_loongarch.h} |   12 +-
 libavcodec/loongarch/h264chroma.S             |  966 +++++++++++++
 .../loongarch/h264chroma_init_loongarch.c     |   10 +-
 libavcodec/loongarch/h264chroma_lasx.c        | 1280 -----------------
 libavcodec/loongarch/h264chroma_lasx.h        |   36 -
 libavcodec/loongarch/h264chroma_loongarch.h   |   43 +
 libavcodec/loongarch/h264intrapred.S          |  299 ++++
 10 files changed, 1344 insertions(+), 1445 deletions(-)
 delete mode 100644 libavcodec/loongarch/h264_intrapred_lasx.c
 rename libavcodec/loongarch/{h264_intrapred_lasx.h => h264_intrapred_loongarch.h} (70%)
 create mode 100644 libavcodec/loongarch/h264chroma.S
 delete mode 100644 libavcodec/loongarch/h264chroma_lasx.c
 delete mode 100644 libavcodec/loongarch/h264chroma_lasx.h
 create mode 100644 libavcodec/loongarch/h264chroma_loongarch.h
 create mode 100644 libavcodec/loongarch/h264intrapred.S

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 6eabe71c0b..6e73e1bb6a 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -9,11 +9,9 @@ OBJS-$(CONFIG_HPELDSP)                += loongarch/hpeldsp_init_loongarch.o
 OBJS-$(CONFIG_IDCTDSP)                += loongarch/idctdsp_init_loongarch.o
 OBJS-$(CONFIG_VIDEODSP)               += loongarch/videodsp_init.o
 OBJS-$(CONFIG_HEVC_DECODER)           += loongarch/hevcdsp_init_loongarch.o
-LASX-OBJS-$(CONFIG_H264CHROMA)        += loongarch/h264chroma_lasx.o
 LASX-OBJS-$(CONFIG_H264QPEL)          += loongarch/h264qpel_lasx.o
 LASX-OBJS-$(CONFIG_H264DSP)           += loongarch/h264dsp_lasx.o \
                                          loongarch/h264_deblock_lasx.o
-LASX-OBJS-$(CONFIG_H264PRED)          += loongarch/h264_intrapred_lasx.o
 LASX-OBJS-$(CONFIG_VC1_DECODER)       += loongarch/vc1dsp_lasx.o
 LASX-OBJS-$(CONFIG_HPELDSP)           += loongarch/hpeldsp_lasx.o
 LASX-OBJS-$(CONFIG_IDCTDSP)           += loongarch/simple_idct_lasx.o  \
@@ -33,3 +31,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_la.o \
                                          loongarch/h264dsp.o
+LSX-OBJS-$(CONFIG_H264CHROMA)         += loongarch/h264chroma.o
+LSX-OBJS-$(CONFIG_H264PRED)           += loongarch/h264intrapred.o
diff --git a/libavcodec/loongarch/h264_intrapred_init_loongarch.c b/libavcodec/loongarch/h264_intrapred_init_loongarch.c
index 12620bd842..c415fa30da 100644
--- a/libavcodec/loongarch/h264_intrapred_init_loongarch.c
+++ b/libavcodec/loongarch/h264_intrapred_init_loongarch.c
@@ -21,7 +21,7 @@
 
 #include "libavutil/loongarch/cpu.h"
 #include "libavcodec/h264pred.h"
-#include "h264_intrapred_lasx.h"
+#include "h264_intrapred_loongarch.h"
 
 av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
                                          const int bit_depth,
@@ -30,6 +30,22 @@ av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
     int cpu_flags = av_get_cpu_flags();
 
     if (bit_depth == 8) {
+        if (have_lsx(cpu_flags)) {
+            if (chroma_format_idc <= 1) {
+            }
+            if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+            } else {
+                if (chroma_format_idc <= 1) {
+                }
+                if (codec_id == AV_CODEC_ID_SVQ3) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_svq3_8_lsx;
+                } else if (codec_id == AV_CODEC_ID_RV40) {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_rv40_8_lsx;
+                } else {
+                    h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_h264_8_lsx;
+                }
+            }
+        }
         if (have_lasx(cpu_flags)) {
             if (chroma_format_idc <= 1) {
             }
diff --git a/libavcodec/loongarch/h264_intrapred_lasx.c b/libavcodec/loongarch/h264_intrapred_lasx.c
deleted file mode 100644
index c38cd611b8..0000000000
--- a/libavcodec/loongarch/h264_intrapred_lasx.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
- * Contributed by Hao Chen <chenhao@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264_intrapred_lasx.h"
-
-#define PRED16X16_PLANE                                                        \
-    ptrdiff_t stride_1, stride_2, stride_3, stride_4, stride_5, stride_6;      \
-    ptrdiff_t stride_8, stride_15;                                             \
-    int32_t res0, res1, res2, res3, cnt;                                       \
-    uint8_t *src0, *src1;                                                      \
-    __m256i reg0, reg1, reg2, reg3, reg4;                                      \
-    __m256i tmp0, tmp1, tmp2, tmp3;                                            \
-    __m256i shuff = {0x0B040A0509060807, 0x0F000E010D020C03, 0, 0};            \
-    __m256i mult = {0x0004000300020001, 0x0008000700060005, 0, 0};             \
-    __m256i int_mult1 = {0x0000000100000000, 0x0000000300000002,               \
-                         0x0000000500000004, 0x0000000700000006};              \
-                                                                               \
-    stride_1 = -stride;                                                        \
-    stride_2 = stride << 1;                                                    \
-    stride_3 = stride_2 + stride;                                              \
-    stride_4 = stride_2 << 1;                                                  \
-    stride_5 = stride_4 + stride;                                              \
-    stride_6 = stride_3 << 1;                                                  \
-    stride_8 = stride_4 << 1;                                                  \
-    stride_15 = (stride_8 << 1) - stride;                                      \
-    src0 = src - 1;                                                            \
-    src1 = src0 + stride_8;                                                    \
-                                                                               \
-    reg0 = __lasx_xvldx(src0, -stride);                                        \
-    reg1 = __lasx_xvldx(src, (8 - stride));                                    \
-    reg0 = __lasx_xvilvl_d(reg1, reg0);                                        \
-    reg0 = __lasx_xvshuf_b(reg0, reg0, shuff);                                 \
-    reg0 = __lasx_xvhsubw_hu_bu(reg0, reg0);                                   \
-    reg0 = __lasx_xvmul_h(reg0, mult);                                         \
-    res1 = (src1[0] - src0[stride_6]) +                                        \
-        2 * (src1[stride] - src0[stride_5]) +                                  \
-        3 * (src1[stride_2] - src0[stride_4]) +                                \
-        4 * (src1[stride_3] - src0[stride_3]) +                                \
-        5 * (src1[stride_4] - src0[stride_2]) +                                \
-        6 * (src1[stride_5] - src0[stride]) +                                  \
-        7 * (src1[stride_6] - src0[0]) +                                       \
-        8 * (src0[stride_15] - src0[stride_1]);                                \
-    reg0 = __lasx_xvhaddw_w_h(reg0, reg0);                                     \
-    reg0 = __lasx_xvhaddw_d_w(reg0, reg0);                                     \
-    reg0 = __lasx_xvhaddw_q_d(reg0, reg0);                                     \
-    res0 = __lasx_xvpickve2gr_w(reg0, 0);                                      \
-
-#define PRED16X16_PLANE_END                                                    \
-    res2 = (src0[stride_15] + src[15 - stride] + 1) << 4;                      \
-    res3 = 7 * (res0 + res1);                                                  \
-    res2 -= res3;                                                              \
-    reg0 = __lasx_xvreplgr2vr_w(res0);                                         \
-    reg1 = __lasx_xvreplgr2vr_w(res1);                                         \
-    reg2 = __lasx_xvreplgr2vr_w(res2);                                         \
-    reg3 = __lasx_xvmul_w(reg0, int_mult1);                                    \
-    reg4 = __lasx_xvslli_w(reg0, 3);                                           \
-    reg4 = __lasx_xvadd_w(reg4, reg3);                                         \
-    for (cnt = 8; cnt--;) {                                                    \
-        tmp0 = __lasx_xvadd_w(reg2, reg3);                                     \
-        tmp1 = __lasx_xvadd_w(reg2, reg4);                                     \
-        tmp0 = __lasx_xvssrani_hu_w(tmp1, tmp0, 5);                            \
-        tmp0 = __lasx_xvpermi_d(tmp0, 0xD8);                                   \
-        reg2 = __lasx_xvadd_w(reg2, reg1);                                     \
-        tmp2 = __lasx_xvadd_w(reg2, reg3);                                     \
-        tmp3 = __lasx_xvadd_w(reg2, reg4);                                     \
-        tmp1 = __lasx_xvssrani_hu_w(tmp3, tmp2, 5);                            \
-        tmp1 = __lasx_xvpermi_d(tmp1, 0xD8);                                   \
-        tmp0 = __lasx_xvssrani_bu_h(tmp1, tmp0, 0);                            \
-        reg2 = __lasx_xvadd_w(reg2, reg1);                                     \
-        __lasx_xvstelm_d(tmp0, src, 0, 0);                                     \
-        __lasx_xvstelm_d(tmp0, src, 8, 2);                                     \
-        src += stride;                                                         \
-        __lasx_xvstelm_d(tmp0, src, 0, 1);                                     \
-        __lasx_xvstelm_d(tmp0, src, 8, 3);                                     \
-        src += stride;                                                         \
-    }
-
-
-void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
-    PRED16X16_PLANE
-    res0 = (5 * res0 + 32) >> 6;
-    res1 = (5 * res1 + 32) >> 6;
-    PRED16X16_PLANE_END
-}
-
-void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
-    PRED16X16_PLANE
-    res0 = (res0 + (res0 >> 2)) >> 4;
-    res1 = (res1 + (res1 >> 2)) >> 4;
-    PRED16X16_PLANE_END
-}
-
-void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
-    PRED16X16_PLANE
-    cnt  = (5 * (res0/4)) / 16;
-    res0 = (5 * (res1/4)) / 16;
-    res1 = cnt;
-    PRED16X16_PLANE_END
-}
diff --git a/libavcodec/loongarch/h264_intrapred_lasx.h b/libavcodec/loongarch/h264_intrapred_loongarch.h
similarity index 70%
rename from libavcodec/loongarch/h264_intrapred_lasx.h
rename to libavcodec/loongarch/h264_intrapred_loongarch.h
index 0c2653300c..39be87ee9f 100644
--- a/libavcodec/loongarch/h264_intrapred_lasx.h
+++ b/libavcodec/loongarch/h264_intrapred_loongarch.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
  * Contributed by Hao Chen <chenhao@loongson.cn>
  *
  * This file is part of FFmpeg.
@@ -19,13 +19,17 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
-#define AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
+#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
 
 #include "libavcodec/avcodec.h"
 
+void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride);
+
 void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride);
 void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride);
 void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride);
 
-#endif  // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
+#endif  // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264chroma.S b/libavcodec/loongarch/h264chroma.S
new file mode 100644
index 0000000000..353b8d004b
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma.S
@@ -0,0 +1,966 @@
+/*
+ * Loongson LSX/LASX optimized h264chroma
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                   int h, int x, int y) */
+function ff_put_h264_chroma_mc8_lsx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    vreplgr2vr.b     vr0,     t3
+    vreplgr2vr.b     vr1,     t4
+    vreplgr2vr.b     vr2,     t5
+    vreplgr2vr.b     vr3,     t6
+    vreplgr2vr.b     vr4,     t0
+    slli.d           t2,      a2,     1
+    add.d            t3,      t2,     a2
+    slli.d           t4,      a2,     2
+
+    bge              zero,    t6,     .ENDLOOP_D
+    move             t1,      a3
+    vilvl.b          vr9,     vr1,    vr0
+    vilvl.b          vr10,    vr3,    vr2
+.LOOP_D:
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    vilvl.b          vr11,    vr6,    vr5
+    vilvl.b          vr12,    vr8,    vr7
+    vmulwev.h.bu     vr13,    vr9,    vr11
+    vmaddwod.h.bu    vr13,    vr9,    vr11
+    vmulwev.h.bu     vr14,    vr10,   vr12
+    vmaddwod.h.bu    vr14,    vr10,   vr12
+    vadd.h           vr13,    vr13,   vr14
+    vsrarni.b.h      vr13,    vr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    vilvl.b          vr11,    vr8,    vr7
+    vilvl.b          vr12,    vr6,    vr5
+    vmulwev.h.bu     vr13,    vr9,    vr11
+    vmaddwod.h.bu    vr13,    vr9,    vr11
+    vmulwev.h.bu     vr14,    vr10,   vr12
+    vmaddwod.h.bu    vr14,    vr10,   vr12
+    vadd.h           vr13,    vr13,   vr14
+    vsrarni.b.h      vr13,    vr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    vilvl.b          vr11,    vr6,    vr5
+    vilvl.b          vr12,    vr8,    vr7
+    vmulwev.h.bu     vr13,    vr9,    vr11
+    vmaddwod.h.bu    vr13,    vr9,    vr11
+    vmulwev.h.bu     vr14,    vr10,   vr12
+    vmaddwod.h.bu    vr14,    vr10,   vr12
+    vadd.h           vr13,    vr13,   vr14
+    vsrarni.b.h      vr13,    vr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    vilvl.b          vr11,    vr8,    vr7
+    vilvl.b          vr12,    vr6,    vr5
+    vmulwev.h.bu     vr13,    vr9,    vr11
+    vmaddwod.h.bu    vr13,    vr9,    vr11
+    vmulwev.h.bu     vr14,    vr10,   vr12
+    vmaddwod.h.bu    vr14,    vr10,   vr12
+    vadd.h           vr13,    vr13,   vr14
+    vsrarni.b.h      vr13,    vr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP_D
+    b                .ENDLOOP
+.ENDLOOP_D:
+
+    bge              zero,    t0,     .ENDLOOP_E
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    vilvl.b          vr7,     vr4,    vr0
+.LOOP_E:
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP_E
+    b                .ENDLOOP
+.ENDLOOP_E:
+
+    move             t1,      a3
+.LOOP:
+    vld              vr5,     a1,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     a2
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     t2
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     t3
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t4
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP
+.ENDLOOP:
+endfunc
+
+/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                   int h, int x, int y) */
+function ff_avg_h264_chroma_mc8_lsx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    vreplgr2vr.b     vr0,     t3
+    vreplgr2vr.b     vr1,     t4
+    vreplgr2vr.b     vr2,     t5
+    vreplgr2vr.b     vr3,     t6
+    vreplgr2vr.b     vr4,     t0
+    slli.d           t2,      a2,     1
+    add.d            t3,      t2,     a2
+    slli.d           t4,      a2,     2
+
+    bge              zero,    t6,     .ENDLOOPD
+    move             t1,      a3
+    vilvl.b          vr9,     vr1,    vr0
+    vilvl.b          vr10,    vr3,    vr2
+.LOOPD:
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    vld              vr11,    a0,     0
+    vilvl.b          vr12,    vr6,    vr5
+    vilvl.b          vr13,    vr8,    vr7
+    vmulwev.h.bu     vr14,    vr9,    vr12
+    vmaddwod.h.bu    vr14,    vr9,    vr12
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrari.h         vr14,    vr14,   6
+    vsllwil.hu.bu    vr11,    vr11,   0
+    vadd.h           vr11,    vr14,   vr11
+    vsrarni.b.h      vr11,    vr11,   1
+    vstelm.d         vr11,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    vld              vr11,    a0,     0
+    vilvl.b          vr12,    vr8,    vr7
+    vilvl.b          vr13,    vr6,    vr5
+    vmulwev.h.bu     vr14,    vr9,    vr12
+    vmaddwod.h.bu    vr14,    vr9,    vr12
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrari.h         vr14,    vr14,   6
+    vsllwil.hu.bu    vr11,    vr11,   0
+    vadd.h           vr11,    vr14,   vr11
+    vsrarni.b.h      vr11,    vr11,   1
+    vstelm.d         vr11,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    vld              vr11,    a0,     0
+    vilvl.b          vr12,    vr6,    vr5
+    vilvl.b          vr13,    vr8,    vr7
+    vmulwev.h.bu     vr14,    vr9,    vr12
+    vmaddwod.h.bu    vr14,    vr9,    vr12
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrari.h         vr14,    vr14,   6
+    vsllwil.hu.bu    vr11,    vr11,   0
+    vadd.h           vr11,    vr14,   vr11
+    vsrarni.b.h      vr11,    vr11,   1
+    vstelm.d         vr11,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    vld              vr11,    a0,     0
+    vilvl.b          vr12,    vr8,    vr7
+    vilvl.b          vr13,    vr6,    vr5
+    vmulwev.h.bu     vr14,    vr9,    vr12
+    vmaddwod.h.bu    vr14,    vr9,    vr12
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrari.h         vr14,    vr14,   6
+    vsllwil.hu.bu    vr11,    vr11,   0
+    vadd.h           vr11,    vr14,   vr11
+    vsrarni.b.h      vr11,    vr11,   1
+    vstelm.d         vr11,    a0,     0,    0
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPD
+    b                .ENDLOOPELSE
+.ENDLOOPD:
+
+    bge              zero,    t0,     .ENDLOOPE
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    vilvl.b          vr7,     vr4,    vr0
+.LOOPE:
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vld              vr8,     a0,     0
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vld              vr8,     a0,     0
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vld              vr8,     a0,     0
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vld              vr8,     a0,     0
+    vilvl.b          vr5,     vr6,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPE
+    b                .ENDLOOPELSE
+.ENDLOOPE:
+
+    move             t1,      a3
+.LOOPELSE:
+    vld              vr5,     a1,     0
+    vld              vr8,     a0,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     a2
+    vld              vr8,     a0,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     t2
+    vld              vr8,     a0,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vldx             vr5,     a1,     t3
+    vld              vr8,     a0,     0
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrari.h         vr6,     vr6,    6
+    vsllwil.hu.bu    vr8,     vr8,    0
+    vadd.h           vr8,     vr6,    vr8
+    vsrarni.b.h      vr8,     vr8,    1
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t4
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPELSE
+.ENDLOOPELSE:
+endfunc
+
+/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                   int h, int x, int y) */
+function ff_put_h264_chroma_mc4_lsx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    slli.d           t8,      a2,     1
+    vreplgr2vr.b     vr0,     t3
+    vreplgr2vr.b     vr1,     t4
+    vreplgr2vr.b     vr2,     t5
+    vreplgr2vr.b     vr3,     t6
+    vreplgr2vr.b     vr4,     t0
+
+    bge              zero,    t6,     .ENDPUT_D
+    move             t1,      a3
+    vilvl.b          vr9,     vr1,    vr0
+    vilvl.b          vr10,    vr3,    vr2
+.PUT_D:
+    vld              vr5,     a1,     0
+    vld              vr6,     a1,     1
+    add.d            a1,      a1,     a2
+    vld              vr7,     a1,     0
+    vld              vr8,     a1,     1
+    add.d            a1,      a1,     a2
+    vld              vr11,    a1,     0
+    vld              vr12,    a1,     1
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr7,     vr8,    vr7
+    vilvl.b          vr13,    vr12,   vr11
+    vilvl.d          vr5,     vr7,    vr5
+    vilvl.d          vr13,    vr13,   vr7
+    vmulwev.h.bu     vr14,    vr9,    vr5
+    vmaddwod.h.bu    vr14,    vr9,    vr5
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    vadd.h           vr14,    vr14,   vr15
+    vsrarni.b.h      vr14,    vr14,   6
+    vstelm.w         vr14,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr14,    a0,     0,    1
+    add.d            a0,      a0,     a2
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT_D
+    b                .ENDPUT
+.ENDPUT_D:
+
+    bge              zero,    t0,     .ENDPUT_E
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    vilvl.b          vr7,     vr4,    vr0
+.PUT_E:
+    vld              vr5,     a1,     0
+    vldx             vr6,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    add.d            a1,      a1,     a2
+    vld              vr8,     a1,     0
+    vldx             vr9,     a1,     t7
+    vilvl.b          vr8,     vr9,    vr8
+    vilvl.d          vr5,     vr8,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.w         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr6,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT_E
+    b                .ENDPUT
+.ENDPUT_E:
+
+    move             t1,      a3
+.PUT:
+    vld              vr5,     a1,     0
+    vldx             vr8,     a1,     a2
+    vilvl.w          vr5,     vr8,    vr5
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vsrarni.b.h      vr7,     vr7,    6
+    vilvl.b          vr6,     vr7,    vr6
+    vstelm.w         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr6,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t8
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT
+.ENDPUT:
+endfunc
+
+/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                    int h, int x, int y) */
+function ff_put_h264_chroma_mc8_lasx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    xvreplgr2vr.b    xr0,     t3
+    xvreplgr2vr.b    xr1,     t4
+    xvreplgr2vr.b    xr2,     t5
+    xvreplgr2vr.b    xr3,     t6
+    xvreplgr2vr.b    xr4,     t0
+    slli.d           t2,      a2,     1
+    add.d            t3,      t2,     a2
+    slli.d           t4,      a2,     2
+
+    bge              zero,    t6,     .ENDLOOP_DA
+    move             t1,      a3
+    xvilvl.b         xr9,     xr1,    xr0
+    xvilvl.b         xr10,    xr3,    xr2
+.LOOP_DA:
+    fld.d            f5,      a1,     0
+    fld.d            f6,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f7,      a1,     0
+    fld.d            f8,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f13,     a1,     0
+    fld.d            f14,     a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f15,     a1,     0
+    fld.d            f16,     a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f17,     a1,     0
+    fld.d            f18,     a1,     1
+    vilvl.b          vr11,    vr6,    vr5
+    vilvl.b          vr12,    vr8,    vr7
+    vilvl.b          vr14,    vr14,   vr13
+    vilvl.b          vr15,    vr16,   vr15
+    vilvl.b          vr16,    vr18,   vr17
+    xvpermi.q        xr11,    xr12,   0x02
+    xvpermi.q        xr12,    xr14,   0x02
+    xvpermi.q        xr14,    xr15,   0x02
+    xvpermi.q        xr15,    xr16,   0x02
+
+    xvmulwev.h.bu    xr19,    xr9,    xr11
+    xvmaddwod.h.bu   xr19,    xr9,    xr11
+    xvmulwev.h.bu    xr20,    xr10,   xr12
+    xvmaddwod.h.bu   xr20,    xr10,   xr12
+    xvadd.h          xr21,    xr19,   xr20
+    xvsrarni.b.h     xr21,    xr21,   6
+    vstelm.d         vr21,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr21,    a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvmulwev.h.bu    xr13,    xr9,    xr14
+    xvmaddwod.h.bu   xr13,    xr9,    xr14
+    xvmulwev.h.bu    xr14,    xr10,   xr15
+    xvmaddwod.h.bu   xr14,    xr10,   xr15
+    xvadd.h          xr13,    xr13,   xr14
+    xvsrarni.b.h     xr13,    xr13,   6
+    vstelm.d         vr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr13,    a0,     0,    2
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP_DA
+    b                .ENDLOOPA
+.ENDLOOP_DA:
+
+    bge              zero,    t0,     .ENDLOOP_EA
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    xvilvl.b         xr7,     xr4,    xr0
+.LOOP_EA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f9,      a1,     0
+    fldx.d           f10,     a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f11,     a1,     0
+    fldx.d           f12,     a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f13,     a1,     0
+    fldx.d           f14,     a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr9,     vr10,   vr9
+    vilvl.b          vr11,    vr12,   vr11
+    vilvl.b          vr13,    vr14,   vr13
+    xvpermi.q        xr5,     xr9,    0x02
+    xvpermi.q        xr11,    xr13,   0x02
+
+    xvmulwev.h.bu    xr8,     xr7,    xr5
+    xvmaddwod.h.bu   xr8,     xr7,    xr5
+    xvmulwev.h.bu    xr6,     xr7,    xr11
+    xvmaddwod.h.bu   xr6,     xr7,    xr11
+    xvsrarni.b.h     xr8,     xr8,    6
+    vstelm.d         vr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr8,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvsrarni.b.h     xr6,     xr6,    6
+    vstelm.d         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr6,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOP_EA
+    b                .ENDLOOPA
+.ENDLOOP_EA:
+
+    move             t1,      a3
+.LOOPA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     a2
+    fldx.d           f7,      a1,     t2
+    fldx.d           f8,      a1,     t3
+    vilvl.d          vr5,     vr6,    vr5
+    vilvl.d          vr7,     vr8,    vr7
+    xvpermi.q        xr5,     xr7,    0x02
+    xvmulwev.h.bu    xr6,     xr0,    xr5
+    xvmulwod.h.bu    xr7,     xr0,    xr5
+    xvilvl.h         xr8,     xr7,    xr6
+    xvilvh.h         xr9,     xr7,    xr6
+    xvsrarni.b.h     xr9,     xr8,    6
+    vstelm.d         vr9,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.d         vr9,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr9,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr9,     a0,     0,    3
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t4
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPA
+.ENDLOOPA:
+endfunc
+
+/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                    int h, int x, int y) */
+function ff_avg_h264_chroma_mc8_lasx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    xvreplgr2vr.b    xr0,     t3
+    xvreplgr2vr.b    xr1,     t4
+    xvreplgr2vr.b    xr2,     t5
+    xvreplgr2vr.b    xr3,     t6
+    xvreplgr2vr.b    xr4,     t0
+    slli.d           t2,      a2,     1
+    add.d            t3,      t2,     a2
+    slli.d           t4,      a2,     2
+
+    bge              zero,    t6,     .ENDLOOPDA
+    move             t1,      a3
+    xvilvl.b         xr9,     xr1,    xr0
+    xvilvl.b         xr10,    xr3,    xr2
+.LOOPDA:
+    fld.d            f5,      a1,     0
+    fld.d            f6,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f7,      a1,     0
+    fld.d            f8,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f11,     a1,     0
+    fld.d            f12,     a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f13,     a1,     0
+    fld.d            f14,     a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f15,     a1,     0
+    fld.d            f16,     a1,     1
+    fld.d            f17,     a0,     0
+    fldx.d           f18,     a0,     a2
+    fldx.d           f19,     a0,     t2
+    fldx.d           f20,     a0,     t3
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr7,     vr8,    vr7
+    vilvl.b          vr11,    vr12,   vr11
+    vilvl.b          vr13,    vr14,   vr13
+    vilvl.b          vr16,    vr16,   vr15
+    xvpermi.q        xr5,     xr7,    0x02
+    xvpermi.q        xr7,     xr11,   0x02
+    xvpermi.q        xr11,    xr13,   0x02
+    xvpermi.q        xr13,    xr16,   0x02
+    xvpermi.q        xr17,    xr18,   0x02
+    xvpermi.q        xr19,    xr20,   0x02
+
+    xvmulwev.h.bu    xr14,    xr9,    xr5
+    xvmaddwod.h.bu   xr14,    xr9,    xr5
+    xvmulwev.h.bu    xr15,    xr10,   xr7
+    xvmaddwod.h.bu   xr15,    xr10,   xr7
+    xvadd.h          xr14,    xr14,   xr15
+    xvsrari.h        xr14,    xr14,   6
+    xvsllwil.hu.bu   xr17,    xr17,   0
+    xvadd.h          xr20,    xr14,   xr17
+    xvsrarni.b.h     xr20,    xr20,   1
+    xvstelm.d        xr20,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr20,    a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvmulwev.h.bu    xr14,    xr9,    xr11
+    xvmaddwod.h.bu   xr14,    xr9,    xr11
+    xvmulwev.h.bu    xr15,    xr10,   xr13
+    xvmaddwod.h.bu   xr15,    xr10,   xr13
+    xvadd.h          xr14,    xr14,   xr15
+    xvsrari.h        xr14,    xr14,   6
+    xvsllwil.hu.bu   xr19,    xr19,   0
+    xvadd.h          xr21,    xr14,   xr19
+    xvsrarni.b.h     xr21,    xr21,   1
+    xvstelm.d        xr21,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr21,    a0,     0,    2
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPDA
+    b                .ENDLOOPELSEA
+.ENDLOOPDA:
+
+    bge              zero,    t0,     .ENDLOOPEA
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    xvilvl.b         xr7,     xr4,    xr0
+.LOOPEA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f8,      a1,     0
+    fldx.d           f9,      a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f10,     a1,     0
+    fldx.d           f11,     a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f12,     a1,     0
+    fldx.d           f13,     a1,     t7
+    add.d            a1,      a1,     a2
+    fld.d            f14,     a0,     0
+    fldx.d           f15,     a0,     a2
+    fldx.d           f16,     a0,     t2
+    fldx.d           f17,     a0,     t3
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr8,     vr9,    vr8
+    vilvl.b          vr10,    vr11,   vr10
+    vilvl.b          vr12,    vr13,   vr12
+    xvpermi.q        xr5,     xr8,    0x02
+    xvpermi.q        xr10,    xr12,   0x02
+    xvpermi.q        xr14,    xr15,   0x02
+    xvpermi.q        xr16,    xr17,   0x02
+
+    xvmulwev.h.bu    xr6,     xr7,    xr5
+    xvmaddwod.h.bu   xr6,     xr7,    xr5
+    xvsrari.h        xr6,     xr6,    6
+    xvsllwil.hu.bu   xr14,    xr14,   0
+    xvadd.h          xr8,     xr6,    xr14
+    xvsrarni.b.h     xr8,     xr8,    1
+    xvstelm.d        xr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr8,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvmulwev.h.bu    xr6,     xr7,    xr10
+    xvmaddwod.h.bu   xr6,     xr7,    xr10
+    xvsrari.h        xr6,     xr6,    6
+    xvsllwil.hu.bu   xr16,    xr16,   0
+    xvadd.h          xr8,     xr6,    xr16
+    xvsrarni.b.h     xr8,     xr8,    1
+    xvstelm.d        xr8,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr8,     a0,     0,    2
+    add.d            a0,      a0,     a2
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPEA
+    b                .ENDLOOPELSEA
+.ENDLOOPEA:
+
+    move             t1,      a3
+.LOOPELSEA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     a2
+    fldx.d           f7,      a1,     t2
+    fldx.d           f8,      a1,     t3
+    fld.d            f9,      a0,     0
+    fldx.d           f10,     a0,     a2
+    fldx.d           f11,     a0,     t2
+    fldx.d           f12,     a0,     t3
+    xvpermi.q        xr5,     xr6,    0x02
+    xvpermi.q        xr7,     xr8,    0x02
+    xvpermi.q        xr9,     xr10,   0x02
+    xvpermi.q        xr11,    xr12,   0x02
+
+    xvmulwev.h.bu    xr12,    xr0,    xr5
+    xvmulwod.h.bu    xr13,    xr0,    xr5
+    xvilvl.h         xr12,    xr13,   xr12
+    xvsrari.h        xr12,    xr12,   6
+    xvsllwil.hu.bu   xr9,     xr9,    0
+    xvadd.h          xr9,     xr12,   xr9
+    xvsrarni.b.h     xr9,     xr9,    1
+    xvstelm.d        xr9,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr9,     a0,     0,    2
+    add.d            a0,      a0,     a2
+    xvmulwev.h.bu    xr12,    xr0,    xr7
+    xvmulwod.h.bu    xr13,    xr0,    xr7
+    xvilvl.h         xr12,    xr13,   xr12
+    xvsrari.h        xr12,    xr12,   6
+    xvsllwil.hu.bu   xr11,    xr11,   0
+    xvadd.h          xr13,    xr12,   xr11
+    xvsrarni.b.h     xr13,    xr13,   1
+    xvstelm.d        xr13,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    xvstelm.d        xr13,    a0,     0,    2
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t4
+
+    addi.d           t1,      t1,     -4
+    blt              zero,    t1,     .LOOPELSEA
+.ENDLOOPELSEA:
+endfunc
+
+/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                    int h, int x, int y) */
+function ff_put_h264_chroma_mc4_lasx
+    li.d             t8,      8
+    sub.d            t1,      t8,     a4     // 8-x
+    sub.d            t2,      t8,     a5     // 8-y
+    mul.d            t3,      t1,     t2     // A
+    mul.d            t4,      a4,     t2     // B
+    mul.d            t5,      t1,     a5     // C
+    mul.d            t6,      a4,     a5     // D
+    add.d            t0,      t4,     t5     // E
+    slli.d           t8,      a2,     1
+    vreplgr2vr.b     vr0,     t3
+    vreplgr2vr.b     vr1,     t4
+    vreplgr2vr.b     vr2,     t5
+    vreplgr2vr.b     vr3,     t6
+    vreplgr2vr.b     vr4,     t0
+
+    bge              zero,    t6,     .ENDPUT_DA
+    move             t1,      a3
+    vilvl.b          vr9,     vr1,    vr0
+    vilvl.b          vr10,    vr3,    vr2
+.PUT_DA:
+    fld.d            f5,      a1,     0
+    fld.d            f6,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f7,      a1,     0
+    fld.d            f8,      a1,     1
+    add.d            a1,      a1,     a2
+    fld.d            f11,     a1,     0
+    fld.d            f12,     a1,     1
+    vilvl.b          vr5,     vr6,    vr5
+    vilvl.b          vr7,     vr8,    vr7
+    vilvl.b          vr13,    vr12,   vr11
+    vilvl.d          vr5,     vr7,    vr5
+    vilvl.d          vr13,    vr13,   vr7
+    vmulwev.h.bu     vr14,    vr9,    vr5
+    vmaddwod.h.bu    vr14,    vr9,    vr5
+    vmulwev.h.bu     vr15,    vr10,   vr13
+    vmaddwod.h.bu    vr15,    vr10,   vr13
+    xvadd.h          xr14,    xr14,   xr15
+    vsrarni.b.h      vr16,    vr14,   6
+    vstelm.w         vr16,    a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr16,    a0,     0,    1
+    add.d            a0,      a0,     a2
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT_DA
+    b                .ENDPUTA
+.ENDPUT_DA:
+
+    bge              zero,    t0,     .ENDPUT_EA
+    move             t1,      a3
+    li.d             t7,      1
+    slt              t8,      zero,   t5
+    maskeqz          t5,      a2,     t8
+    masknez          t7,      t7,     t8
+    or               t7,      t7,     t5
+    vilvl.b          vr7,     vr4,    vr0
+.PUT_EA:
+    fld.d            f5,      a1,     0
+    fldx.d           f6,      a1,     t7
+    vilvl.b          vr5,     vr6,    vr5
+    add.d            a1,      a1,     a2
+    fld.d            f8,      a1,     0
+    fldx.d           f9,      a1,     t7
+    vilvl.b          vr8,     vr9,    vr8
+    vilvl.d          vr5,     vr8,    vr5
+    vmulwev.h.bu     vr6,     vr7,    vr5
+    vmaddwod.h.bu    vr6,     vr7,    vr5
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.w         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr6,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     a2
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUT_EA
+    b                .ENDPUTA
+.ENDPUT_EA:
+
+    move             t1,      a3
+.PUTA:
+    fld.d            f5,      a1,     0
+    fldx.d           f8,      a1,     a2
+    vilvl.w          vr5,     vr8,    vr5
+    vmulwev.h.bu     vr6,     vr0,    vr5
+    vmulwod.h.bu     vr7,     vr0,    vr5
+    vilvl.h          vr6,     vr7,    vr6
+    vsrarni.b.h      vr6,     vr6,    6
+    vstelm.w         vr6,     a0,     0,    0
+    add.d            a0,      a0,     a2
+    vstelm.w         vr6,     a0,     0,    1
+    add.d            a0,      a0,     a2
+    add.d            a1,      a1,     t8
+    addi.d           t1,      t1,     -2
+    blt              zero,    t1,     .PUTA
+.ENDPUTA:
+endfunc
diff --git a/libavcodec/loongarch/h264chroma_init_loongarch.c b/libavcodec/loongarch/h264chroma_init_loongarch.c
index 0ca24ecc47..40a957aad3 100644
--- a/libavcodec/loongarch/h264chroma_init_loongarch.c
+++ b/libavcodec/loongarch/h264chroma_init_loongarch.c
@@ -19,7 +19,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "h264chroma_lasx.h"
+#include "h264chroma_loongarch.h"
 #include "libavutil/attributes.h"
 #include "libavutil/loongarch/cpu.h"
 #include "libavcodec/h264chroma.h"
@@ -27,6 +27,14 @@
 av_cold void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
+    if (have_lsx(cpu_flags)) {
+        if (bit_depth <= 8) {
+            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lsx;
+            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lsx;
+            c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lsx;
+        }
+    }
+
     if (have_lasx(cpu_flags)) {
         if (bit_depth <= 8) {
             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lasx;
diff --git a/libavcodec/loongarch/h264chroma_lasx.c b/libavcodec/loongarch/h264chroma_lasx.c
deleted file mode 100644
index 1c0e002bdf..0000000000
--- a/libavcodec/loongarch/h264chroma_lasx.c
+++ /dev/null
@@ -1,1280 +0,0 @@
-/*
- * Loongson LASX optimized h264chroma
- *
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "h264chroma_lasx.h"
-#include "libavutil/attributes.h"
-#include "libavutil/avassert.h"
-#include "libavutil/loongarch/loongson_intrinsics.h"
-
-static const uint8_t chroma_mask_arr[64] = {
-    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
-    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
-};
-
-static av_always_inline void avc_chroma_hv_8x4_lasx(const uint8_t *src, uint8_t *dst,
-                             ptrdiff_t stride, uint32_t coef_hor0,
-                             uint32_t coef_hor1, uint32_t coef_ver0,
-                             uint32_t coef_ver1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride_2x << 1;
-    __m256i src0, src1, src2, src3, src4, out;
-    __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
-    __m256i mask;
-    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
-    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
-    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
-    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
-              src1, src2, src3, src4);
-    DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
-    src0 = __lasx_xvshuf_b(src0, src0, mask);
-    DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
-    res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
-    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
-    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
-    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
-    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
-    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
-    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
-    out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
-    __lasx_xvstelm_d(out, dst, 0, 0);
-    __lasx_xvstelm_d(out, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hv_8x8_lasx(const uint8_t *src, uint8_t *dst,
-                             ptrdiff_t stride, uint32_t coef_hor0,
-                             uint32_t coef_hor1, uint32_t coef_ver0,
-                             uint32_t coef_ver1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    __m256i out0, out1;
-    __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
-    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
-    __m256i mask;
-    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
-    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
-    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
-    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
-              src1, src2, src3, src4);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
-              src5, src6, src7, src8);
-    DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
-              src8, src7, 0x20, src1, src3, src5, src7);
-    src0 = __lasx_xvshuf_b(src0, src0, mask);
-    DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
-              src7, mask, src1, src3, src5, src7);
-    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
-              coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
-    res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
-    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
-    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
-    res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
-    res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
-    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
-    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
-    res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
-    res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
-    DUP4_ARG3(__lasx_xvmadd_h, res_vt0, res_hz0, coeff_vt_vec1, res_vt1, res_hz1, coeff_vt_vec1,
-              res_vt2, res_hz2, coeff_vt_vec1, res_vt3, res_hz3, coeff_vt_vec1,
-              res_vt0, res_vt1, res_vt2, res_vt3);
-    DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, out0, out1);
-    __lasx_xvstelm_d(out0, dst, 0, 0);
-    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(out1, dst, 0, 0);
-    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_8x4_lasx(const uint8_t *src, uint8_t *dst,
-                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    __m256i src0, src1, src2, src3, out;
-    __m256i res0, res1;
-    __m256i mask;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
-    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src1, src2);
-    src3 = __lasx_xvldx(src, stride_3x);
-    DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
-    DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
-    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
-    __lasx_xvstelm_d(out, dst, 0, 0);
-    __lasx_xvstelm_d(out, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-
-}
-
-static av_always_inline void avc_chroma_hz_8x8_lasx(const uint8_t *src, uint8_t *dst,
-                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
-    __m256i out0, out1;
-    __m256i res0, res1, res2, res3;
-    __m256i mask;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
-              src1, src2, src3, src4);
-    src += stride_4x;
-    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src5, src6);
-    src7 = __lasx_xvldx(src, stride_3x);
-    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
-              src7, src6, 0x20, src0, src2, src4, src6);
-    DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, mask,
-              src6, src6, mask, src0, src2, src4, src6);
-    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
-              coeff_vec, res0, res1, res2, res3);
-    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
-    __lasx_xvstelm_d(out0, dst, 0, 0);
-    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(out1, dst, 0, 0);
-    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_nonmult_lasx(const uint8_t *src,
-                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
-                             uint32_t coeff1, int32_t height)
-{
-    uint32_t row;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i src0, src1, src2, src3, out;
-    __m256i res0, res1;
-    __m256i mask;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    mask = __lasx_xvld(chroma_mask_arr, 0);
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-
-    for (row = height >> 2; row--;) {
-        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-                  src0, src1, src2, src3);
-        src += stride_4x;
-        DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
-        DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
-        DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
-        out = __lasx_xvssrarni_bu_h(res1, res0, 6);
-        __lasx_xvstelm_d(out, dst, 0, 0);
-        __lasx_xvstelm_d(out, dst + stride, 0, 2);
-        __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
-        __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-        dst += stride_4x;
-    }
-
-    if ((height & 3)) {
-        src0 = __lasx_xvld(src, 0);
-        src1 = __lasx_xvldx(src, stride);
-        src1 = __lasx_xvpermi_q(src1, src0, 0x20);
-        src0 = __lasx_xvshuf_b(src1, src1, mask);
-        res0 = __lasx_xvdp2_h_bu(src0, coeff_vec);
-        out  = __lasx_xvssrarni_bu_h(res0, res0, 6);
-        __lasx_xvstelm_d(out, dst, 0, 0);
-        dst += stride;
-        __lasx_xvstelm_d(out, dst, 0, 2);
-    }
-}
-
-static av_always_inline void avc_chroma_vt_8x4_lasx(const uint8_t *src, uint8_t *dst,
-                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    __m256i src0, src1, src2, src3, src4, out;
-    __m256i res0, res1;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    src0 = __lasx_xvld(src, 0);
-    src += stride;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src1, src2, src3, src4);
-    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
-              src4, src3, 0x20, src0, src1, src2, src3);
-    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
-    out  = __lasx_xvssrarni_bu_h(res1, res0, 6);
-    __lasx_xvstelm_d(out, dst, 0, 0);
-    __lasx_xvstelm_d(out, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_8x8_lasx(const uint8_t *src, uint8_t *dst,
-                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    __m256i out0, out1;
-    __m256i res0, res1, res2, res3;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    src0 = __lasx_xvld(src, 0);
-    src += stride;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src1, src2, src3, src4);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src5, src6, src7, src8);
-    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
-              src4, src3, 0x20, src0, src1, src2, src3);
-    DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
-              src8, src7, 0x20, src4, src5, src6, src7);
-    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
-              src0, src2, src4, src6);
-    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec,
-              src6, coeff_vec, res0, res1, res2, res3);
-    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
-    __lasx_xvstelm_d(out0, dst, 0, 0);
-    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(out1, dst, 0, 0);
-    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void copy_width8x8_lasx(const uint8_t *src, uint8_t *dst,
-                             ptrdiff_t stride)
-{
-    uint64_t tmp[8];
-    ptrdiff_t stride_2, stride_3, stride_4;
-    __asm__ volatile (
-        "slli.d   %[stride_2],     %[stride],     1             \n\t"
-        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
-        "slli.d   %[stride_4],     %[stride_2],   1             \n\t"
-        "ld.d     %[tmp0],         %[src],        0x0           \n\t"
-        "ldx.d    %[tmp1],         %[src],        %[stride]     \n\t"
-        "ldx.d    %[tmp2],         %[src],        %[stride_2]   \n\t"
-        "ldx.d    %[tmp3],         %[src],        %[stride_3]   \n\t"
-        "add.d    %[src],          %[src],        %[stride_4]   \n\t"
-        "ld.d     %[tmp4],         %[src],        0x0           \n\t"
-        "ldx.d    %[tmp5],         %[src],        %[stride]     \n\t"
-        "ldx.d    %[tmp6],         %[src],        %[stride_2]   \n\t"
-        "ldx.d    %[tmp7],         %[src],        %[stride_3]   \n\t"
-
-        "st.d     %[tmp0],         %[dst],        0x0           \n\t"
-        "stx.d    %[tmp1],         %[dst],        %[stride]     \n\t"
-        "stx.d    %[tmp2],         %[dst],        %[stride_2]   \n\t"
-        "stx.d    %[tmp3],         %[dst],        %[stride_3]   \n\t"
-        "add.d    %[dst],          %[dst],        %[stride_4]   \n\t"
-        "st.d     %[tmp4],         %[dst],        0x0           \n\t"
-        "stx.d    %[tmp5],         %[dst],        %[stride]     \n\t"
-        "stx.d    %[tmp6],         %[dst],        %[stride_2]   \n\t"
-        "stx.d    %[tmp7],         %[dst],        %[stride_3]   \n\t"
-        : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
-          [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
-          [tmp4]"=&r"(tmp[4]),        [tmp5]"=&r"(tmp[5]),
-          [tmp6]"=&r"(tmp[6]),        [tmp7]"=&r"(tmp[7]),
-          [dst]"+&r"(dst),            [src]"+&r"(src),
-          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
-          [stride_4]"=&r"(stride_4)
-        : [stride]"r"(stride)
-        : "memory"
-    );
-}
-
-static av_always_inline void copy_width8x4_lasx(const uint8_t *src, uint8_t *dst,
-                             ptrdiff_t stride)
-{
-    uint64_t tmp[4];
-    ptrdiff_t stride_2, stride_3;
-    __asm__ volatile (
-        "slli.d   %[stride_2],     %[stride],     1             \n\t"
-        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
-        "ld.d     %[tmp0],         %[src],        0x0           \n\t"
-        "ldx.d    %[tmp1],         %[src],        %[stride]     \n\t"
-        "ldx.d    %[tmp2],         %[src],        %[stride_2]   \n\t"
-        "ldx.d    %[tmp3],         %[src],        %[stride_3]   \n\t"
-
-        "st.d     %[tmp0],         %[dst],        0x0           \n\t"
-        "stx.d    %[tmp1],         %[dst],        %[stride]     \n\t"
-        "stx.d    %[tmp2],         %[dst],        %[stride_2]   \n\t"
-        "stx.d    %[tmp3],         %[dst],        %[stride_3]   \n\t"
-        : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
-          [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
-          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3)
-        : [stride]"r"(stride), [dst]"r"(dst), [src]"r"(src)
-        : "memory"
-    );
-}
-
-static void avc_chroma_hv_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                  uint32_t coef_hor0, uint32_t coef_hor1,
-                                  uint32_t coef_ver0, uint32_t coef_ver1,
-                                  int32_t height)
-{
-    if (4 == height) {
-        avc_chroma_hv_8x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
-                               coef_ver1);
-    } else if (8 == height) {
-        avc_chroma_hv_8x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
-                               coef_ver1);
-    }
-}
-
-static void avc_chroma_hv_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                   uint32_t coef_hor0, uint32_t coef_hor1,
-                                   uint32_t coef_ver0, uint32_t coef_ver1)
-{
-    ptrdiff_t stride_2 = stride << 1;
-    __m256i src0, src1, src2;
-    __m256i res_hz, res_vt;
-    __m256i mask;
-    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
-    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
-    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
-    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-    __m256i coeff_vt_vec  = __lasx_xvpermi_q(coeff_vt_vec1, coeff_vt_vec0, 0x02);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
-    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
-    DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src0, src1);
-    src0 = __lasx_xvpermi_q(src0, src1, 0x02);
-    res_hz = __lasx_xvdp2_h_bu(src0, coeff_hz_vec);
-    res_vt = __lasx_xvmul_h(res_hz, coeff_vt_vec);
-    res_hz = __lasx_xvpermi_q(res_hz, res_vt, 0x01);
-    res_vt = __lasx_xvadd_h(res_hz, res_vt);
-    res_vt = __lasx_xvssrarni_bu_h(res_vt, res_vt, 6);
-    __lasx_xvstelm_w(res_vt, dst, 0, 0);
-    __lasx_xvstelm_w(res_vt, dst + stride, 0, 1);
-}
-
-static void avc_chroma_hv_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                   uint32_t coef_hor0, uint32_t coef_hor1,
-                                   uint32_t coef_ver0, uint32_t coef_ver1)
-{
-    ptrdiff_t stride_2 = stride << 1;
-    ptrdiff_t stride_3 = stride_2 + stride;
-    ptrdiff_t stride_4 = stride_2 << 1;
-    __m256i src0, src1, src2, src3, src4;
-    __m256i res_hz0, res_hz1, res_vt0, res_vt1;
-    __m256i mask;
-    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
-    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
-    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
-    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
-              src, stride_4, src1, src2, src3, src4);
-    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
-              src4, src3, mask, src0, src1, src2, src3);
-    DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src0, src1);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
-    DUP2_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-    res_hz0 = __lasx_xvadd_h(res_vt0, res_vt1);
-    res_hz0 = __lasx_xvssrarni_bu_h(res_hz0, res_hz0, 6);
-    __lasx_xvstelm_w(res_hz0, dst, 0, 0);
-    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
-    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
-    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_hv_4x8_lasx(const uint8_t *src, uint8_t * dst, ptrdiff_t stride,
-                                   uint32_t coef_hor0, uint32_t coef_hor1,
-                                   uint32_t coef_ver0, uint32_t coef_ver1)
-{
-    ptrdiff_t stride_2 = stride << 1;
-    ptrdiff_t stride_3 = stride_2 + stride;
-    ptrdiff_t stride_4 = stride_2 << 1;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    __m256i res_hz0, res_hz1, res_hz2, res_hz3;
-    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
-    __m256i mask;
-    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
-    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
-    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
-    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
-              src, stride_4, src1, src2, src3, src4);
-    src += stride_4;
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
-              src, stride_4, src5, src6, src7, src8);
-    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
-              src4, src3, mask, src0, src1, src2, src3);
-    DUP4_ARG3(__lasx_xvshuf_b, src5, src4, mask, src6, src5, mask, src7, src6, mask,
-              src8, src7, mask, src4, src5, src6, src7);
-    DUP4_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src4, src6, 0x02,
-              src5, src7, 0x02, src0, src1, src4, src5);
-    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src4, coeff_hz_vec,
-              src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
-    DUP4_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
-    DUP2_ARG2(__lasx_xvadd_h, res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2);
-    res_hz0 = __lasx_xvssrarni_bu_h(res_vt2, res_vt0, 6);
-    __lasx_xvstelm_w(res_hz0, dst, 0, 0);
-    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
-    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
-    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
-    dst += stride_4;
-    __lasx_xvstelm_w(res_hz0, dst, 0, 2);
-    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 3);
-    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 6);
-    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_hv_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                  uint32_t coef_hor0, uint32_t coef_hor1,
-                                  uint32_t coef_ver0, uint32_t coef_ver1,
-                                  int32_t height)
-{
-    if (8 == height) {
-        avc_chroma_hv_4x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
-                               coef_ver1);
-    } else if (4 == height) {
-        avc_chroma_hv_4x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
-                               coef_ver1);
-    } else if (2 == height) {
-        avc_chroma_hv_4x2_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
-                               coef_ver1);
-    }
-}
-
-static void avc_chroma_hz_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                   uint32_t coeff0, uint32_t coeff1)
-{
-    __m256i src0, src1;
-    __m256i res, mask;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
-    src1 = __lasx_xvldx(src, stride);
-    src0 = __lasx_xvshuf_b(src1, src0, mask);
-    res = __lasx_xvdp2_h_bu(src0, coeff_vec);
-    res = __lasx_xvslli_h(res, 3);
-    res = __lasx_xvssrarni_bu_h(res, res, 6);
-    __lasx_xvstelm_w(res, dst, 0, 0);
-    __lasx_xvstelm_w(res, dst + stride, 0, 1);
-}
-
-static void avc_chroma_hz_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                   uint32_t coeff0, uint32_t coeff1)
-{
-    ptrdiff_t stride_2 = stride << 1;
-    ptrdiff_t stride_3 = stride_2 + stride;
-    __m256i src0, src1, src2, src3;
-    __m256i res, mask;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
-    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
-    src3 = __lasx_xvldx(src, stride_3);
-    DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src0, src2);
-    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
-    res = __lasx_xvdp2_h_bu(src0, coeff_vec);
-    res = __lasx_xvslli_h(res, 3);
-    res = __lasx_xvssrarni_bu_h(res, res, 6);
-    __lasx_xvstelm_w(res, dst, 0, 0);
-    __lasx_xvstelm_w(res, dst + stride, 0, 1);
-    __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
-    __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_hz_4x8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                   uint32_t coeff0, uint32_t coeff1)
-{
-    ptrdiff_t stride_2 = stride << 1;
-    ptrdiff_t stride_3 = stride_2 + stride;
-    ptrdiff_t stride_4 = stride_2 << 1;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
-    __m256i res0, res1, mask;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
-              src, stride_4, src1, src2, src3, src4);
-    src += stride_4;
-    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src5, src6);
-    src7 = __lasx_xvldx(src, stride_3);
-    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
-              src7, src6, mask, src0, src2, src4, src6);
-    DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src4, src6, 0x02, src0, src4);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src4, coeff_vec, res0, res1);
-    res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
-    __lasx_xvstelm_w(res0, dst, 0, 0);
-    __lasx_xvstelm_w(res0, dst + stride, 0, 1);
-    __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
-    __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
-    dst += stride_4;
-    __lasx_xvstelm_w(res0, dst, 0, 2);
-    __lasx_xvstelm_w(res0, dst + stride, 0, 3);
-    __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
-    __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_hz_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                  uint32_t coeff0, uint32_t coeff1,
-                                  int32_t height)
-{
-    if (8 == height) {
-        avc_chroma_hz_4x8_lasx(src, dst, stride, coeff0, coeff1);
-    } else if (4 == height) {
-        avc_chroma_hz_4x4_lasx(src, dst, stride, coeff0, coeff1);
-    } else if (2 == height) {
-        avc_chroma_hz_4x2_lasx(src, dst, stride, coeff0, coeff1);
-    }
-}
-
-static void avc_chroma_hz_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                  uint32_t coeff0, uint32_t coeff1,
-                                  int32_t height)
-{
-    if (4 == height) {
-        avc_chroma_hz_8x4_lasx(src, dst, stride, coeff0, coeff1);
-    } else if (8 == height) {
-        avc_chroma_hz_8x8_lasx(src, dst, stride, coeff0, coeff1);
-    } else {
-        avc_chroma_hz_nonmult_lasx(src, dst, stride, coeff0, coeff1, height);
-    }
-}
-
-static void avc_chroma_vt_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                   uint32_t coeff0, uint32_t coeff1)
-{
-    __m256i src0, src1, src2;
-    __m256i tmp0, tmp1;
-    __m256i res;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    src0 = __lasx_xvld(src, 0);
-    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride << 1, src1, src2);
-    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, tmp0, tmp1);
-    tmp0 = __lasx_xvilvl_d(tmp1, tmp0);
-    res  = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
-    res  = __lasx_xvslli_h(res, 3);
-    res  = __lasx_xvssrarni_bu_h(res, res, 6);
-    __lasx_xvstelm_w(res, dst, 0, 0);
-    __lasx_xvstelm_w(res, dst + stride, 0, 1);
-}
-
-static void avc_chroma_vt_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                   uint32_t coeff0, uint32_t coeff1)
-{
-    ptrdiff_t stride_2 = stride << 1;
-    ptrdiff_t stride_3 = stride_2 + stride;
-    ptrdiff_t stride_4 = stride_2 << 1;
-    __m256i src0, src1, src2, src3, src4;
-    __m256i tmp0, tmp1, tmp2, tmp3;
-    __m256i res;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    src0 = __lasx_xvld(src, 0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
-              src, stride_4, src1, src2, src3, src4);
-    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
-              tmp0, tmp1, tmp2, tmp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
-    tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
-    res = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
-    res = __lasx_xvslli_h(res, 3);
-    res = __lasx_xvssrarni_bu_h(res, res, 6);
-    __lasx_xvstelm_w(res, dst, 0, 0);
-    __lasx_xvstelm_w(res, dst + stride, 0, 1);
-    __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
-    __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_vt_4x8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                   uint32_t coeff0, uint32_t coeff1)
-{
-    ptrdiff_t stride_2 = stride << 1;
-    ptrdiff_t stride_3 = stride_2 + stride;
-    ptrdiff_t stride_4 = stride_2 << 1;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    __m256i res0, res1;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    src0 = __lasx_xvld(src, 0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
-              src, stride_4, src1, src2, src3, src4);
-    src += stride_4;
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
-              src, stride_4, src5, src6, src7, src8);
-    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
-              tmp0, tmp1, tmp2, tmp3);
-    DUP4_ARG2(__lasx_xvilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
-              tmp4, tmp5, tmp6, tmp7);
-    DUP4_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
-              tmp0, tmp2, tmp4, tmp6);
-    tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
-    tmp4 = __lasx_xvpermi_q(tmp4, tmp6, 0x02);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, coeff_vec, tmp4, coeff_vec, res0, res1);
-    res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
-    __lasx_xvstelm_w(res0, dst, 0, 0);
-    __lasx_xvstelm_w(res0, dst + stride, 0, 1);
-    __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
-    __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
-    dst += stride_4;
-    __lasx_xvstelm_w(res0, dst, 0, 2);
-    __lasx_xvstelm_w(res0, dst + stride, 0, 3);
-    __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
-    __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_vt_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                  uint32_t coeff0, uint32_t coeff1,
-                                  int32_t height)
-{
-    if (8 == height) {
-        avc_chroma_vt_4x8_lasx(src, dst, stride, coeff0, coeff1);
-    } else if (4 == height) {
-        avc_chroma_vt_4x4_lasx(src, dst, stride, coeff0, coeff1);
-    } else if (2 == height) {
-        avc_chroma_vt_4x2_lasx(src, dst, stride, coeff0, coeff1);
-    }
-}
-
-static void avc_chroma_vt_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                                  uint32_t coeff0, uint32_t coeff1,
-                                  int32_t height)
-{
-    if (4 == height) {
-        avc_chroma_vt_8x4_lasx(src, dst, stride, coeff0, coeff1);
-    } else if (8 == height) {
-        avc_chroma_vt_8x8_lasx(src, dst, stride, coeff0, coeff1);
-    }
-}
-
-static void copy_width4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                             int32_t height)
-{
-    uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
-
-    if (8 == height) {
-        ptrdiff_t stride_2, stride_3, stride_4;
-
-        __asm__ volatile (
-        "slli.d   %[stride_2],     %[stride],     1             \n\t"
-        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
-        "slli.d   %[stride_4],     %[stride_2],   1             \n\t"
-        "ld.wu    %[tp0],          %[src],        0             \n\t"
-        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
-        "ldx.wu   %[tp2],          %[src],        %[stride_2]   \n\t"
-        "ldx.wu   %[tp3],          %[src],        %[stride_3]   \n\t"
-        "add.d    %[src],          %[src],        %[stride_4]   \n\t"
-        "ld.wu    %[tp4],          %[src],        0             \n\t"
-        "ldx.wu   %[tp5],          %[src],        %[stride]     \n\t"
-        "ldx.wu   %[tp6],          %[src],        %[stride_2]   \n\t"
-        "ldx.wu   %[tp7],          %[src],        %[stride_3]   \n\t"
-        "st.w     %[tp0],          %[dst],        0             \n\t"
-        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
-        "stx.w    %[tp2],          %[dst],        %[stride_2]   \n\t"
-        "stx.w    %[tp3],          %[dst],        %[stride_3]   \n\t"
-        "add.d    %[dst],          %[dst],        %[stride_4]   \n\t"
-        "st.w     %[tp4],          %[dst],        0             \n\t"
-        "stx.w    %[tp5],          %[dst],        %[stride]     \n\t"
-        "stx.w    %[tp6],          %[dst],        %[stride_2]   \n\t"
-        "stx.w    %[tp7],          %[dst],        %[stride_3]   \n\t"
-        : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), [stride_4]"+&r"(stride_4),
-          [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
-          [tp2]"+&r"(tp2), [tp3]"+&r"(tp3), [tp4]"+&r"(tp4), [tp5]"+&r"(tp5),
-          [tp6]"+&r"(tp6), [tp7]"+&r"(tp7)
-        : [stride]"r"(stride)
-        : "memory"
-        );
-    } else if (4 == height) {
-        ptrdiff_t stride_2, stride_3;
-
-        __asm__ volatile (
-        "slli.d   %[stride_2],     %[stride],     1             \n\t"
-        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
-        "ld.wu    %[tp0],          %[src],        0             \n\t"
-        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
-        "ldx.wu   %[tp2],          %[src],        %[stride_2]   \n\t"
-        "ldx.wu   %[tp3],          %[src],        %[stride_3]   \n\t"
-        "st.w     %[tp0],          %[dst],        0             \n\t"
-        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
-        "stx.w    %[tp2],          %[dst],        %[stride_2]   \n\t"
-        "stx.w    %[tp3],          %[dst],        %[stride_3]   \n\t"
-        : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3),
-          [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
-          [tp2]"+&r"(tp2), [tp3]"+&r"(tp3)
-        : [stride]"r"(stride)
-        : "memory"
-        );
-    } else if (2 == height) {
-        __asm__ volatile (
-        "ld.wu    %[tp0],          %[src],        0             \n\t"
-        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
-        "st.w     %[tp0],          %[dst],        0             \n\t"
-        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
-        : [tp0]"+&r"(tp0), [tp1]"+&r"(tp1)
-        : [src]"r"(src), [dst]"r"(dst), [stride]"r"(stride)
-        : "memory"
-        );
-    }
-}
-
-static void copy_width8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                             int32_t height)
-{
-    if (8 == height) {
-        copy_width8x8_lasx(src, dst, stride);
-    } else if (4 == height) {
-        copy_width8x4_lasx(src, dst, stride);
-    }
-}
-
-void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
-                                 int height, int x, int y)
-{
-    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
-    if(x && y) {
-        avc_chroma_hv_4w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
-    } else if (x) {
-        avc_chroma_hz_4w_lasx(src, dst, stride, x, (8 - x), height);
-    } else if (y) {
-        avc_chroma_vt_4w_lasx(src, dst, stride, y, (8 - y), height);
-    } else {
-        copy_width4_lasx(src, dst, stride, height);
-    }
-}
-
-void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
-                                 int height, int x, int y)
-{
-    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
-    if (!(x || y)) {
-        copy_width8_lasx(src, dst, stride, height);
-    } else if (x && y) {
-        avc_chroma_hv_8w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
-    } else if (x) {
-        avc_chroma_hz_8w_lasx(src, dst, stride, x, (8 - x), height);
-    } else {
-        avc_chroma_vt_8w_lasx(src, dst, stride, y, (8 - y), height);
-    }
-}
-
-static av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(const uint8_t *src,
-                             uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
-                             uint32_t coef_hor1, uint32_t coef_ver0,
-                             uint32_t coef_ver1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i tp0, tp1, tp2, tp3;
-    __m256i src0, src1, src2, src3, src4, out;
-    __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
-    __m256i mask;
-    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
-    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
-    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
-    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
-              src1, src2, src3, src4);
-    DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
-    src0 = __lasx_xvshuf_b(src0, src0, mask);
-    DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
-    res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
-    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
-    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
-    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
-    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
-    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
-    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
-    out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    out = __lasx_xvavgr_bu(out, tp0);
-    __lasx_xvstelm_d(out, dst, 0, 0);
-    __lasx_xvstelm_d(out, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(const uint8_t *src,
-                             uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
-                             uint32_t coef_hor1, uint32_t coef_ver0,
-                             uint32_t coef_ver1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    __m256i out0, out1;
-    __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
-    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
-    __m256i mask;
-    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
-    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
-    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
-    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-
-    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
-    src += stride;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src1, src2, src3, src4);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src5, src6, src7, src8);
-    DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
-              src8, src7, 0x20, src1, src3, src5, src7);
-    src0 = __lasx_xvshuf_b(src0, src0, mask);
-    DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
-              src7, mask, src1, src3, src5, src7);
-    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
-              coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
-    res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
-    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
-    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
-    res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
-    res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
-    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
-    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
-    res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
-    res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
-    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
-    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
-    res_vt2 = __lasx_xvmadd_h(res_vt2, res_hz2, coeff_vt_vec1);
-    res_vt3 = __lasx_xvmadd_h(res_vt3, res_hz3, coeff_vt_vec1);
-    DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6,
-              out0, out1);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    dst += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    dst -= stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    out0 = __lasx_xvavgr_bu(out0, dst0);
-    out1 = __lasx_xvavgr_bu(out1, dst1);
-    __lasx_xvstelm_d(out0, dst, 0, 0);
-    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(out1, dst, 0, 0);
-    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_and_aver_dst_8x4_lasx(const uint8_t *src,
-                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
-                             uint32_t coeff1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    __m256i tp0, tp1, tp2, tp3;
-    __m256i src0, src1, src2, src3, out;
-    __m256i res0, res1;
-    __m256i mask;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    mask = __lasx_xvld(chroma_mask_arr, 0);
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src0, src1, src2, src3);
-    DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
-    DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
-    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    out = __lasx_xvavgr_bu(out, tp0);
-    __lasx_xvstelm_d(out, dst, 0, 0);
-    __lasx_xvstelm_d(out, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(const uint8_t *src,
-                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
-                             uint32_t coeff1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
-    __m256i out0, out1;
-    __m256i res0, res1, res2, res3;
-    __m256i mask;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    mask = __lasx_xvld(chroma_mask_arr, 0);
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src0, src1, src2, src3);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src4, src5, src6, src7);
-    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
-              src7, src6, 0x20, src0, src2, src4, src6);
-    DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4,
-              mask, src6, src6, mask, src0, src2, src4, src6);
-    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
-              coeff_vec, res0, res1, res2, res3);
-    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    dst += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    dst -= stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    out0 = __lasx_xvavgr_bu(out0, dst0);
-    out1 = __lasx_xvavgr_bu(out1, dst1);
-    __lasx_xvstelm_d(out0, dst, 0, 0);
-    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(out1, dst, 0, 0);
-    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_and_aver_dst_8x4_lasx(const uint8_t *src,
-                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
-                             uint32_t coeff1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i tp0, tp1, tp2, tp3;
-    __m256i src0, src1, src2, src3, src4, out;
-    __m256i res0, res1;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    src0 = __lasx_xvld(src, 0);
-    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
-              src1, src2, src3, src4);
-    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
-              src4, src3, 0x20, src0, src1, src2, src3);
-    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
-    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
-    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    out = __lasx_xvavgr_bu(out, tp0);
-    __lasx_xvstelm_d(out, dst, 0, 0);
-    __lasx_xvstelm_d(out, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(const uint8_t *src,
-                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
-                             uint32_t coeff1)
-{
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    __m256i out0, out1;
-    __m256i res0, res1, res2, res3;
-    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
-    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
-    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
-    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-    src0 = __lasx_xvld(src, 0);
-    src += stride;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src1, src2, src3, src4);
-    src += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
-              src5, src6, src7, src8);
-    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
-              src4, src3, 0x20, src0, src1, src2, src3);
-    DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
-              src8, src7, 0x20, src4, src5, src6, src7);
-    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
-              src0, src2, src4, src6);
-    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
-              coeff_vec, res0, res1, res2, res3);
-    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    dst += stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
-              tp0, tp1, tp2, tp3);
-    dst -= stride_4x;
-    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
-    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
-    out0 = __lasx_xvavgr_bu(out0, dst0);
-    out1 = __lasx_xvavgr_bu(out1, dst1);
-    __lasx_xvstelm_d(out0, dst, 0, 0);
-    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
-    dst += stride_4x;
-    __lasx_xvstelm_d(out1, dst, 0, 0);
-    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
-    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
-    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avg_width8x8_lasx(const uint8_t *src, uint8_t *dst,
-                                               ptrdiff_t stride)
-{
-    __m256i src0, src1, src2, src3;
-    __m256i dst0, dst1, dst2, dst3;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-    ptrdiff_t stride_4x = stride << 2;
-
-    src0 = __lasx_xvldrepl_d(src, 0);
-    src1 = __lasx_xvldrepl_d(src + stride, 0);
-    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
-    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
-    dst0 = __lasx_xvldrepl_d(dst, 0);
-    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
-    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
-    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
-    src0 = __lasx_xvpackev_d(src1,src0);
-    src2 = __lasx_xvpackev_d(src3,src2);
-    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
-    dst0 = __lasx_xvpackev_d(dst1,dst0);
-    dst2 = __lasx_xvpackev_d(dst3,dst2);
-    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
-    dst0 = __lasx_xvavgr_bu(src0, dst0);
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-
-    src += stride_4x;
-    dst += stride_4x;
-    src0 = __lasx_xvldrepl_d(src, 0);
-    src1 = __lasx_xvldrepl_d(src + stride, 0);
-    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
-    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
-    dst0 = __lasx_xvldrepl_d(dst, 0);
-    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
-    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
-    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
-    src0 = __lasx_xvpackev_d(src1,src0);
-    src2 = __lasx_xvpackev_d(src3,src2);
-    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
-    dst0 = __lasx_xvpackev_d(dst1,dst0);
-    dst2 = __lasx_xvpackev_d(dst3,dst2);
-    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
-    dst0 = __lasx_xvavgr_bu(src0, dst0);
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avg_width8x4_lasx(const uint8_t *src, uint8_t *dst,
-                                               ptrdiff_t stride)
-{
-    __m256i src0, src1, src2, src3;
-    __m256i dst0, dst1, dst2, dst3;
-    ptrdiff_t stride_2x = stride << 1;
-    ptrdiff_t stride_3x = stride_2x + stride;
-
-    src0 = __lasx_xvldrepl_d(src, 0);
-    src1 = __lasx_xvldrepl_d(src + stride, 0);
-    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
-    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
-    dst0 = __lasx_xvldrepl_d(dst, 0);
-    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
-    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
-    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
-    src0 = __lasx_xvpackev_d(src1,src0);
-    src2 = __lasx_xvpackev_d(src3,src2);
-    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
-    dst0 = __lasx_xvpackev_d(dst1,dst0);
-    dst2 = __lasx_xvpackev_d(dst3,dst2);
-    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
-    dst0 = __lasx_xvavgr_bu(src0, dst0);
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
-    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
-    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static void avc_chroma_hv_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
-                                               ptrdiff_t stride,
-                                               uint32_t coef_hor0,
-                                               uint32_t coef_hor1,
-                                               uint32_t coef_ver0,
-                                               uint32_t coef_ver1,
-                                               int32_t height)
-{
-    if (4 == height) {
-        avc_chroma_hv_and_aver_dst_8x4_lasx(src, dst, stride, coef_hor0,
-                                            coef_hor1, coef_ver0, coef_ver1);
-    } else if (8 == height) {
-        avc_chroma_hv_and_aver_dst_8x8_lasx(src, dst, stride, coef_hor0,
-                                            coef_hor1, coef_ver0, coef_ver1);
-    }
-}
-
-static void avc_chroma_hz_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
-                                               ptrdiff_t stride, uint32_t coeff0,
-                                               uint32_t coeff1, int32_t height)
-{
-    if (4 == height) {
-        avc_chroma_hz_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1);
-    } else if (8 == height) {
-        avc_chroma_hz_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1);
-    }
-}
-
-static void avc_chroma_vt_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
-                                               ptrdiff_t stride, uint32_t coeff0,
-                                               uint32_t coeff1, int32_t height)
-{
-    if (4 == height) {
-        avc_chroma_vt_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1);
-    } else if (8 == height) {
-        avc_chroma_vt_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1);
-    }
-}
-
-static void avg_width8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
-                            int32_t height)
-{
-    if (8 == height) {
-        avg_width8x8_lasx(src, dst, stride);
-    } else if (4 == height) {
-        avg_width8x4_lasx(src, dst, stride);
-    }
-}
-
-void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
-                                 int height, int x, int y)
-{
-    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
-    if (!(x || y)) {
-        avg_width8_lasx(src, dst, stride, height);
-    } else if (x && y) {
-        avc_chroma_hv_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), y,
-                                           (8 - y), height);
-    } else if (x) {
-        avc_chroma_hz_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), height);
-    } else {
-        avc_chroma_vt_and_aver_dst_8w_lasx(src, dst, stride, y, (8 - y), height);
-    }
-}
diff --git a/libavcodec/loongarch/h264chroma_lasx.h b/libavcodec/loongarch/h264chroma_lasx.h
deleted file mode 100644
index 633752035e..0000000000
--- a/libavcodec/loongarch/h264chroma_lasx.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_LOONGARCH_H264CHROMA_LASX_H
-#define AVCODEC_LOONGARCH_H264CHROMA_LASX_H
-
-#include <stdint.h>
-#include <stddef.h>
-#include "libavcodec/h264.h"
-
-void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
-        int h, int x, int y);
-void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
-        int h, int x, int y);
-void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
-        int h, int x, int y);
-
-#endif /* AVCODEC_LOONGARCH_H264CHROMA_LASX_H */
diff --git a/libavcodec/loongarch/h264chroma_loongarch.h b/libavcodec/loongarch/h264chroma_loongarch.h
new file mode 100644
index 0000000000..26a7155389
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma_loongarch.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavcodec/h264.h"
+
+void ff_put_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
+                                long int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
+                                long int stride, int h, int x, int y);
+void ff_put_h264_chroma_mc4_lsx(unsigned char *dst, const unsigned char *src,
+                                long int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc4_lasx(unsigned char *dst, const unsigned char *src,
+                                 long int stride, int h, int x, int y);
+void ff_put_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
+                                 long int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
+                                 long int stride, int h, int x, int y);
+
+#endif /* AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H */
diff --git a/libavcodec/loongarch/h264intrapred.S b/libavcodec/loongarch/h264intrapred.S
new file mode 100644
index 0000000000..a03f467b6e
--- /dev/null
+++ b/libavcodec/loongarch/h264intrapred.S
@@ -0,0 +1,299 @@
+/*
+ * Loongson LSX optimized h264intrapred
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+const shufa
+.byte 6, 5, 4, 3, 2, 1, 0
+endconst
+
+const mulk
+.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
+endconst
+
+const mulh
+.byte 0, 0, 1, 0,  2,  0,  3, 0,  4, 0,  5, 0,  6, 0,  7, 0
+.byte 8, 0, 9, 0, 10,  0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0
+endconst
+
+.macro PRED16X16_PLANE
+    slli.d        t6,    a1,    1
+    slli.d        t4,    a1,    3
+    addi.d        t0,    a0,    7
+    sub.d         t0,    t0,    a1
+    add.d         t1,    a0,    t4
+    addi.d        t1,    t1,    -1
+    sub.d         t2,    t1,    t6
+
+    ld.bu         t3,    t0,    1
+    ld.bu         t4,    t0,    -1
+    ld.bu         t5,    t1,    0
+    ld.bu         t7,    t2,    0
+    sub.d         t3,    t3,    t4
+    sub.d         t4,    t5,    t7
+
+    la.local      t5,    mulk
+    vld           vr0,   t5,    0
+    fld.d         f1,    t0,    2
+    fld.d         f2,    t0,    -8
+    la.local      t5,    shufa
+    fld.d         f3,    t5,    0
+    vshuf.b       vr2,   vr2,   vr2,   vr3
+    vilvl.b       vr1,   vr1,   vr2
+    vhsubw.hu.bu  vr1,   vr1,   vr1
+    vmul.h        vr0,   vr0,   vr1
+    vhaddw.w.h    vr1,   vr0,   vr0
+    vhaddw.d.w    vr0,   vr1,   vr1
+    vhaddw.q.d    vr1,   vr0,   vr0
+    vpickve2gr.w  t5,    vr1,   0
+    add.d         t3,    t3,    t5
+//2
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ldx.bu        t7,    t1,    a1
+    sub.d         t5,    t7,    t8
+    slli.d        t5,    t5,    1
+
+//3&4
+    add.d         t1,    t1,    t6
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ld.bu         t7,    t1,    0
+    sub.d         t7,    t7,    t8
+    slli.d        t8,    t7,    1
+    add.d         t7,    t7,    t8
+    add.d         t5,    t5,    t7
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ldx.bu        t7,    t1,    a1
+    sub.d         t7,    t7,    t8
+    slli.d        t7,    t7,    2
+    add.d         t5,    t5,    t7
+
+//5&6
+    add.d         t1,    t1,    t6
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ld.bu         t7,    t1,    0
+    sub.d         t7,    t7,    t8
+    slli.d        t8,    t7,    2
+    add.d         t7,    t7,    t8
+    add.d         t5,    t5,    t7
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ldx.bu        t7,    t1,    a1
+    sub.d         t7,    t7,    t8
+    slli.d        t8,    t7,    1
+    slli.d        t7,    t7,    2
+    add.d         t7,    t7,    t8
+    add.d         t5,    t5,    t7
+
+//7&8
+    add.d         t1,    t1,    t6
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ld.bu         t7,    t1,    0
+    sub.d         t7,    t7,    t8
+    slli.d        t8,    t7,    3
+    sub.d         t7,    t8,    t7
+    add.d         t5,    t5,    t7
+    sub.d         t2,    t2,    a1
+    ld.bu         t8,    t2,    0
+    ldx.bu        t7,    t1,    a1
+    sub.d         t7,    t7,    t8
+    slli.d        t7,    t7,    3
+    add.d         t5,    t5,    t7
+    add.d         t4,    t4,    t5
+    add.d         t1,    t1,    a1
+.endm
+
+.macro PRED16X16_PLANE_END
+    ld.bu         t7,    t1,    0
+    ld.bu         t8,    t2,    16
+    add.d         t5,    t7,    t8
+    addi.d        t5,    t5,    1
+    slli.d        t5,    t5,    4
+    add.d         t7,    t3,    t4
+    slli.d        t8,    t7,    3
+    sub.d         t7,    t8,    t7
+    sub.d         t5,    t5,    t7
+
+    la.local      t8,    mulh
+    vld           vr3,   t8,    0
+    slli.d        t8,    t3,    3
+    vreplgr2vr.h  vr4,   t3
+    vreplgr2vr.h  vr9,   t8
+    vmul.h        vr5,   vr3,   vr4
+
+.rept 16
+    move          t7,    t5
+    add.d         t5,    t5,    t4
+    vreplgr2vr.h  vr6,   t7
+    vadd.h        vr7,   vr6,   vr5
+    vadd.h        vr8,   vr9,   vr7
+    vssrani.bu.h  vr8,   vr7,   5
+    vst           vr8,   a0,    0
+    add.d         a0,    a0,    a1
+.endr
+.endm
+
+.macro PRED16X16_PLANE_END_LASX
+    ld.bu         t7,    t1,    0
+    ld.bu         t8,    t2,    16
+    add.d         t5,    t7,    t8
+    addi.d        t5,    t5,    1
+    slli.d        t5,    t5,    4
+    add.d         t7,    t3,    t4
+    slli.d        t8,    t7,    3
+    sub.d         t7,    t8,    t7
+    sub.d         t5,    t5,    t7
+
+    la.local      t8,    mulh
+    xvld          xr3,   t8,    0
+    xvreplgr2vr.h xr4,   t3
+    xvmul.h       xr5,   xr3,   xr4
+
+.rept 8
+    move          t7,    t5
+    add.d         t5,    t5,    t4
+    xvreplgr2vr.h xr6,   t7
+    xvreplgr2vr.h xr8,   t5
+    add.d         t5,    t5,    t4
+    xvadd.h       xr7,   xr6,   xr5
+    xvadd.h       xr9,   xr8,   xr5
+
+    xvssrani.bu.h xr9,   xr7,   5
+    vstelm.d      vr9,   a0,    0,    0
+    xvstelm.d     xr9,   a0,    8,    2
+    add.d         a0,    a0,    a1
+    vstelm.d      vr9,   a0,    0,    1
+    xvstelm.d     xr9,   a0,    8,    3
+    add.d         a0,    a0,    a1
+.endr
+.endm
+
+/* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_h264_8_lsx
+    PRED16X16_PLANE
+
+    slli.d        t7,    t3,    2
+    add.d         t3,    t3,    t7
+    addi.d        t3,    t3,    32
+    srai.d        t3,    t3,    6
+    slli.d        t7,    t4,    2
+    add.d         t4,    t4,    t7
+    addi.d        t4,    t4,    32
+    srai.d        t4,    t4,    6
+
+    PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_rv40_8_lsx
+    PRED16X16_PLANE
+
+    srai.d        t7,    t3,    2
+    add.d         t3,    t3,    t7
+    srai.d        t3,    t3,    4
+    srai.d        t7,    t4,    2
+    add.d         t4,    t4,    t7
+    srai.d        t4,    t4,    4
+
+    PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_svq3_8_lsx
+    PRED16X16_PLANE
+
+    li.d          t6,    4
+    li.d          t7,    5
+    li.d          t8,    16
+    div.d         t3,    t3,    t6
+    mul.d         t3,    t3,    t7
+    div.d         t3,    t3,    t8
+    div.d         t4,    t4,    t6
+    mul.d         t4,    t4,    t7
+    div.d         t4,    t4,    t8
+    move          t7,    t3
+    move          t3,    t4
+    move          t4,    t7
+
+    PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_h264_8_lasx
+    PRED16X16_PLANE
+
+    slli.d        t7,    t3,    2
+    add.d         t3,    t3,    t7
+    addi.d        t3,    t3,    32
+    srai.d        t3,    t3,    6
+    slli.d        t7,    t4,    2
+    add.d         t4,    t4,    t7
+    addi.d        t4,    t4,    32
+    srai.d        t4,    t4,    6
+
+    PRED16X16_PLANE_END_LASX
+endfunc
+
+/* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_rv40_8_lasx
+    PRED16X16_PLANE
+
+    srai.d        t7,    t3,    2
+    add.d         t3,    t3,    t7
+    srai.d        t3,    t3,    4
+    srai.d        t7,    t4,    2
+    add.d         t4,    t4,    t7
+    srai.d        t4,    t4,    4
+
+    PRED16X16_PLANE_END_LASX
+endfunc
+
+/* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_svq3_8_lasx
+    PRED16X16_PLANE
+
+    li.d          t5,    4
+    li.d          t7,    5
+    li.d          t8,    16
+    div.d         t3,    t3,    t5
+    mul.d         t3,    t3,    t7
+    div.d         t3,    t3,    t8
+    div.d         t4,    t4,    t5
+    mul.d         t4,    t4,    t7
+    div.d         t4,    t4,    t8
+    move          t7,    t3
+    move          t3,    t4
+    move          t4,    t7
+
+    PRED16X16_PLANE_END_LASX
+endfunc
-- 
2.20.1
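
For reference, a scalar sketch of the Intra_16x16 plane prediction that the
PRED16X16_PLANE / PRED16X16_PLANE_END macros above implement (illustrative C
only, not part of the patch; the function name is an assumption).  This is the
H.264 variant; the rv40 and svq3 functions differ only in how the final
gradients are derived from H and V (and svq3 swaps them):

#include <stdint.h>
#include <stddef.h>

static void pred16x16_plane_ref(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *top  = src - stride;  /* row above the block      */
    const uint8_t *left = src - 1;       /* column left of the block */
    int H = 0, V = 0;

    /* Weighted sums of differences across the centre of the top row
     * and the left column (top[-1]/left[-stride] is the corner). */
    for (int i = 0; i < 8; i++) {
        H += (i + 1) * (top[8 + i]             - top[6 - i]);
        V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
    }

    int a = 16 * (top[15] + left[15 * stride]);
    int b = (5 * H + 32) >> 6;   /* horizontal gradient */
    int c = (5 * V + 32) >> 6;   /* vertical gradient   */

    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++) {
            int v = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
            src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }
}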

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] [PATCH v1 4/6] avcodec/la: Add LSX optimization for h264 qpel.
  2023-05-04  8:49 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
                   ` (2 preceding siblings ...)
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 3/6] avcodec/la: Add LSX optimization for h264 chroma and intrapred Hao Chen
@ 2023-05-04  8:49 ` Hao Chen
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 5/6] swscale/la: Optimize the functions of the swscale series with lsx Hao Chen
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 6/6] swscale/la: Add following builtin optimized functions Hao Chen
  5 siblings, 0 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-04  8:49 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: yuanhecai

From: yuanhecai <yuanhecai@loongson.cn>

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 214fps
after:  274fps
---
---
 libavcodec/loongarch/Makefile                 |    2 +
 libavcodec/loongarch/h264qpel.S               | 3635 +++++++++++++++++
 .../loongarch/h264qpel_init_loongarch.c       |   74 +-
 libavcodec/loongarch/h264qpel_lasx.c          |  401 +-
 libavcodec/loongarch/h264qpel_lasx.h          |  158 -
 libavcodec/loongarch/h264qpel_loongarch.h     |  312 ++
 libavcodec/loongarch/h264qpel_lsx.c           |  488 +++
 7 files changed, 4511 insertions(+), 559 deletions(-)
 create mode 100644 libavcodec/loongarch/h264qpel.S
 delete mode 100644 libavcodec/loongarch/h264qpel_lasx.h
 create mode 100644 libavcodec/loongarch/h264qpel_loongarch.h
 create mode 100644 libavcodec/loongarch/h264qpel_lsx.c
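
For reference, a scalar sketch of the core operation the new assembly builds on
(illustrative C only, not part of the patch; the helper name and row-wise form
are assumptions made here for readability).  The mc00 cases are plain full-pel
copies (averaged with dst for the avg variants), while the fractional positions
use the standard H.264 6-tap (1, -5, 20, 20, -5, 1) half-pel filter that
LSX_QPEL8_H_LOWPASS / LSX_QPEL8_V_LOWPASS vectorize, with +16 rounding, a right
shift by 5 and a clip to 8 bits; the quarter-pel variants (mc10, mc30, mc31,
...) then average the filtered result with the nearest full- or half-pel
samples:

#include <stdint.h>

/* 6-tap H.264 luma half-pel filter applied horizontally to one row of
 * 16 output samples; src is read from src[-2] up to src[18]. */
static void qpel16_h_lowpass_row(uint8_t *dst, const uint8_t *src)
{
    for (int x = 0; x < 16; x++) {
        int v = src[x - 2] - 5 * src[x - 1] + 20 * src[x] +
                20 * src[x + 1] - 5 * src[x + 2] + src[x + 3];
        v = (v + 16) >> 5;                      /* round and scale */
        dst[x] = v < 0 ? 0 : v > 255 ? 255 : v; /* clip to 8 bits  */
    }
}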

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 6e73e1bb6a..b80ea17752 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -31,5 +31,7 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_la.o \
                                          loongarch/h264dsp.o
+LSX-OBJS-$(CONFIG_H264QPEL)           += loongarch/h264qpel.o \
+                                         loongarch/h264qpel_lsx.o
 LSX-OBJS-$(CONFIG_H264CHROMA)         += loongarch/h264chroma.o
 LSX-OBJS-$(CONFIG_H264PRED)           += loongarch/h264intrapred.o
diff --git a/libavcodec/loongarch/h264qpel.S b/libavcodec/loongarch/h264qpel.S
new file mode 100644
index 0000000000..aaf989a71b
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel.S
@@ -0,0 +1,3635 @@
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * void put_h264_qpel16_mc00(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
+ */
+function ff_put_h264_qpel16_mc00_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    slli.d        t2,     t0,     1
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a2
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+    add.d         a1,     a1,     t2
+
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    vstx          vr2,    a0,     t0
+    vstx          vr3,    a0,     t1
+    add.d         a0,     a0,     t2
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a2
+    vstx          vr6,    a0,     t0
+    vstx          vr7,    a0,     t1
+    add.d         a0,     a0,     t2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a2
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    vstx          vr2,    a0,     t0
+    vstx          vr3,    a0,     t1
+    add.d         a0,     a0,     t2
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a2
+    vstx          vr6,    a0,     t0
+    vstx          vr7,    a0,     t1
+endfunc
+
+.macro LSX_QPEL8_H_LOWPASS out0, out1
+    vbsrl.v       vr2,    vr0,    1
+    vbsrl.v       vr3,    vr1,    1
+    vbsrl.v       vr4,    vr0,    2
+    vbsrl.v       vr5,    vr1,    2
+    vbsrl.v       vr6,    vr0,    3
+    vbsrl.v       vr7,    vr1,    3
+    vbsrl.v       vr8,    vr0,    4
+    vbsrl.v       vr9,    vr1,    4
+    vbsrl.v       vr10,   vr0,    5
+    vbsrl.v       vr11,   vr1,    5
+
+    vilvl.b       vr6,    vr4,    vr6
+    vilvl.b       vr7,    vr5,    vr7
+    vilvl.b       vr8,    vr2,    vr8
+    vilvl.b       vr9,    vr3,    vr9
+    vilvl.b       vr10,   vr0,    vr10
+    vilvl.b       vr11,   vr1,    vr11
+
+    vhaddw.hu.bu  vr6,    vr6,    vr6
+    vhaddw.hu.bu  vr7,    vr7,    vr7
+    vhaddw.hu.bu  vr8,    vr8,    vr8
+    vhaddw.hu.bu  vr9,    vr9,    vr9
+    vhaddw.hu.bu  vr10,   vr10,   vr10
+    vhaddw.hu.bu  vr11,   vr11,   vr11
+
+    vmul.h        vr2,    vr6,    vr20
+    vmul.h        vr3,    vr7,    vr20
+    vmul.h        vr4,    vr8,    vr21
+    vmul.h        vr5,    vr9,    vr21
+    vssub.h       vr2,    vr2,    vr4
+    vssub.h       vr3,    vr3,    vr5
+    vsadd.h       vr2,    vr2,    vr10
+    vsadd.h       vr3,    vr3,    vr11
+    vsadd.h       \out0,  vr2,    vr22
+    vsadd.h       \out1,  vr3,    vr22
+.endm
+
+/*
+ * void put_h264_qpel16_mc10(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
+ */
+function ff_put_h264_qpel16_mc10_lsx
+    addi.d        t8,     a1,     0
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        a1,     t0,     8    // a1 = t0 + 8
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t0,     a2,     t0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vld           vr10,   t8,     0
+    vldx          vr11,   t8,     a2
+    vavgr.bu      vr0,    vr2,    vr10
+    vavgr.bu      vr1,    vr3,    vr11
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr4, vr5
+    vssrani.bu.h  vr4,    vr14,   5
+    vssrani.bu.h  vr5,    vr15,   5
+    vldx          vr12,   t8,     t1
+    vldx          vr13,   t8,     t2
+    vavgr.bu      vr2,    vr4,    vr12
+    vavgr.bu      vr3,    vr5,    vr13
+    vstx          vr2,    a0,     t1
+    vstx          vr3,    a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t8,     a2,     t8,    2
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr6, vr7
+    vssrani.bu.h  vr6,    vr16,   5
+    vssrani.bu.h  vr7,    vr17,   5
+    vld           vr14,   t8,     0
+    vldx          vr15,   t8,     a2
+    vavgr.bu      vr4,    vr6,    vr14
+    vavgr.bu      vr5,    vr7,    vr15
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr8, vr9
+    vssrani.bu.h  vr8,    vr18,   5
+    vssrani.bu.h  vr9,    vr19,   5
+    vldx          vr16,   t8,     t1
+    vldx          vr17,   t8,     t2
+    vavgr.bu      vr6,    vr8,    vr16
+    vavgr.bu      vr7,    vr9,    vr17
+    vstx          vr6,    a0,     t1
+    vstx          vr7,    a0,     t2
+
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        t8,     a2,     t8,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t0,     a2,     t0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vld           vr10,   t8,     0
+    vldx          vr11,   t8,     a2
+    vavgr.bu      vr0,    vr2,    vr10
+    vavgr.bu      vr1,    vr3,    vr11
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr4, vr5
+    vssrani.bu.h  vr4,    vr14,   5
+    vssrani.bu.h  vr5,    vr15,   5
+    vldx          vr12,   t8,     t1
+    vldx          vr13,   t8,     t2
+    vavgr.bu      vr2,    vr4,    vr12
+    vavgr.bu      vr3,    vr5,    vr13
+    vstx          vr2,    a0,     t1
+    vstx          vr3,    a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t8,     a2,     t8,    2
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr6, vr7
+    vssrani.bu.h  vr6,    vr16,   5
+    vssrani.bu.h  vr7,    vr17,   5
+    vld           vr14,   t8,     0
+    vldx          vr15,   t8,     a2
+    vavgr.bu      vr4,    vr6,    vr14
+    vavgr.bu      vr5,    vr7,    vr15
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr8, vr9
+    vssrani.bu.h  vr8,    vr18,   5
+    vssrani.bu.h  vr9,    vr19,   5
+    vldx          vr16,   t8,     t1
+    vldx          vr17,   t8,     t2
+    vavgr.bu      vr6,    vr8,    vr16
+    vavgr.bu      vr7,    vr9,    vr17
+    vstx          vr6,    a0,     t1
+    vstx          vr7,    a0,     t2
+endfunc
+
+/*
+ * void put_h264_qpel16_mc20(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
+ */
+function ff_put_h264_qpel16_mc20_lsx
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        a1,     t0,     8    // a1 = t0 + 8
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t0,     a2,     t0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vst           vr2,    a0,     0
+    vstx          vr3,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr4, vr5
+    vssrani.bu.h  vr4,    vr14,   5
+    vssrani.bu.h  vr5,    vr15,   5
+    vstx          vr4,    a0,     t1
+    vstx          vr5,    a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr6, vr7
+    vssrani.bu.h  vr6,    vr16,   5
+    vssrani.bu.h  vr7,    vr17,   5
+    vst           vr6,    a0,     0
+    vstx          vr7,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr8, vr9
+    vssrani.bu.h  vr8,    vr18,   5
+    vssrani.bu.h  vr9,    vr19,   5
+    vstx          vr8,    a0,     t1
+    vstx          vr9,    a0,     t2
+
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t0,     a2,     t0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vst           vr2,    a0,     0
+    vstx          vr3,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr4, vr5
+    vssrani.bu.h  vr4,    vr14,   5
+    vssrani.bu.h  vr5,    vr15,   5
+    vstx          vr4,    a0,     t1
+    vstx          vr5,    a0,     t2
+
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr6, vr7
+    vssrani.bu.h  vr6,    vr16,   5
+    vssrani.bu.h  vr7,    vr17,   5
+    vst           vr6,    a0,     0
+    vstx          vr7,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr8, vr9
+    vssrani.bu.h  vr8,    vr18,   5
+    vssrani.bu.h  vr9,    vr19,   5
+    vstx          vr8,    a0,     t1
+    vstx          vr9,    a0,     t2
+endfunc
+
+/*
+ * void put_h264_qpel16_mc30(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
+ */
+function ff_put_h264_qpel16_mc30_lsx
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        t8,     a1,     1    // t8 = src + 1
+    addi.d        a1,     t0,     8    // a1 = t0 + 8
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t0,     a2,     t0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vld           vr10,   t8,     0
+    vldx          vr11,   t8,     a2
+    vavgr.bu      vr0,    vr2,    vr10
+    vavgr.bu      vr1,    vr3,    vr11
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr4, vr5
+    vssrani.bu.h  vr4,    vr14,   5
+    vssrani.bu.h  vr5,    vr15,   5
+    vldx          vr12,   t8,     t1
+    vldx          vr13,   t8,     t2
+    vavgr.bu      vr2,    vr4,    vr12
+    vavgr.bu      vr3,    vr5,    vr13
+    vstx          vr2,    a0,     t1
+    vstx          vr3,    a0,     t2
+
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t8,     a2,     t8,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr6, vr7
+    vssrani.bu.h  vr6,    vr16,   5
+    vssrani.bu.h  vr7,    vr17,   5
+    vld           vr14,   t8,     0
+    vldx          vr15,   t8,     a2
+    vavgr.bu      vr4,    vr6,    vr14
+    vavgr.bu      vr5,    vr7,    vr15
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr8, vr9
+    vssrani.bu.h  vr8,    vr18,   5
+    vssrani.bu.h  vr9,    vr19,   5
+    vldx          vr16,   t8,     t1
+    vldx          vr17,   t8,     t2
+    vavgr.bu      vr6,    vr8,    vr16
+    vavgr.bu      vr7,    vr9,    vr17
+    vstx          vr6,    a0,     t1
+    vstx          vr7,    a0,     t2
+
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t8,     a2,     t8,    2
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t0,     a2,     t0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vld           vr10,   t8,     0
+    vldx          vr11,   t8,     a2
+    vavgr.bu      vr0,    vr2,    vr10
+    vavgr.bu      vr1,    vr3,    vr11
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr4, vr5
+    vssrani.bu.h  vr4,    vr14,   5
+    vssrani.bu.h  vr5,    vr15,   5
+    vldx          vr12,   t8,     t1
+    vldx          vr13,   t8,     t2
+    vavgr.bu      vr2,    vr4,    vr12
+    vavgr.bu      vr3,    vr5,    vr13
+    vstx          vr2,    a0,     t1
+    vstx          vr3,    a0,     t2
+
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t8,     a2,     t8,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr6, vr7
+    vssrani.bu.h  vr6,    vr16,   5
+    vssrani.bu.h  vr7,    vr17,   5
+    vld           vr14,   t8,     0
+    vldx          vr15,   t8,     a2
+    vavgr.bu      vr4,    vr6,    vr14
+    vavgr.bu      vr5,    vr7,    vr15
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr8, vr9
+    vssrani.bu.h  vr8,    vr18,   5
+    vssrani.bu.h  vr9,    vr19,   5
+    vldx          vr16,   t8,     t1
+    vldx          vr17,   t8,     t2
+    vavgr.bu      vr6,    vr8,    vr16
+    vavgr.bu      vr7,    vr9,    vr17
+    vstx          vr6,    a0,     t1
+    vstx          vr7,    a0,     t2
+endfunc
+
+.macro LSX_QPEL8_V_LOWPASS in0, in1, in2, in3, in4, in5, in6
+    vilvl.b       vr7,    \in3,   \in2
+    vilvl.b       vr8,    \in4,   \in3
+    vilvl.b       vr9,    \in4,   \in1
+    vilvl.b       vr10,   \in5,   \in2
+    vilvl.b       vr11,   \in5,   \in0
+    vilvl.b       vr12,   \in6,   \in1
+
+    vhaddw.hu.bu  vr7,    vr7,    vr7
+    vhaddw.hu.bu  vr8,    vr8,    vr8
+    vhaddw.hu.bu  vr9,    vr9,    vr9
+    vhaddw.hu.bu  vr10,   vr10,   vr10
+    vhaddw.hu.bu  vr11,   vr11,   vr11
+    vhaddw.hu.bu  vr12,   vr12,   vr12
+
+    vmul.h        vr7,    vr7,    vr20
+    vmul.h        vr8,    vr8,    vr20
+    vmul.h        vr9,    vr9,    vr21
+    vmul.h        vr10,   vr10,   vr21
+
+    vssub.h       vr7,    vr7,    vr9
+    vssub.h       vr8,    vr8,    vr10
+    vsadd.h       vr7,    vr7,    vr11
+    vsadd.h       vr8,    vr8,    vr12
+    vsadd.h       vr7,    vr7,    vr22
+    vsadd.h       vr8,    vr8,    vr22
+
+    vilvh.b       vr13,   \in3,   \in2
+    vilvh.b       vr14,   \in4,   \in3
+    vilvh.b       vr15,   \in4,   \in1
+    vilvh.b       vr16,   \in5,   \in2
+    vilvh.b       vr17,   \in5,   \in0
+    vilvh.b       vr18,   \in6,   \in1
+
+    vhaddw.hu.bu  vr13,   vr13,   vr13
+    vhaddw.hu.bu  vr14,   vr14,   vr14
+    vhaddw.hu.bu  vr15,   vr15,   vr15
+    vhaddw.hu.bu  vr16,   vr16,   vr16
+    vhaddw.hu.bu  vr17,   vr17,   vr17
+    vhaddw.hu.bu  vr18,   vr18,   vr18
+
+    vmul.h        vr13,   vr13,   vr20
+    vmul.h        vr14,   vr14,   vr20
+    vmul.h        vr15,   vr15,   vr21
+    vmul.h        vr16,   vr16,   vr21
+
+    vssub.h       vr13,   vr13,   vr15
+    vssub.h       vr14,   vr14,   vr16
+    vsadd.h       vr13,   vr13,   vr17
+    vsadd.h       vr14,   vr14,   vr18
+    vsadd.h       vr13,   vr13,   vr22
+    vsadd.h       vr14,   vr14,   vr22
+
+    vssrani.bu.h  vr13,   vr7,    5
+    vssrani.bu.h  vr14,   vr8,    5
+.endm
+
+/*
+ * void put_h264_qpel16_mc01(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
+ */
+function ff_put_h264_qpel16_mc01_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    sub.d         t2,     a1,     t0  // t2 = src - 2 * stride
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    vld           vr0,    t2,     0
+    vldx          vr1,    t2,     a2
+    vldx          vr2,    t2,     t0
+    vldx          vr3,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr4,    t2,     0
+    vldx          vr5,    t2,     a2
+    vldx          vr6,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr2,    vr13
+    vavgr.bu      vr14,   vr3,    vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 *stride
+    vld           vr1,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr4,    vr13
+    vavgr.bu      vr14,   vr5,    vr14
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr2,    t2,     a2
+    vldx          vr3,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr6,    vr13
+    vavgr.bu      vr14,   vr0,    vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr5,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr1,    vr13
+    vavgr.bu      vr14,   vr2,    vr14
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr6,    t2,     a2
+    vldx          vr0,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr1, vr2, vr3, vr4, vr5, vr6, vr0
+    vavgr.bu      vr13,   vr3,    vr13
+    vavgr.bu      vr14,   vr4,    vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr1,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr2,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr3, vr4, vr5, vr6, vr0, vr1, vr2
+    vavgr.bu      vr13,   vr5,    vr13
+    vavgr.bu      vr14,   vr6,    vr14
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr3,    t2,     a2
+    vldx          vr4,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr5, vr6, vr0, vr1, vr2, vr3, vr4
+    vavgr.bu      vr13,   vr0,    vr13
+    vavgr.bu      vr14,   vr1,    vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr5,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr6,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr2,    vr13
+    vavgr.bu      vr14,   vr3,    vr14
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+endfunc
+
+/*
+ * void put_h264_qpel16_mc11(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
+ */
+function ff_put_h264_qpel16_mc11_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        a1,     t0,     8    // a1 = t0 + 8
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t0,     a2,     t0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr1,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    add.d         t6,     t4,     zero      // t6 = src + 6 * stride
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr5,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a1,     a2,     a1,    2  // a1 = src + 8 * stride
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t0,     a2,     t0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 4 * stride
+
+    // t6 = src + 6 * stride
+    vld           vr0,    t6,     0
+    vldx          vr1,    t6,     a2
+    vldx          vr2,    t6,     t1
+    vldx          vr3,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr4,    t6,     0
+    vldx          vr5,    t6,     a2
+    vldx          vr6,    t6,     t1
+
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr1,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2    // dst = dst + 4 *stride
+
+    vldx          vr2,    t6,     a2
+    vldx          vr3,    t6,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr5,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+endfunc
+
+/*
+ * void avg_h264_qpel16_mc00(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
+ */
+function ff_avg_h264_qpel16_mc00_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    slli.d        t2,     t0,     1
+    addi.d        t3,     a0,     0
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a2
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+    add.d         a1,     a1,     t2
+
+    vld           vr8,    t3,     0
+    vldx          vr9,    t3,     a2
+    vldx          vr10,   t3,     t0
+    vldx          vr11,   t3,     t1
+    add.d         t3,     t3,     t2
+    vld           vr12,   t3,     0
+    vldx          vr13,   t3,     a2
+    vldx          vr14,   t3,     t0
+    vldx          vr15,   t3,     t1
+    add.d         t3,     t3,     t2
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    vstx          vr2,    a0,     t0
+    vstx          vr3,    a0,     t1
+    add.d         a0,     a0,     t2
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a2
+    vstx          vr6,    a0,     t0
+    vstx          vr7,    a0,     t1
+
+    add.d         a0,     a0,     t2
+
+    /* h8~h15 */
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a2
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+
+    vld           vr8,    t3,     0
+    vldx          vr9,    t3,     a2
+    vldx          vr10,   t3,     t0
+    vldx          vr11,   t3,     t1
+    add.d         t3,     t3,     t2
+    vld           vr12,   t3,     0
+    vldx          vr13,   t3,     a2
+    vldx          vr14,   t3,     t0
+    vldx          vr15,   t3,     t1
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    vstx          vr2,    a0,     t0
+    vstx          vr3,    a0,     t1
+    add.d         a0,     a0,     t2
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a2
+    vstx          vr6,    a0,     t0
+    vstx          vr7,    a0,     t1
+endfunc
+
+/*
+ * void put_h264_qpel16_mc31(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
+ */
+function ff_put_h264_qpel16_mc31_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t4,     t4,     1
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     t0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    addi.d        a1,     t0,     8
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride + 1
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr1,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    add.d         t6,     t4,     zero     // t6 = src + 6 * stride
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr5,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a1,     a2,     t0,    3  // a1 = src + 8 * stride
+    addi.d        t5,     a1,     8         // a1 = src + 8 * stride + 8
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        t5,     a2,     t5,    2
+
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 4 * stride
+
+    // t6 = src + 6 * stride + 1
+    vld           vr0,    t6,     0
+    vldx          vr1,    t6,     a2
+    vldx          vr2,    t6,     t1
+    vldx          vr3,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr4,    t6,     0
+    vldx          vr5,    t6,     a2
+    vldx          vr6,    t6,     t1
+
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr1,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2    // dst = dst + 4 * stride
+
+    vldx          vr2,    t6,     a2
+    vldx          vr3,    t6,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr5,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+endfunc
+
+/*
+ * void put_h264_qpel16_mc33(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
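+ *
+ * Quarter-pel (3,3): the horizontal half-pel plane taken at (src + stride) is
+ * rounding-averaged with the vertical half-pel plane taken at (src + 1).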
+ */
+function ff_put_h264_qpel16_mc33_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t0,     t0,     a2   // t0 = src + stride - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t4,     t4,     1
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     t0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    addi.d        a1,     t0,     8
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride + 1
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr1,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    add.d         t6,     t4,     zero     // t6 = src + 6 * stride + 1
+
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr5,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a1,     a2,     t0,    3  // a1 = src + 8 * stride
+    addi.d        t5,     a1,     8         // t5 = src + 8 * stride + 8
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        t5,     a2,     t5,    2
+
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 4 * stride
+
+    // t6 = src + 6 * stride + 1
+    vld           vr0,    t6,     0
+    vldx          vr1,    t6,     a2
+    vldx          vr2,    t6,     t1
+    vldx          vr3,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr4,    t6,     0
+    vldx          vr5,    t6,     a2
+    vldx          vr6,    t6,     t1
+
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr1,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2    // dst = dst + 4 * stride
+
+    vldx          vr2,    t6,     a2
+    vldx          vr3,    t6,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr5,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+endfunc
+
+/*
+ * void put_h264_qpel16_mc13(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
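+ *
+ * Quarter-pel (1,3): the horizontal half-pel plane taken at (src + stride) is
+ * rounding-averaged with the vertical half-pel plane taken at src.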
+ */
+function ff_put_h264_qpel16_mc13_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t0,     t0,     a2   // t0 = src + stride - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     t0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    addi.d        a1,     t0,     8
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride + 1
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr1,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    add.d         t6,     t4,     zero     // t6 = src + 6 * stride
+
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr5,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a1,     a2,     t0,    3  // a1 = src + 8 * stride
+    addi.d        t5,     a1,     8         // t5 = src + 8 * stride + 8
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        t5,     a2,     t5,    2
+
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 4 * stride
+
+    // t6 = src + 6 * stride
+    vld           vr0,    t6,     0
+    vldx          vr1,    t6,     a2
+    vldx          vr2,    t6,     t1
+    vldx          vr3,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr4,    t6,     0
+    vldx          vr5,    t6,     a2
+    vldx          vr6,    t6,     t1
+
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr1,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2    // dst = dst + 4 * stride
+
+    vldx          vr2,    t6,     a2
+    vldx          vr3,    t6,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr5,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+endfunc
+
+/*
+ * void put_h264_qpel16_mc03(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
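+ *
+ * Quarter-pel (0,3): the vertical half-pel plane is rounding-averaged with
+ * the source rows at (src + stride).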
+ */
+function ff_put_h264_qpel16_mc03_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    sub.d         t2,     a1,     t0  // t2 = src - 2 * stride
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    vld           vr0,    t2,     0
+    vldx          vr1,    t2,     a2
+    vldx          vr2,    t2,     t0
+    vldx          vr3,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr4,    t2,     0
+    vldx          vr5,    t2,     a2
+    vldx          vr6,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr3,    vr13
+    vavgr.bu      vr14,   vr4,    vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr1,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vavgr.bu      vr13,   vr5,    vr13
+    vavgr.bu      vr14,   vr6,    vr14
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr2,    t2,     a2
+    vldx          vr3,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vavgr.bu      vr13,   vr0,    vr13
+    vavgr.bu      vr14,   vr1,    vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr5,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vavgr.bu      vr13,   vr2,    vr13
+    vavgr.bu      vr14,   vr3,    vr14
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr6,    t2,     a2
+    vldx          vr0,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr1, vr2, vr3, vr4, vr5, vr6, vr0
+    vavgr.bu      vr13,   vr4,    vr13
+    vavgr.bu      vr14,   vr5,    vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr1,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr2,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr3, vr4, vr5, vr6, vr0, vr1, vr2
+    vavgr.bu      vr13,   vr6,    vr13
+    vavgr.bu      vr14,   vr0,    vr14
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr3,    t2,     a2
+    vldx          vr4,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr5, vr6, vr0, vr1, vr2, vr3, vr4
+    vavgr.bu      vr13,   vr1,    vr13
+    vavgr.bu      vr14,   vr2,    vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr5,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr6,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vavgr.bu      vr13,   vr3,    vr13
+    vavgr.bu      vr14,   vr4,    vr14
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+endfunc
+
+/*
+ * void avg_h264_qpel16_mc10(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
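+ *
+ * Quarter-pel (1,0): the horizontal half-pel plane is rounding-averaged with
+ * the source rows at src, and the result is averaged with the pixels already
+ * in dst.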
+ */
+function ff_avg_h264_qpel16_mc10_lsx
+    addi.d        t0,     a0,     0   // t0 = dst
+    addi.d        t1,     a1,     -2  // t1 = src - 2
+    addi.d        t4,     t1,     8
+
+    slli.d        t2,     a2,     1
+    add.d         t3,     a2,     t2
+
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    vld           vr0,    t1,     0
+    vldx          vr1,    t1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t1,     t2
+    vldx          vr1,    t1,     t3
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t1,     a2,     t1,    2
+
+    vld           vr0,    t1,     0
+    vldx          vr1,    t1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t1,     t2
+    vldx          vr1,    t1,     t3
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t4,     0
+    vldx          vr1,    t4,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t4,     t2
+    vldx          vr1,    t4,     t3
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr14,   5
+    vssrani.bu.h  vr3,    vr15,   5
+    vldx          vr0,    a1,     t2
+    vldx          vr1,    a1,     t3
+    vldx          vr12,   t0,     t2
+    vldx          vr13,   t0,     t3
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t2
+    vstx          vr1,    a0,     t3
+
+    alsl.d        t4,     a2,     t4,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    t4,     0
+    vldx          vr1,    t4,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr16,   5
+    vssrani.bu.h  vr3,    vr17,   5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t4,     t2
+    vldx          vr1,    t4,     t3
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr18,   5
+    vssrani.bu.h  vr3,    vr19,   5
+    vldx          vr0,    a1,     t2
+    vldx          vr1,    a1,     t3
+    vldx          vr12,   t0,     t2
+    vldx          vr13,   t0,     t3
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t2
+    vstx          vr1,    a0,     t3
+
+    alsl.d        t4,     a2,     t4,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t1,     a2,     t1,    2   // t1 = src + 8 * stride - 2
+
+    vld           vr0,    t1,     0
+    vldx          vr1,    t1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t1,     t2
+    vldx          vr1,    t1,     t3
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t1,     a2,     t1,    2
+
+    vld           vr0,    t1,     0
+    vldx          vr1,    t1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t1,     t2
+    vldx          vr1,    t1,     t3
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t4,     0
+    vldx          vr1,    t4,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t4,     t2
+    vldx          vr1,    t4,     t3
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr14,   5
+    vssrani.bu.h  vr3,    vr15,   5
+    vldx          vr0,    a1,     t2
+    vldx          vr1,    a1,     t3
+    vldx          vr12,   t0,     t2
+    vldx          vr13,   t0,     t3
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t2
+    vstx          vr1,    a0,     t3
+
+    alsl.d        t4,     a2,     t4,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    t4,     0
+    vldx          vr1,    t4,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr16,   5
+    vssrani.bu.h  vr3,    vr17,   5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t4,     t2
+    vldx          vr1,    t4,     t3
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr18,   5
+    vssrani.bu.h  vr3,    vr19,   5
+    vldx          vr0,    a1,     t2
+    vldx          vr1,    a1,     t3
+    vldx          vr12,   t0,     t2
+    vldx          vr13,   t0,     t3
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t2
+    vstx          vr1,    a0,     t3
+endfunc
+
+/*
+ * void avg_h264_qpel16_mc30(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
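+ *
+ * Quarter-pel (3,0): the horizontal half-pel plane is rounding-averaged with
+ * the source rows at (src + 1), and the result is averaged with the pixels
+ * already in dst.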
+ */
+function ff_avg_h264_qpel16_mc30_lsx
+    addi.d        t0,     a0,     0   // t0 = dst
+    addi.d        t1,     a1,     -2  // t1 = src - 2
+    addi.d        t4,     t1,     8
+    addi.d        a1,     a1,     1   // a1 = src + 1
+
+    slli.d        t2,     a2,     1
+    add.d         t3,     a2,     t2
+
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    vld           vr0,    t1,     0
+    vldx          vr1,    t1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t1,     t2
+    vldx          vr1,    t1,     t3
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t1,     a2,     t1,    2
+
+    vld           vr0,    t1,     0
+    vldx          vr1,    t1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t1,     t2
+    vldx          vr1,    t1,     t3
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t4,     0
+    vldx          vr1,    t4,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t4,     t2
+    vldx          vr1,    t4,     t3
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr14,   5
+    vssrani.bu.h  vr3,    vr15,   5
+    vldx          vr0,    a1,     t2
+    vldx          vr1,    a1,     t3
+    vldx          vr12,   t0,     t2
+    vldx          vr13,   t0,     t3
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t2
+    vstx          vr1,    a0,     t3
+
+    alsl.d        t4,     a2,     t4,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    t4,     0
+    vldx          vr1,    t4,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr16,   5
+    vssrani.bu.h  vr3,    vr17,   5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t4,     t2
+    vldx          vr1,    t4,     t3
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr18,   5
+    vssrani.bu.h  vr3,    vr19,   5
+    vldx          vr0,    a1,     t2
+    vldx          vr1,    a1,     t3
+    vldx          vr12,   t0,     t2
+    vldx          vr13,   t0,     t3
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t2
+    vstx          vr1,    a0,     t3
+
+    alsl.d        t4,     a2,     t4,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t1,     a2,     t1,    2   // t1 = src + 8 * stride - 2
+
+    vld           vr0,    t1,     0
+    vldx          vr1,    t1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t1,     t2
+    vldx          vr1,    t1,     t3
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        t1,     a2,     t1,    2
+
+    vld           vr0,    t1,     0
+    vldx          vr1,    t1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    t1,     t2
+    vldx          vr1,    t1,     t3
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t4,     0
+    vldx          vr1,    t4,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t4,     t2
+    vldx          vr1,    t4,     t3
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr14,   5
+    vssrani.bu.h  vr3,    vr15,   5
+    vldx          vr0,    a1,     t2
+    vldx          vr1,    a1,     t3
+    vldx          vr12,   t0,     t2
+    vldx          vr13,   t0,     t3
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t2
+    vstx          vr1,    a0,     t3
+
+    alsl.d        t4,     a2,     t4,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    t4,     0
+    vldx          vr1,    t4,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr16,   5
+    vssrani.bu.h  vr3,    vr17,   5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t4,     t2
+    vldx          vr1,    t4,     t3
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vssrani.bu.h  vr2,    vr18,   5
+    vssrani.bu.h  vr3,    vr19,   5
+    vldx          vr0,    a1,     t2
+    vldx          vr1,    a1,     t3
+    vldx          vr12,   t0,     t2
+    vldx          vr13,   t0,     t3
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t2
+    vstx          vr1,    a0,     t3
+endfunc
+
+/*
+ * void put_h264_qpel16_mc02(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
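+ *
+ * Half-pel (0,2): plain vertical half-pel filter; the filtered rows are
+ * written directly to dst.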
+ */
+function ff_put_h264_qpel16_mc02_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    sub.d         t2,     a1,     t0  // t2 = src - 2 * stride
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    vld           vr0,    t2,     0
+    vldx          vr1,    t2,     a2
+    vldx          vr2,    t2,     t0
+    vldx          vr3,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr4,    t2,     0
+    vldx          vr5,    t2,     a2
+    vldx          vr6,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr1,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr2,    t2,     a2
+    vldx          vr3,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr5,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr6,    t2,     a2
+    vldx          vr0,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr1, vr2, vr3, vr4, vr5, vr6, vr0
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr1,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr2,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr3, vr4, vr5, vr6, vr0, vr1, vr2
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr3,    t2,     a2
+    vldx          vr4,    t2,     t0
+    LSX_QPEL8_V_LOWPASS vr5, vr6, vr0, vr1, vr2, vr3, vr4
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr5,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr6,    t2,     0
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+endfunc
+
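+/*
+ * Shared body of ff_avg_h264_qpel16_mc11/mc13/mc31/mc33_lsx.  Expects
+ * t0 = H-pass source - 2, t4 = V-pass source - 2 * stride, t8 = dst,
+ * t1 = 2 * stride and t2 = 3 * stride.  The horizontal and vertical half-pel
+ * planes are rounding-averaged with each other and then with the pixels
+ * already in dst.
+ */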
+.macro lsx_avc_luma_hv_qrt_and_aver_dst_16x16
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     t0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    addi.d        a1,     t0,     8
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride (+ 1 for mc31/mc33)
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vld           vr0,    t8,     0
+    vldx          vr1,    t8,     a2
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vavgr.bu      vr13,   vr13,   vr0
+    vavgr.bu      vr14,   vr14,   vr1
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr1,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vldx          vr2,    t8,     t1
+    vldx          vr3,    t8,     t2
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vavgr.bu      vr13,   vr13,   vr2
+    vavgr.bu      vr14,   vr14,   vr3
+    add.d         t6,     t4,     zero     // t6 = src + 6 * stride
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+    alsl.d        t8,     a2,     t8,    2
+
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vld           vr4,    t8,     0
+    vldx          vr5,    t8,     a2
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vavgr.bu      vr13,   vr13,   vr4
+    vavgr.bu      vr14,   vr14,   vr5
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr5,    t4,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vldx          vr6,    t8,     t1
+    vldx          vr0,    t8,     t2
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vavgr.bu      vr13,   vr13,   vr6
+    vavgr.bu      vr14,   vr14,   vr0
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a1,     a2,     t0,    3  // a1 = src + 8 * stride
+    addi.d        t5,     a1,     8         // t5 = src + 8 * stride + 8
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    LSX_QPEL8_H_LOWPASS vr23, vr24
+    vssrani.bu.h  vr23,   vr12,   5
+    vssrani.bu.h  vr24,   vr13,   5
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    LSX_QPEL8_H_LOWPASS vr25, vr26
+    vssrani.bu.h  vr25,   vr14,   5
+    vssrani.bu.h  vr26,   vr15,   5
+
+    alsl.d        t5,     a2,     t5,    2
+
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    LSX_QPEL8_H_LOWPASS vr27, vr28
+    vssrani.bu.h  vr27,   vr16,   5
+    vssrani.bu.h  vr28,   vr17,   5
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    LSX_QPEL8_H_LOWPASS vr29, vr30
+    vssrani.bu.h  vr29,   vr18,   5
+    vssrani.bu.h  vr30,   vr19,   5
+
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 4 * stride
+    alsl.d        t8,     a2,     t8,    2
+    // t6 = src + 6 * stride (+ 1 for mc31/mc33)
+    vld           vr0,    t6,     0
+    vldx          vr1,    t6,     a2
+    vldx          vr2,    t6,     t1
+    vldx          vr3,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr4,    t6,     0
+    vldx          vr5,    t6,     a2
+    vldx          vr6,    t6,     t1
+
+    LSX_QPEL8_V_LOWPASS vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vld           vr0,    t8,     0
+    vldx          vr1,    t8,     a2
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vavgr.bu      vr13,   vr13,   vr0
+    vavgr.bu      vr14,   vr14,   vr1
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr1,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vldx          vr2,    t8,     t1
+    vldx          vr3,    t8,     t2
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vavgr.bu      vr13,   vr13,   vr2
+    vavgr.bu      vr14,   vr14,   vr3
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    alsl.d        a0,     a2,     a0,    2    // dst = dst + 4 * stride
+    alsl.d        t8,     a2,     t8,    2
+
+    vldx          vr2,    t6,     a2
+    vldx          vr3,    t6,     t1
+    LSX_QPEL8_V_LOWPASS vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vld           vr4,    t8,     0
+    vldx          vr5,    t8,     a2
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vavgr.bu      vr13,   vr13,   vr4
+    vavgr.bu      vr14,   vr14,   vr5
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr5,    t6,     0
+    LSX_QPEL8_V_LOWPASS vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vldx          vr6,    t8,     t1
+    vldx          vr0,    t8,     t2
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vavgr.bu      vr13,   vr13,   vr6
+    vavgr.bu      vr14,   vr14,   vr0
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+.endm
+
+/*
+ * void avg_h264_qpel16_mc33(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
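+ *
+ * Quarter-pel (3,3): horizontal half-pel plane from (src + stride), vertical
+ * half-pel plane from (src + 1); see lsx_avc_luma_hv_qrt_and_aver_dst_16x16.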
+ */
+function ff_avg_h264_qpel16_mc33_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t0,     t0,     a2   // t0 = src + stride - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t4,     t4,     1
+    addi.d        t8,     a0,     0    // t8 = dst
+
+    lsx_avc_luma_hv_qrt_and_aver_dst_16x16
+endfunc
+
+/*
+ * void avg_h264_qpel16_mc11(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
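+ *
+ * Quarter-pel (1,1): horizontal and vertical half-pel planes both taken at
+ * src; see lsx_avc_luma_hv_qrt_and_aver_dst_16x16.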
+ */
+function ff_avg_h264_qpel16_mc11_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t8,     a0,     0    // t8 = dst
+
+    lsx_avc_luma_hv_qrt_and_aver_dst_16x16
+endfunc
+
+/*
+ * void avg_h264_qpel16_mc31(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
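+ *
+ * Quarter-pel (3,1): horizontal half-pel plane from src, vertical half-pel
+ * plane from (src + 1); see lsx_avc_luma_hv_qrt_and_aver_dst_16x16.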
+ */
+function ff_avg_h264_qpel16_mc31_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t4,     t4,     1
+    addi.d        t8,     a0,     0    // t8 = dst
+
+    lsx_avc_luma_hv_qrt_and_aver_dst_16x16
+endfunc
+
+/*
+ * void avg_h264_qpel16_mc13(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
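+ *
+ * Quarter-pel (1,3): horizontal half-pel plane from (src + stride), vertical
+ * half-pel plane from src; see lsx_avc_luma_hv_qrt_and_aver_dst_16x16.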
+ */
+function ff_avg_h264_qpel16_mc13_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t0,     t0,     a2   // t0 = src + stride - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t8,     a0,     0    // t8 = dst
+
+    lsx_avc_luma_hv_qrt_and_aver_dst_16x16
+endfunc
+
+/*
+ * void avg_h264_qpel16_mc20(uint8_t *dst, const uint8_t *src,
+ *                           ptrdiff_t stride)
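+ *
+ * Half-pel (2,0): horizontal half-pel filter; each filtered row is
+ * rounding-averaged with the corresponding row already in dst before being
+ * stored.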
+ */
+function ff_avg_h264_qpel16_mc20_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        t5,     a0,     0    // t5 = dst
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     t0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    addi.d        t0,     t0,     8
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    vssrani.bu.h  vr2,    vr14,   5
+    vssrani.bu.h  vr3,    vr15,   5
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        t5,     a2,     t5,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    vssrani.bu.h  vr2,    vr16,   5
+    vssrani.bu.h  vr3,    vr17,   5
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    vssrani.bu.h  vr2,    vr18,   5
+    vssrani.bu.h  vr3,    vr19,   5
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        t5,     a2,     t5,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr14, vr15
+
+    alsl.d        a1,     a2,     a1,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    LSX_QPEL8_H_LOWPASS vr16, vr17
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr18, vr19
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    vssrani.bu.h  vr2,    vr12,   5
+    vssrani.bu.h  vr3,    vr13,   5
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    vssrani.bu.h  vr2,    vr14,   5
+    vssrani.bu.h  vr3,    vr15,   5
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        t5,     a2,     t5,    2
+    alsl.d        a0,     a2,     a0,    2
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    vssrani.bu.h  vr2,    vr16,   5
+    vssrani.bu.h  vr3,    vr17,   5
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr2, vr3
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    vssrani.bu.h  vr2,    vr18,   5
+    vssrani.bu.h  vr3,    vr19,   5
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+endfunc
+
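+/*
+ * Horizontal 6-tap pass of the hv lowpass: for each of the two source rows in
+ * vr0/vr1 (each loaded from x - 2) compute
+ *     20*(s[x] + s[x+1]) - 5*(s[x-1] + s[x+2]) + (s[x-2] + s[x+3])
+ * and keep the result as 16-bit intermediates in \out0/\out1
+ * (vr20 = 20, vr21 = 5; no rounding or shift is applied yet).
+ */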
+.macro LSX_QPEL8_HV_LOWPASS_H out0, out1
+    vbsrl.v       vr2,    vr0,    1
+    vbsrl.v       vr3,    vr1,    1
+    vbsrl.v       vr4,    vr0,    2
+    vbsrl.v       vr5,    vr1,    2
+    vbsrl.v       vr6,    vr0,    3
+    vbsrl.v       vr7,    vr1,    3
+    vbsrl.v       vr8,    vr0,    4
+    vbsrl.v       vr9,    vr1,    4
+    vbsrl.v       vr10,   vr0,    5
+    vbsrl.v       vr11,   vr1,    5
+
+    vilvl.b       vr6,    vr4,    vr6
+    vilvl.b       vr7,    vr5,    vr7
+    vilvl.b       vr8,    vr2,    vr8
+    vilvl.b       vr9,    vr3,    vr9
+    vilvl.b       vr10,   vr0,    vr10
+    vilvl.b       vr11,   vr1,    vr11
+
+    vhaddw.hu.bu  vr6,    vr6,    vr6
+    vhaddw.hu.bu  vr7,    vr7,    vr7
+    vhaddw.hu.bu  vr8,    vr8,    vr8
+    vhaddw.hu.bu  vr9,    vr9,    vr9
+    vhaddw.hu.bu  vr10,   vr10,   vr10
+    vhaddw.hu.bu  vr11,   vr11,   vr11
+
+    vmul.h        vr2,    vr6,    vr20
+    vmul.h        vr3,    vr7,    vr20
+    vmul.h        vr4,    vr8,    vr21
+    vmul.h        vr5,    vr9,    vr21
+    vssub.h       vr2,    vr2,    vr4
+    vssub.h       vr3,    vr3,    vr5
+    vsadd.h       \out0,  vr2,    vr10
+    vsadd.h       \out1,  vr3,    vr11
+.endm
+
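+/*
+ * Vertical 6-tap pass over seven rows of 16-bit intermediates \in0 .. \in6,
+ * accumulated in 32 bits:
+ *     20*(\in2 + \in3) - 5*(\in1 + \in4) + (\in0 + \in5) + 512
+ * for the first output row (window shifted by one for the second), then
+ * narrowed by >> 10, clipped to unsigned 8-bit and packed into \out3.
+ */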
+.macro LSX_QPEL8_HV_LOWPASS_V in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3
+    vilvl.h       vr0,    \in2,   \in3
+    vilvl.h       vr1,    \in3,   \in4  // tmp0
+    vilvl.h       vr2,    \in1,   \in4
+    vilvl.h       vr3,    \in2,   \in5  // tmp2
+    vilvl.h       vr4,    \in0,   \in5
+    vilvl.h       vr5,    \in1,   \in6  // tmp4
+    vhaddw.w.h    vr0,    vr0,    vr0
+    vhaddw.w.h    vr1,    vr1,    vr1
+    vhaddw.w.h    vr2,    vr2,    vr2
+    vhaddw.w.h    vr3,    vr3,    vr3
+    vhaddw.w.h    vr4,    vr4,    vr4
+    vhaddw.w.h    vr5,    vr5,    vr5
+    vmul.w        vr0,    vr0,    vr22
+    vmul.w        vr1,    vr1,    vr22
+    vmul.w        vr2,    vr2,    vr23
+    vmul.w        vr3,    vr3,    vr23
+    vssub.w       vr0,    vr0,    vr2
+    vssub.w       vr1,    vr1,    vr3
+    vsadd.w       vr0,    vr0,    vr4
+    vsadd.w       vr1,    vr1,    vr5
+    vsadd.w       \out0,  vr0,    vr24
+    vsadd.w       \out1,  vr1,    vr24
+
+    vilvh.h       vr0,    \in2,   \in3
+    vilvh.h       vr1,    \in3,   \in4  // tmp0
+    vilvh.h       vr2,    \in1,   \in4
+    vilvh.h       vr3,    \in2,   \in5  // tmp2
+    vilvh.h       vr4,    \in0,   \in5
+    vilvh.h       vr5,    \in1,   \in6  // tmp4
+    vhaddw.w.h    vr0,    vr0,    vr0
+    vhaddw.w.h    vr1,    vr1,    vr1
+    vhaddw.w.h    vr2,    vr2,    vr2
+    vhaddw.w.h    vr3,    vr3,    vr3
+    vhaddw.w.h    vr4,    vr4,    vr4
+    vhaddw.w.h    vr5,    vr5,    vr5
+    vmul.w        vr0,    vr0,    vr22
+    vmul.w        vr1,    vr1,    vr22
+    vmul.w        vr2,    vr2,    vr23
+    vmul.w        vr3,    vr3,    vr23
+    vssub.w       vr0,    vr0,    vr2
+    vssub.w       vr1,    vr1,    vr3
+    vsadd.w       vr0,    vr0,    vr4
+    vsadd.w       vr1,    vr1,    vr5
+    vsadd.w       \out2,  vr0,    vr24
+    vsadd.w       \out3,  vr1,    vr24
+
+    vssrani.hu.w  \out2,  \out0,  10
+    vssrani.hu.w  \out3,  \out1,  10
+    vssrani.bu.h  \out3,  \out2,  0
+.endm
+
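+/*
+ * Computes one 8x8 block at the centre (half, half) position: the horizontal
+ * pass is run over a sliding window of source rows (\in0 = src - 2 -
+ * 2 * srcStride, row stride a3) and its 16-bit results are fed through the
+ * vertical pass, two 8-byte output rows being stored to \in1 (dst, row
+ * stride a2) after each vertical step.
+ */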
+.macro put_h264_qpel8_hv_lowpass_core_lsx in0, in1
+    vld           vr0,    \in0,  0
+    vldx          vr1,    \in0,  a3
+    LSX_QPEL8_HV_LOWPASS_H vr12, vr13 // a b
+    vldx          vr0,    \in0,  t1
+    vldx          vr1,    \in0,  t2
+    LSX_QPEL8_HV_LOWPASS_H vr14, vr15 // c d
+
+    alsl.d        \in0,   a3,    \in0,   2
+
+    vld           vr0,    \in0,  0
+    vldx          vr1,    \in0,  a3
+    LSX_QPEL8_HV_LOWPASS_H vr16, vr17 // e f
+    vldx          vr0,    \in0,  t1
+    vldx          vr1,    \in0,  t2
+    LSX_QPEL8_HV_LOWPASS_H vr18, vr19 // g h
+
+    LSX_QPEL8_HV_LOWPASS_V vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr6, vr7, vr0, vr1
+    vstelm.d      vr1,    \in1,    0,     0
+    add.d         \in1,   \in1,    a2
+    vstelm.d      vr1,    \in1,    0,     1
+
+    alsl.d        \in0,    a3,    \in0,   2
+
+    // tmp8
+    vld           vr0,    \in0,   0
+    vldx          vr1,    \in0,   a3
+    LSX_QPEL8_HV_LOWPASS_H vr12, vr13
+    LSX_QPEL8_HV_LOWPASS_V vr14, vr15, vr16, vr17, vr18, vr19, vr12, vr6, vr7, vr0, vr1
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+
+    // tmp10
+    vldx          vr0,    \in0,   t1
+    vldx          vr1,    \in0,   t2
+    LSX_QPEL8_HV_LOWPASS_H vr14, vr15
+    LSX_QPEL8_HV_LOWPASS_V vr16, vr17, vr18, vr19, vr12, vr13, vr14, vr6, vr7, vr0, vr1
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+
+    // tmp12
+    alsl.d        \in0,   a3,     \in0,  2
+
+    vld           vr0,    \in0,   0
+    vldx          vr1,    \in0,   a3
+    LSX_QPEL8_HV_LOWPASS_H vr16, vr17
+    LSX_QPEL8_HV_LOWPASS_V vr18, vr19, vr12, vr13, vr14, vr15, vr16, vr6, vr7, vr0, vr1
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+.endm
+
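+/*
+ * Rough scalar sketch of the 8x8 hv lowpass implemented by the macros above
+ * (int16_t tmp[13][8] and av_clip_uint8() are assumed here, mirroring the
+ * generic C template; the LSX code keeps the intermediates in vector
+ * registers instead of a tmp buffer):
+ *
+ *   for (y = -2; y < 11; y++)                       // horizontal pass
+ *       for (x = 0; x < 8; x++)
+ *           tmp[y + 2][x] =
+ *                 (src[y * srcStride + x - 2] + src[y * srcStride + x + 3])
+ *               -  5 * (src[y * srcStride + x - 1] + src[y * srcStride + x + 2])
+ *               + 20 * (src[y * srcStride + x]     + src[y * srcStride + x + 1]);
+ *
+ *   for (y = 0; y < 8; y++)                         // vertical pass
+ *       for (x = 0; x < 8; x++)
+ *           dst[y * dstStride + x] =
+ *               av_clip_uint8((       (tmp[y][x]     + tmp[y + 5][x])
+ *                              -  5 * (tmp[y + 1][x] + tmp[y + 4][x])
+ *                              + 20 * (tmp[y + 2][x] + tmp[y + 3][x]) + 512) >> 10);
+ */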
+function put_h264_qpel8_hv_lowpass_lsx
+    slli.d        t1,     a3,     1
+    add.d         t2,     t1,     a3
+
+    addi.d        sp,     sp,     -8
+    fst.d         f24,    sp,     0
+
+    vldi          vr20,   0x414   // h_20
+    vldi          vr21,   0x405   // h_5
+    vldi          vr22,   0x814   // w_20
+    vldi          vr23,   0x805   // w_5
+    addi.d        t4,     zero,   512
+    vreplgr2vr.w  vr24,   t4      // w_512
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    sub.d         t0,     t0,     t1   // t0 = t0 - 2 * stride
+
+    put_h264_qpel8_hv_lowpass_core_lsx t0, a0
+
+    fld.d         f24,    sp,     0
+    addi.d        sp,     sp,     8
+endfunc
+
+/*
+ * void put_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ *                                   ptrdiff_t dstStride, ptrdiff_t srcStride)
+ */
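+/*
+ * Rough scalar sketch of the horizontal lowpass below (av_clip_uint8() is
+ * assumed, as in the generic C template):
+ *
+ *   for (y = 0; y < 8; y++, dst += dstStride, src += srcStride)
+ *       for (x = 0; x < 8; x++)
+ *           dst[x] = av_clip_uint8((       (src[x - 2] + src[x + 3])
+ *                                   -  5 * (src[x - 1] + src[x + 2])
+ *                                   + 20 * (src[x]     + src[x + 1]) + 16) >> 5);
+ */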
+function put_h264_qpel8_h_lowpass_lsx
+    slli.d        t1,     a3,     1
+    add.d         t2,     t1,     a3
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a3
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    alsl.d        a1,     a3,     t0,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a3
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+endfunc
+
+/*
+ * void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+ *                            ptrdiff_t dstStride, ptrdiff_t srcStride)
+ */
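+/* dst = avg(src, half); the half buffer rows are packed 16 bytes apart. */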
+function put_pixels16_l2_8_lsx
+    slli.d        t0,     a4,     1
+    add.d         t1,     t0,     a4
+    slli.d        t2,     t0,     1
+    slli.d        t3,     a3,     1
+    add.d         t4,     t3,     a3
+    slli.d        t5,     t3,     1
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a4
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+    add.d         a1,     a1,     t2
+
+    vld           vr8,    a2,     0x00
+    vld           vr9,    a2,     0x10
+    vld           vr10,   a2,     0x20
+    vld           vr11,   a2,     0x30
+    vld           vr12,   a2,     0x40
+    vld           vr13,   a2,     0x50
+    vld           vr14,   a2,     0x60
+    vld           vr15,   a2,     0x70
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a3
+    vstx          vr2,    a0,     t3
+    vstx          vr3,    a0,     t4
+    add.d         a0,     a0,     t5
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a3
+    vstx          vr6,    a0,     t3
+    vstx          vr7,    a0,     t4
+    add.d         a0,     a0,     t5
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a4
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+
+    vld           vr8,    a2,     0x80
+    vld           vr9,    a2,     0x90
+    vld           vr10,   a2,     0xa0
+    vld           vr11,   a2,     0xb0
+    vld           vr12,   a2,     0xc0
+    vld           vr13,   a2,     0xd0
+    vld           vr14,   a2,     0xe0
+    vld           vr15,   a2,     0xf0
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a3
+    vstx          vr2,    a0,     t3
+    vstx          vr3,    a0,     t4
+    add.d         a0,     a0,     t5
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a3
+    vstx          vr6,    a0,     t3
+    vstx          vr7,    a0,     t4
+endfunc
+
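+/*
+ * Computes two 8-pixel output rows from seven consecutive source rows:
+ * apply the (1, -5, 20, 20, -5, 1) kernel vertically at 16-bit precision
+ * (vr20 = 20, vr21 = 5), add the rounding constant 16 (vr22) and narrow
+ * with a saturating >> 5; both result rows are packed into vr8.
+ */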
+.macro LSX_QPEL8_V_LOWPASS_1 in0, in1, in2, in3, in4, in5, in6
+    vilvl.b       vr7,    \in3,   \in2
+    vilvl.b       vr8,    \in4,   \in3
+    vilvl.b       vr9,    \in4,   \in1
+    vilvl.b       vr10,   \in5,   \in2
+    vilvl.b       vr11,   \in5,   \in0
+    vilvl.b       vr12,   \in6,   \in1
+
+    vhaddw.hu.bu  vr7,    vr7,    vr7
+    vhaddw.hu.bu  vr8,    vr8,    vr8
+    vhaddw.hu.bu  vr9,    vr9,    vr9
+    vhaddw.hu.bu  vr10,   vr10,   vr10
+    vhaddw.hu.bu  vr11,   vr11,   vr11
+    vhaddw.hu.bu  vr12,   vr12,   vr12
+
+    vmul.h        vr7,    vr7,    vr20
+    vmul.h        vr8,    vr8,    vr20
+    vmul.h        vr9,    vr9,    vr21
+    vmul.h        vr10,   vr10,   vr21
+
+    vssub.h       vr7,    vr7,    vr9
+    vssub.h       vr8,    vr8,    vr10
+    vsadd.h       vr7,    vr7,    vr11
+    vsadd.h       vr8,    vr8,    vr12
+    vsadd.h       vr7,    vr7,    vr22
+    vsadd.h       vr8,    vr8,    vr22
+
+    vssrani.bu.h  vr8,    vr7,    5
+.endm
+
+/*
+ * void put_h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride,
+ *                               int srcStride)
+ */
+function put_h264_qpel8_v_lowpass_lsx
+    slli.d        t0,     a3,     1
+    add.d         t1,     t0,     a3
+    sub.d         t2,     a1,     t0  // t2 = src - 2 * stride
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    fld.d         f0,     t2,     0
+    fldx.d        f1,     t2,     a3
+    fldx.d        f2,     t2,     t0
+    fldx.d        f3,     t2,     t1
+    alsl.d        t2,     a3,     t2,    2  // t2 = t2 + 4 * stride
+    fld.d         f4,     t2,     0
+    fldx.d        f5,     t2,     a3
+    fldx.d        f6,     t2,     t0
+    LSX_QPEL8_V_LOWPASS_1 vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    fldx.d        f0,     t2,     t1
+    alsl.d        t2,     a3,     t2,    2  // t2 = t2 + 4 * stride
+    fld.d         f1,     t2,     0
+    LSX_QPEL8_V_LOWPASS_1 vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    fldx.d        f2,     t2,     a3
+    fldx.d        f3,     t2,     t0
+    LSX_QPEL8_V_LOWPASS_1 vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    fldx.d        f4,     t2,     t1
+    alsl.d        t2,     a3,     t2,    2 // t2 = t2 + 4 * stride
+    fld.d         f5,     t2,     0
+    LSX_QPEL8_V_LOWPASS_1 vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+endfunc
+
+/*
+ * void avg_h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride,
+ *                               int srcStride)
+ */
+function avg_h264_qpel8_v_lowpass_lsx
+    slli.d        t0,     a3,     1
+    add.d         t1,     t0,     a3
+    sub.d         t2,     a1,     t0  // t2 = src - 2 * stride
+    addi.d        t3,     a0,     0
+    slli.d        t4,     a2,     1
+    add.d         t5,     t4,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    fld.d         f0,     t2,     0
+    fldx.d        f1,     t2,     a3
+    fldx.d        f2,     t2,     t0
+    fldx.d        f3,     t2,     t1
+    alsl.d        t2,     a3,     t2,    2  // t2 = t2 + 4 * stride
+    fld.d         f4,     t2,     0
+    fldx.d        f5,     t2,     a3
+    fldx.d        f6,     t2,     t0
+    LSX_QPEL8_V_LOWPASS_1 vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    fld.d         f0,     t3,     0
+    fldx.d        f1,     t3,     a2
+    vilvl.d       vr0,    vr1,    vr0
+    vavgr.bu      vr8,    vr8,    vr0
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    fldx.d        f0,     t2,     t1
+    alsl.d        t2,     a3,     t2,   2  // t2 = t2 + 4 * stride
+    fld.d         f1,     t2,     0
+    LSX_QPEL8_V_LOWPASS_1 vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    fldx.d        f2,     t3,     t4
+    fldx.d        f3,     t3,     t5
+    vilvl.d       vr2,    vr3,    vr2
+    vavgr.bu      vr8,    vr8,    vr2
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    alsl.d        t3,     a2,     t3,   2
+
+    fldx.d        f2,     t2,     a3
+    fldx.d        f3,     t2,     t0
+    LSX_QPEL8_V_LOWPASS_1 vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    fld.d         f4,     t3,     0
+    fldx.d        f5,     t3,     a2
+    vilvl.d       vr4,    vr5,    vr4
+    vavgr.bu      vr8,    vr8,    vr4
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    fldx.d        f4,     t2,     t1
+    alsl.d        t2,     a3,     t2,   2 // t2 = t2 + 4 * stride
+    fld.d         f5,     t2,     0
+    LSX_QPEL8_V_LOWPASS_1 vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    fldx.d        f6,     t3,     t4
+    fldx.d        f0,     t3,     t5
+    vilvl.d       vr6,    vr0,    vr6
+    vavgr.bu      vr8,    vr8,    vr6
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+endfunc
+
+/*
+ * void avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+ *                            ptrdiff_t dstStride, ptrdiff_t srcStride)
+ */
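+/* dst = avg(avg(src, half), dst); the half buffer rows are packed 16 bytes apart. */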
+function avg_pixels16_l2_8_lsx
+    slli.d        t0,     a4,     1
+    add.d         t1,     t0,     a4
+    slli.d        t2,     t0,     1
+    slli.d        t3,     a3,     1
+    add.d         t4,     t3,     a3
+    slli.d        t5,     t3,     1
+    addi.d        t6,     a0,     0
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a4
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+    add.d         a1,     a1,     t2
+
+    vld           vr8,    a2,     0x00
+    vld           vr9,    a2,     0x10
+    vld           vr10,   a2,     0x20
+    vld           vr11,   a2,     0x30
+    vld           vr12,   a2,     0x40
+    vld           vr13,   a2,     0x50
+    vld           vr14,   a2,     0x60
+    vld           vr15,   a2,     0x70
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vld           vr8,    t6,     0
+    vldx          vr9,    t6,     a3
+    vldx          vr10,   t6,     t3
+    vldx          vr11,   t6,     t4
+    add.d         t6,     t6,     t5
+    vld           vr12,   t6,     0
+    vldx          vr13,   t6,     a3
+    vldx          vr14,   t6,     t3
+    vldx          vr15,   t6,     t4
+    add.d         t6,     t6,     t5
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a3
+    vstx          vr2,    a0,     t3
+    vstx          vr3,    a0,     t4
+    add.d         a0,     a0,     t5
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a3
+    vstx          vr6,    a0,     t3
+    vstx          vr7,    a0,     t4
+    add.d         a0,     a0,     t5
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a4
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+
+    vld           vr8,    a2,     0x80
+    vld           vr9,    a2,     0x90
+    vld           vr10,   a2,     0xa0
+    vld           vr11,   a2,     0xb0
+    vld           vr12,   a2,     0xc0
+    vld           vr13,   a2,     0xd0
+    vld           vr14,   a2,     0xe0
+    vld           vr15,   a2,     0xf0
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vld           vr8,    t6,     0
+    vldx          vr9,    t6,     a3
+    vldx          vr10,   t6,     t3
+    vldx          vr11,   t6,     t4
+    add.d         t6,     t6,     t5
+    vld           vr12,   t6,     0
+    vldx          vr13,   t6,     a3
+    vldx          vr14,   t6,     t3
+    vldx          vr15,   t6,     t4
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a3
+    vstx          vr2,    a0,     t3
+    vstx          vr3,    a0,     t4
+    add.d         a0,     a0,     t5
+    vst           vr4,    a0,     0
+    vstx          vr5,    a0,     a3
+    vstx          vr6,    a0,     t3
+    vstx          vr7,    a0,     t4
+endfunc
+
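+/*
+ * Same two-pass filtering as put_h264_qpel8_hv_lowpass_core_lsx, but every
+ * pair of output rows is additionally averaged with the corresponding dst
+ * rows (loaded through \in2) before being stored.
+ */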
+.macro avg_h264_qpel8_hv_lowpass_core_lsx in0, in1, in2
+    vld           vr0,    \in0,  0
+    vldx          vr1,    \in0,  a3
+    LSX_QPEL8_HV_LOWPASS_H vr12, vr13 // a b
+    vldx          vr0,    \in0,  t1
+    vldx          vr1,    \in0,  t2
+    LSX_QPEL8_HV_LOWPASS_H vr14, vr15 // c d
+
+    alsl.d        \in0,   a3,    \in0,   2
+
+    vld           vr0,    \in0,   0
+    vldx          vr1,    \in0,   a3
+    LSX_QPEL8_HV_LOWPASS_H vr16, vr17 // e f
+    vldx          vr0,    \in0,   t1
+    vldx          vr1,    \in0,   t2
+    LSX_QPEL8_HV_LOWPASS_H vr18, vr19 // g h
+
+    LSX_QPEL8_HV_LOWPASS_V vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr6, vr7, vr0, vr1
+    fld.d         f2,     \in2,    0
+    fldx.d        f3,     \in2,    a2
+    vilvl.d       vr2,    vr3,     vr2
+    vavgr.bu      vr1,    vr2,     vr1
+    vstelm.d      vr1,    \in1,    0,     0
+    add.d         \in1,   \in1,    a2
+    vstelm.d      vr1,    \in1,    0,     1
+
+    alsl.d        \in0,    a3,     \in0,  2
+
+    // tmp8
+    vld           vr0,    \in0,   0
+    vldx          vr1,    \in0,   a3
+    LSX_QPEL8_HV_LOWPASS_H vr12, vr13
+    LSX_QPEL8_HV_LOWPASS_V vr14, vr15, vr16, vr17, vr18, vr19, vr12, vr6, vr7, vr0, vr1
+    fldx.d        f2,     \in2,    t5
+    fldx.d        f3,     \in2,    t6
+    vilvl.d       vr2,    vr3,     vr2
+    vavgr.bu      vr1,    vr2,     vr1
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+
+    alsl.d        \in2,   a2,     \in2,  2
+
+    // tmp10
+    vldx          vr0,    \in0,   t1
+    vldx          vr1,    \in0,   t2
+    LSX_QPEL8_HV_LOWPASS_H vr14, vr15
+    LSX_QPEL8_HV_LOWPASS_V vr16, vr17, vr18, vr19, vr12, vr13, vr14, vr6, vr7, vr0, vr1
+    fld.d         f2,     \in2,    0
+    fldx.d        f3,     \in2,    a2
+    vilvl.d       vr2,    vr3,     vr2
+    vavgr.bu      vr1,    vr2,     vr1
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+
+    // tmp12
+    alsl.d        \in0,   a3,     \in0,  2
+
+    vld           vr0,    \in0,   0
+    vldx          vr1,    \in0,   a3
+    LSX_QPEL8_HV_LOWPASS_H vr16, vr17
+    LSX_QPEL8_HV_LOWPASS_V vr18, vr19, vr12, vr13, vr14, vr15, vr16, vr6, vr7, vr0, vr1
+    fldx.d        f2,     \in2,   t5
+    fldx.d        f3,     \in2,   t6
+    vilvl.d       vr2,    vr3,    vr2
+    vavgr.bu      vr1,    vr2,    vr1
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+.endm
+
+function avg_h264_qpel8_hv_lowpass_lsx
+    slli.d        t1,     a3,     1
+    add.d         t2,     t1,     a3
+    slli.d        t5,     a2,     1
+    add.d         t6,     a2,     t5
+
+    addi.d        sp,     sp,     -8
+    fst.d         f24,    sp,     0
+
+    vldi          vr20,   0x414   // h_20
+    vldi          vr21,   0x405   // h_5
+    vldi          vr22,   0x814   // w_20
+    vldi          vr23,   0x805   // w_5
+    addi.d        t4,     zero,   512
+    vreplgr2vr.w  vr24,   t4      // w_512
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    sub.d         t0,     t0,     t1   // t0 = t0 - 2 * stride
+    addi.d        t3,     a0,     0    // t3 = dst
+
+    avg_h264_qpel8_hv_lowpass_core_lsx t0, a0, t3
+
+    fld.d         f24,    sp,     0
+    addi.d        sp,     sp,     8
+endfunc
+
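+/* dst = avg(src, half); the half buffer rows are packed 8 bytes apart. */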
+function put_pixels8_l2_8_lsx
+    slli.d        t0,     a4,     1
+    add.d         t1,     t0,     a4
+    slli.d        t2,     t0,     1
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a4
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+
+    vld           vr8,    a2,     0x00
+    vld           vr9,    a2,     0x08
+    vld           vr10,   a2,     0x10
+    vld           vr11,   a2,     0x18
+    vld           vr12,   a2,     0x20
+    vld           vr13,   a2,     0x28
+    vld           vr14,   a2,     0x30
+    vld           vr15,   a2,     0x38
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vstelm.d      vr0,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr1,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr2,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr3,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr4,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr5,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr6,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr7,    a0,     0,     0
+endfunc
+
+/*
+ * void ff_put_h264_qpel8_mc00(uint8_t *dst, const uint8_t *src,
+ *                             ptrdiff_t stride)
+ */
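+/* mc00 is the full-pel position: a plain 8x8 block copy done with GPR loads/stores. */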
+function ff_put_h264_qpel8_mc00_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    slli.d        t2,     t0,     1
+    ld.d          t3,     a1,     0x0
+    ldx.d         t4,     a1,     a2
+    ldx.d         t5,     a1,     t0
+    ldx.d         t6,     a1,     t1
+    st.d          t3,     a0,     0x0
+    stx.d         t4,     a0,     a2
+    stx.d         t5,     a0,     t0
+    stx.d         t6,     a0,     t1
+
+    add.d         a1,     a1,     t2
+    add.d         a0,     a0,     t2
+
+    ld.d          t3,     a1,     0x0
+    ldx.d         t4,     a1,     a2
+    ldx.d         t5,     a1,     t0
+    ldx.d         t6,     a1,     t1
+    st.d          t3,     a0,     0x0
+    stx.d         t4,     a0,     a2
+    stx.d         t5,     a0,     t0
+    stx.d         t6,     a0,     t1
+endfunc
+
+/*
+ * void ff_avg_h264_qpel8_mc00(uint8_t *dst, const uint8_t *src,
+ *                             ptrdiff_t stride)
+ */
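+/* mc00 full-pel position of the avg MC: dst = avg(dst, src), 8 bytes per row. */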
+function ff_avg_h264_qpel8_mc00_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    slli.d        t2,     t0,     1
+    addi.d        t3,     a0,     0
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a2
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+
+    vld           vr8,    t3,     0
+    vldx          vr9,    t3,     a2
+    vldx          vr10,   t3,     t0
+    vldx          vr11,   t3,     t1
+    add.d         t3,     t3,     t2
+    vld           vr12,   t3,     0
+    vldx          vr13,   t3,     a2
+    vldx          vr14,   t3,     t0
+    vldx          vr15,   t3,     t1
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vstelm.d      vr0,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr1,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr2,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr3,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr4,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr5,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr6,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr7,    a0,     0,     0
+endfunc
+
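+/* dst = avg(avg(src, half), dst); the half buffer rows are packed 8 bytes apart. */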
+function avg_pixels8_l2_8_lsx
+    slli.d        t0,     a4,     1
+    add.d         t1,     t0,     a4
+    slli.d        t2,     t0,     1
+    addi.d        t3,     a0,     0
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr4,    a1,     0
+    vldx          vr5,    a1,     a4
+    vldx          vr6,    a1,     t0
+    vldx          vr7,    a1,     t1
+
+    vld           vr8,    a2,     0x00
+    vld           vr9,    a2,     0x08
+    vld           vr10,   a2,     0x10
+    vld           vr11,   a2,     0x18
+    vld           vr12,   a2,     0x20
+    vld           vr13,   a2,     0x28
+    vld           vr14,   a2,     0x30
+    vld           vr15,   a2,     0x38
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    slli.d        t0,     a3,     1
+    add.d         t1,     t0,     a3
+    slli.d        t2,     t0,     1
+    vld           vr8,    t3,     0
+    vldx          vr9,    t3,     a3
+    vldx          vr10,   t3,     t0
+    vldx          vr11,   t3,     t1
+    add.d         t3,     t3,     t2
+    vld           vr12,   t3,     0
+    vldx          vr13,   t3,     a3
+    vldx          vr14,   t3,     t0
+    vldx          vr15,   t3,     t1
+
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vavgr.bu      vr4,    vr12,   vr4
+    vavgr.bu      vr5,    vr13,   vr5
+    vavgr.bu      vr6,    vr14,   vr6
+    vavgr.bu      vr7,    vr15,   vr7
+
+    vstelm.d      vr0,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr1,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr2,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr3,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr4,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr5,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr6,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr7,    a0,     0,     0
+endfunc
+
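+/*
+ * Horizontal 6-tap lowpass as in put_h264_qpel8_h_lowpass_lsx above, with
+ * the filter output averaged into the existing dst rows.
+ */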
+function avg_h264_qpel8_h_lowpass_lsx
+    slli.d        t1,     a3,     1
+    add.d         t2,     t1,     a3
+    slli.d        t5,     a2,     1
+    add.d         t6,     t5,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+    addi.d        t4,     a0,     0    // t4 = dst
+
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a3
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    fld.d         f0,     t4,     0
+    fldx.d        f1,     t4,     a2
+    vilvl.d       vr0,    vr1,    vr0
+    vavgr.bu      vr13,   vr13,   vr0
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    fldx.d        f0,     t4,     t5
+    fldx.d        f1,     t4,     t6
+    vilvl.d       vr0,    vr1,    vr0
+    vavgr.bu      vr13,   vr13,   vr0
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    alsl.d        a1,     a3,     t0,    2
+    alsl.d        t4,     a2,     t4,    2
+
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a3
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    fld.d         f0,     t4,     0
+    fldx.d        f1,     t4,     a2
+    vilvl.d       vr0,    vr1,    vr0
+    vavgr.bu      vr13,   vr13,   vr0
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    LSX_QPEL8_H_LOWPASS vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    fldx.d        f0,     t4,     t5
+    fldx.d        f1,     t4,     t6
+    vilvl.d       vr0,    vr1,    vr0
+    vavgr.bu      vr13,   vr13,   vr0
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+endfunc
diff --git a/libavcodec/loongarch/h264qpel_init_loongarch.c b/libavcodec/loongarch/h264qpel_init_loongarch.c
index 969c9c376c..9d3a5cb164 100644
--- a/libavcodec/loongarch/h264qpel_init_loongarch.c
+++ b/libavcodec/loongarch/h264qpel_init_loongarch.c
@@ -19,7 +19,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "h264qpel_lasx.h"
+#include "h264qpel_loongarch.h"
 #include "libavutil/attributes.h"
 #include "libavutil/loongarch/cpu.h"
 #include "libavcodec/h264qpel.h"
@@ -27,6 +27,77 @@
 av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) {
+        if (8 == bit_depth) {
+            c->put_h264_qpel_pixels_tab[0][0]  = ff_put_h264_qpel16_mc00_lsx;
+            c->put_h264_qpel_pixels_tab[0][1]  = ff_put_h264_qpel16_mc10_lsx;
+            c->put_h264_qpel_pixels_tab[0][2]  = ff_put_h264_qpel16_mc20_lsx;
+            c->put_h264_qpel_pixels_tab[0][3]  = ff_put_h264_qpel16_mc30_lsx;
+            c->put_h264_qpel_pixels_tab[0][4]  = ff_put_h264_qpel16_mc01_lsx;
+            c->put_h264_qpel_pixels_tab[0][5]  = ff_put_h264_qpel16_mc11_lsx;
+            c->put_h264_qpel_pixels_tab[0][6]  = ff_put_h264_qpel16_mc21_lsx;
+            c->put_h264_qpel_pixels_tab[0][7]  = ff_put_h264_qpel16_mc31_lsx;
+            c->put_h264_qpel_pixels_tab[0][8]  = ff_put_h264_qpel16_mc02_lsx;
+            c->put_h264_qpel_pixels_tab[0][9]  = ff_put_h264_qpel16_mc12_lsx;
+            c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lsx;
+            c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_lsx;
+            c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_lsx;
+            c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_lsx;
+            c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_lsx;
+            c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_lsx;
+
+            c->avg_h264_qpel_pixels_tab[0][0]  = ff_avg_h264_qpel16_mc00_lsx;
+            c->avg_h264_qpel_pixels_tab[0][1]  = ff_avg_h264_qpel16_mc10_lsx;
+            c->avg_h264_qpel_pixels_tab[0][2]  = ff_avg_h264_qpel16_mc20_lsx;
+            c->avg_h264_qpel_pixels_tab[0][3]  = ff_avg_h264_qpel16_mc30_lsx;
+            c->avg_h264_qpel_pixels_tab[0][4]  = ff_avg_h264_qpel16_mc01_lsx;
+            c->avg_h264_qpel_pixels_tab[0][5]  = ff_avg_h264_qpel16_mc11_lsx;
+            c->avg_h264_qpel_pixels_tab[0][6]  = ff_avg_h264_qpel16_mc21_lsx;
+            c->avg_h264_qpel_pixels_tab[0][7]  = ff_avg_h264_qpel16_mc31_lsx;
+            c->avg_h264_qpel_pixels_tab[0][8]  = ff_avg_h264_qpel16_mc02_lsx;
+            c->avg_h264_qpel_pixels_tab[0][9]  = ff_avg_h264_qpel16_mc12_lsx;
+            c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_lsx;
+            c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_lsx;
+            c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_lsx;
+            c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_lsx;
+            c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_lsx;
+            c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_lsx;
+
+            c->put_h264_qpel_pixels_tab[1][0]  = ff_put_h264_qpel8_mc00_lsx;
+            c->put_h264_qpel_pixels_tab[1][1]  = ff_put_h264_qpel8_mc10_lsx;
+            c->put_h264_qpel_pixels_tab[1][2]  = ff_put_h264_qpel8_mc20_lsx;
+            c->put_h264_qpel_pixels_tab[1][3]  = ff_put_h264_qpel8_mc30_lsx;
+            c->put_h264_qpel_pixels_tab[1][4]  = ff_put_h264_qpel8_mc01_lsx;
+            c->put_h264_qpel_pixels_tab[1][5]  = ff_put_h264_qpel8_mc11_lsx;
+            c->put_h264_qpel_pixels_tab[1][6]  = ff_put_h264_qpel8_mc21_lsx;
+            c->put_h264_qpel_pixels_tab[1][7]  = ff_put_h264_qpel8_mc31_lsx;
+            c->put_h264_qpel_pixels_tab[1][8]  = ff_put_h264_qpel8_mc02_lsx;
+            c->put_h264_qpel_pixels_tab[1][9]  = ff_put_h264_qpel8_mc12_lsx;
+            c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_lsx;
+            c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_lsx;
+            c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_lsx;
+            c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_lsx;
+            c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_lsx;
+            c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_lsx;
+
+            c->avg_h264_qpel_pixels_tab[1][0]  = ff_avg_h264_qpel8_mc00_lsx;
+            c->avg_h264_qpel_pixels_tab[1][1]  = ff_avg_h264_qpel8_mc10_lsx;
+            c->avg_h264_qpel_pixels_tab[1][2]  = ff_avg_h264_qpel8_mc20_lsx;
+            c->avg_h264_qpel_pixels_tab[1][3]  = ff_avg_h264_qpel8_mc30_lsx;
+            c->avg_h264_qpel_pixels_tab[1][5]  = ff_avg_h264_qpel8_mc11_lsx;
+            c->avg_h264_qpel_pixels_tab[1][6]  = ff_avg_h264_qpel8_mc21_lsx;
+            c->avg_h264_qpel_pixels_tab[1][7]  = ff_avg_h264_qpel8_mc31_lsx;
+            c->avg_h264_qpel_pixels_tab[1][8]  = ff_avg_h264_qpel8_mc02_lsx;
+            c->avg_h264_qpel_pixels_tab[1][9]  = ff_avg_h264_qpel8_mc12_lsx;
+            c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_lsx;
+            c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_lsx;
+            c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_lsx;
+            c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_lsx;
+            c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lsx;
+        }
+    }
+#if HAVE_LASX
     if (have_lasx(cpu_flags)) {
         if (8 == bit_depth) {
             c->put_h264_qpel_pixels_tab[0][0]  = ff_put_h264_qpel16_mc00_lasx;
@@ -95,4 +166,5 @@ av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
             c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lasx;
         }
     }
+#endif
 }
diff --git a/libavcodec/loongarch/h264qpel_lasx.c b/libavcodec/loongarch/h264qpel_lasx.c
index 1c142e510e..519bb03fe6 100644
--- a/libavcodec/loongarch/h264qpel_lasx.c
+++ b/libavcodec/loongarch/h264qpel_lasx.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "h264qpel_lasx.h"
+#include "h264qpel_loongarch.h"
 #include "libavutil/loongarch/loongson_intrinsics.h"
 #include "libavutil/attributes.h"
 
@@ -418,157 +418,6 @@ avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
     );
 }
 
-/* avg_pixels8_8_lsx   : dst = avg(src, dst)
- * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
-                     ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
-    ptrdiff_t stride_2, stride_3, stride_4;
-    __asm__ volatile (
-    /* h0~h7 */
-    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-
-    "vld        $vr8,            %[half],        0x00         \n\t"
-    "vld        $vr9,            %[half],        0x08         \n\t"
-    "vld        $vr10,           %[half],        0x10         \n\t"
-    "vld        $vr11,           %[half],        0x18         \n\t"
-    "vld        $vr12,           %[half],        0x20         \n\t"
-    "vld        $vr13,           %[half],        0x28         \n\t"
-    "vld        $vr14,           %[half],        0x30         \n\t"
-    "vld        $vr15,           %[half],        0x38         \n\t"
-
-    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
-    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
-    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
-    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
-    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
-    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
-    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
-    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
-
-    "vstelm.d   $vr0,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr1,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr2,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr3,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr4,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr5,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr6,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr7,            %[dst],         0,  0        \n\t"
-    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
-      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
-      [stride_4]"=&r"(stride_4)
-    : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
-    : "memory"
-    );
-}
-
-/* avg_pixels8_8_lsx   : dst = avg(src, dst)
- * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
-                     ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
-    uint8_t *tmp = dst;
-    ptrdiff_t stride_2, stride_3, stride_4;
-    __asm__ volatile (
-    /* h0~h7 */
-    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-
-    "vld        $vr8,            %[half],        0x00         \n\t"
-    "vld        $vr9,            %[half],        0x08         \n\t"
-    "vld        $vr10,           %[half],        0x10         \n\t"
-    "vld        $vr11,           %[half],        0x18         \n\t"
-    "vld        $vr12,           %[half],        0x20         \n\t"
-    "vld        $vr13,           %[half],        0x28         \n\t"
-    "vld        $vr14,           %[half],        0x30         \n\t"
-    "vld        $vr15,           %[half],        0x38         \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "slli.d     %[stride_2],     %[dstStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[dstStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "vld        $vr8,            %[tmp],         0            \n\t"
-    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
-    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
-    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
-    "vld        $vr12,           %[tmp],         0            \n\t"
-    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
-    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "vstelm.d    $vr0,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr1,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr2,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr3,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr4,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr5,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr6,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr7,           %[dst],         0,  0        \n\t"
-    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
-      [src]"+&r"(src), [stride_2]"=&r"(stride_2),
-      [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
-    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
-    : "memory"
-    );
-}
-
 /* put_pixels16_8_lsx: dst = src */
 static av_always_inline void
 put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
@@ -729,254 +578,6 @@ avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
     );
 }
 
-/* avg_pixels16_8_lsx   : dst = avg(src, dst)
- * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
-                      ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
-    ptrdiff_t stride_2, stride_3, stride_4;
-    ptrdiff_t dstride_2, dstride_3, dstride_4;
-    __asm__ volatile (
-    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
-    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
-    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
-    /* h0~h7 */
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-
-    "vld        $vr8,            %[half],        0x00         \n\t"
-    "vld        $vr9,            %[half],        0x10         \n\t"
-    "vld        $vr10,           %[half],        0x20         \n\t"
-    "vld        $vr11,           %[half],        0x30         \n\t"
-    "vld        $vr12,           %[half],        0x40         \n\t"
-    "vld        $vr13,           %[half],        0x50         \n\t"
-    "vld        $vr14,           %[half],        0x60         \n\t"
-    "vld        $vr15,           %[half],        0x70         \n\t"
-
-    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
-    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
-    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
-    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
-    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
-    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
-    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
-    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
-
-    "vst        $vr0,            %[dst],         0            \n\t"
-    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-    "vst        $vr4,            %[dst],         0            \n\t"
-    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-
-    /* h8~h15 */
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-
-    "vld        $vr8,            %[half],        0x80         \n\t"
-    "vld        $vr9,            %[half],        0x90         \n\t"
-    "vld        $vr10,           %[half],        0xa0         \n\t"
-    "vld        $vr11,           %[half],        0xb0         \n\t"
-    "vld        $vr12,           %[half],        0xc0         \n\t"
-    "vld        $vr13,           %[half],        0xd0         \n\t"
-    "vld        $vr14,           %[half],        0xe0         \n\t"
-    "vld        $vr15,           %[half],        0xf0         \n\t"
-
-    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
-    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
-    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
-    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
-    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
-    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
-    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
-    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
-
-    "vst        $vr0,            %[dst],         0            \n\t"
-    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-    "vst        $vr4,            %[dst],         0            \n\t"
-    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
-    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
-      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
-      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
-      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
-    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
-    : "memory"
-    );
-}
-
-/* avg_pixels16_8_lsx    : dst = avg(src, dst)
- * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
-                      ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
-    uint8_t *tmp = dst;
-    ptrdiff_t stride_2, stride_3, stride_4;
-    ptrdiff_t dstride_2, dstride_3, dstride_4;
-    __asm__ volatile (
-    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
-    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
-    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
-    /* h0~h7 */
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-
-    "vld        $vr8,            %[half],        0x00         \n\t"
-    "vld        $vr9,            %[half],        0x10         \n\t"
-    "vld        $vr10,           %[half],        0x20         \n\t"
-    "vld        $vr11,           %[half],        0x30         \n\t"
-    "vld        $vr12,           %[half],        0x40         \n\t"
-    "vld        $vr13,           %[half],        0x50         \n\t"
-    "vld        $vr14,           %[half],        0x60         \n\t"
-    "vld        $vr15,           %[half],        0x70         \n\t"
-
-    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
-    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
-    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
-    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
-    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
-    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
-    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
-    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
-
-    "vld        $vr8,            %[tmp],         0            \n\t"
-    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
-    "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
-    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
-    "vld        $vr12,           %[tmp],         0            \n\t"
-    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
-    "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
-    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "vst        $vr0,            %[dst],         0            \n\t"
-    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-    "vst        $vr4,            %[dst],         0            \n\t"
-    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-
-    /* h8~h15    */
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-
-    "vld        $vr8,            %[half],        0x80         \n\t"
-    "vld        $vr9,            %[half],        0x90         \n\t"
-    "vld        $vr10,           %[half],        0xa0         \n\t"
-    "vld        $vr11,           %[half],        0xb0         \n\t"
-    "vld        $vr12,           %[half],        0xc0         \n\t"
-    "vld        $vr13,           %[half],        0xd0         \n\t"
-    "vld        $vr14,           %[half],        0xe0         \n\t"
-    "vld        $vr15,           %[half],        0xf0         \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "vld        $vr8,            %[tmp],         0            \n\t"
-    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
-    "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
-    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
-    "vld        $vr12,           %[tmp],         0            \n\t"
-    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
-    "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "vst        $vr0,            %[dst],         0            \n\t"
-    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-    "vst        $vr4,            %[dst],         0            \n\t"
-    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
-    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
-      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
-      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
-      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
-    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
-    : "memory"
-    );
-}
-
 #define QPEL8_H_LOWPASS(out_v)                                               \
     src00 = __lasx_xvld(src, - 2);                                           \
     src += srcStride;                                                        \
diff --git a/libavcodec/loongarch/h264qpel_lasx.h b/libavcodec/loongarch/h264qpel_lasx.h
deleted file mode 100644
index 32b6b50917..0000000000
--- a/libavcodec/loongarch/h264qpel_lasx.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H
-#define AVCODEC_LOONGARCH_H264QPEL_LASX_H
-
-#include <stdint.h>
-#include <stddef.h>
-#include "libavcodec/h264.h"
-
-void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
-                                   int alpha, int beta, int8_t *tc0);
-void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
-                                   int alpha, int beta, int8_t *tc0);
-void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-
-void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-#endif  // #ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H
diff --git a/libavcodec/loongarch/h264qpel_loongarch.h b/libavcodec/loongarch/h264qpel_loongarch.h
new file mode 100644
index 0000000000..68232730da
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel_loongarch.h
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavcodec/h264.h"
+#include "config.h"
+
+void put_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+                           ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+                          ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void avg_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src, int dstStride,
+                                  int srcStride);
+void avg_h264_qpel8_v_lowpass_lsx(uint8_t *dst, uint8_t *src, int dstStride,
+                                  int srcStride);
+void avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+                           ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+                          ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void ff_put_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+
+#if HAVE_LASX
+void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
+                                   int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
+                                   int alpha, int beta, int8_t *tc0);
+void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+#endif
+
+#endif  // #ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
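
The prototypes above fall into two groups: the lowpass and l2-average helpers (put/avg_h264_qpel8_{h,v,hv}_lowpass_lsx and put/avg_pixels{8,16}_l2_8_lsx) and the ff_{put,avg}_h264_qpel{16,8}_mcXY_lsx wrappers, where X and Y give the horizontal and vertical quarter-pel phase (0-3). The h264qpel_lsx.c that follows composes every fractional position from these helpers. For orientation only, a scalar sketch of the 8-bit horizontal lowpass the helpers vectorize, i.e. the standard H.264 6-tap filter with taps (1, -5, 20, 20, -5, 1); this sketch is not part of the patch:

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* dst[x] is the half-pel sample between src[x] and src[x + 1]; 8x8 block. */
    static void put_h264_qpel8_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t dstStride, ptrdiff_t srcStride)
    {
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++) {
                int v = (src[x - 2] + src[x + 3])
                      -  5 * (src[x - 1] + src[x + 2])
                      + 20 * (src[x]     + src[x + 1]);
                dst[x] = clip_uint8((v + 16) >> 5);  /* round and clip to 8 bits */
            }
            dst += dstStride;
            src += srcStride;
        }
    }

Quarter-pel positions such as mc21 or mc12 are then built by averaging two of these intermediate planes with put_pixels{8,16}_l2_8_lsx, which is exactly the pattern the wrappers below repeat.
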
diff --git a/libavcodec/loongarch/h264qpel_lsx.c b/libavcodec/loongarch/h264qpel_lsx.c
new file mode 100644
index 0000000000..99c523b439
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel_lsx.c
@@ -0,0 +1,488 @@
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264qpel_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "libavutil/attributes.h"
+
+static void put_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    put_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+static void put_h264_qpel16_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                          int dstStride, int srcStride)
+{
+    put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                           int dstStride, int srcStride)
+{
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                          int dstStride, int srcStride)
+{
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avg_h264_qpel16_v_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t half[256];
+
+    put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t half[256];
+
+    put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avg_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfH  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
-- 
2.20.1
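
How these LSX functions get wired up at run time is outside this diff: h264qpel_loongarch.h declares both the LSX prototypes and, under HAVE_LASX, the existing LASX ones, so a single init routine can register whichever variants the CPU flags report. A hypothetical sketch of that selection, for illustration only; the real registration lives in h264qpel_init_loongarch.c (not part of this hunk), and only the mc00 table slots are shown:

    #include "libavutil/attributes.h"
    #include "libavutil/cpu.h"
    #include "libavutil/loongarch/cpu.h"
    #include "libavcodec/h264qpel.h"
    #include "h264qpel_loongarch.h"

    av_cold void sketch_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
    {
        int cpu_flags = av_get_cpu_flags();

        if (have_lsx(cpu_flags) && bit_depth == 8) {
            /* row [0] holds the 16x16 functions; slot 0 is mc00 and the
             * remaining 15 slots take the other mcXY variants */
            c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lsx;
            c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_lsx;
            /* ... */
        }
    #if HAVE_LASX
        if (have_lasx(cpu_flags) && bit_depth == 8) {
            /* LASX variants overwrite the LSX ones where they exist */
            c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lasx;
            /* ... */
        }
    #endif
    }
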

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] [PATCH v1 5/6] swscale/la: Optimize the functions of the swscale series with lsx.
  2023-05-04  8:49 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
                   ` (3 preceding siblings ...)
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 4/6] avcodec/la: Add LSX optimization for h264 qpel Hao Chen
@ 2023-05-04  8:49 ` Hao Chen
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 6/6] swscale/la: Add following builtin optimized functions Hao Chen
  5 siblings, 0 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-04  8:49 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lu Wang

From: Lu Wang <wanglu@loongson.cn>

./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480
-pix_fmt bgra -y /dev/null -an
before: 91fps
after:  160fps
---
 libswscale/loongarch/Makefile                 |    5 +
 libswscale/loongarch/input.S                  |  285 +++
 libswscale/loongarch/output.S                 |  138 ++
 libswscale/loongarch/output_lasx.c            |    4 +-
 libswscale/loongarch/output_lsx.c             | 1828 ++++++++++++++++
 libswscale/loongarch/swscale.S                | 1868 +++++++++++++++++
 libswscale/loongarch/swscale_init_loongarch.c |   32 +-
 libswscale/loongarch/swscale_loongarch.h      |   43 +-
 libswscale/loongarch/swscale_lsx.c            |   57 +
 libswscale/utils.c                            |    3 +-
 10 files changed, 4256 insertions(+), 7 deletions(-)
 create mode 100644 libswscale/loongarch/input.S
 create mode 100644 libswscale/loongarch/output.S
 create mode 100644 libswscale/loongarch/output_lsx.c
 create mode 100644 libswscale/loongarch/swscale.S
 create mode 100644 libswscale/loongarch/swscale_lsx.c

diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index 8e665e826c..c0b6a449c0 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -4,3 +4,8 @@ LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
                                loongarch/yuv2rgb_lasx.o \
                                loongarch/rgb2rgb_lasx.o \
                                loongarch/output_lasx.o
+LSX-OBJS-$(CONFIG_SWSCALE)  += loongarch/swscale.o \
+                               loongarch/swscale_lsx.o \
+                               loongarch/input.o   \
+                               loongarch/output.o  \
+                               loongarch/output_lsx.o
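
The input.S added below implements libswscale's planar-RGB input converters with LSX, handling 8 pixels per iteration with 4-wide and scalar tails. For orientation, a scalar rendering of the per-pixel luma computation, with the plane order, bias and shift taken from the assembly (bias 524544 == 0x801 << 8, arithmetic shift by 9, 16-bit output via st.h); this sketch is not part of the patch:

    #include <stdint.h>

    /* Mirrors planar_rgb_to_y_lsx: src[0]/src[1]/src[2] are the G/B/R planes,
     * rgb2yuv[0..2] hold the ry/gy/by coefficients (offsets 0, 4, 8 in the
     * assembly), and each output sample is 16 bits wide. */
    static void planar_rgb_to_y_ref(uint8_t *_dst, const uint8_t *src[4],
                                    int width, const int32_t *rgb2yuv)
    {
        uint16_t *dst = (uint16_t *)_dst;
        int32_t ry = rgb2yuv[0], gy = rgb2yuv[1], by = rgb2yuv[2];

        for (int i = 0; i < width; i++) {
            int g = src[0][i], b = src[1][i], r = src[2][i];
            dst[i] = (ry * r + gy * g + by * b + 524544) >> 9;
        }
    }
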
diff --git a/libswscale/loongarch/input.S b/libswscale/loongarch/input.S
new file mode 100644
index 0000000000..d01f7384b1
--- /dev/null
+++ b/libswscale/loongarch/input.S
@@ -0,0 +1,285 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
+ *                          int width, int32_t *rgb2yuv)
+ */
+function planar_rgb_to_y_lsx
+    ld.d            a5,     a1,    0
+    ld.d            a6,     a1,    8
+    ld.d            a7,     a1,    16
+
+    ld.w            t1,     a3,    0     // ry
+    ld.w            t2,     a3,    4     // gy
+    ld.w            t3,     a3,    8     // by
+    li.w            t4,     9
+    li.w            t5,     524544
+    li.w            t7,     4
+    li.w            t8,     8
+    vldi            vr7,    0
+    vreplgr2vr.w    vr1,    t1
+    vreplgr2vr.w    vr2,    t2
+    vreplgr2vr.w    vr3,    t3
+    vreplgr2vr.w    vr4,    t4
+    vreplgr2vr.w    vr5,    t5
+    bge             a2,     t8,    .WIDTH8
+    bge             a2,     t7,    .WIDTH4
+    blt             zero,   a2,    .WIDTH
+    b               .END
+
+.WIDTH8:
+    vld             vr8,    a5,    0
+    vld             vr9,    a6,    0
+    vld             vr10,   a7,    0
+    vilvl.b         vr11,   vr7,   vr8
+    vilvl.b         vr12,   vr7,   vr9
+    vilvl.b         vr13,   vr7,   vr10
+    vilvl.h         vr14,   vr7,   vr11
+    vilvl.h         vr15,   vr7,   vr12
+    vilvl.h         vr16,   vr7,   vr13
+    vilvh.h         vr17,   vr7,   vr11
+    vilvh.h         vr18,   vr7,   vr12
+    vilvh.h         vr19,   vr7,   vr13
+    vmul.w          vr20,   vr1,   vr16
+    vmul.w          vr21,   vr1,   vr19
+    vmadd.w         vr20,   vr2,   vr14
+    vmadd.w         vr20,   vr3,   vr15
+    vmadd.w         vr21,   vr2,   vr17
+    vmadd.w         vr21,   vr3,   vr18
+    vadd.w          vr20,   vr20,  vr5
+    vadd.w          vr21,   vr21,  vr5
+    vsra.w          vr20,   vr20,  vr4
+    vsra.w          vr21,   vr21,  vr4
+    vpickev.h       vr20,   vr21,  vr20
+    vst             vr20,   a0,    0
+    addi.d          a2,     a2,    -8
+    addi.d          a5,     a5,    8
+    addi.d          a6,     a6,    8
+    addi.d          a7,     a7,    8
+    addi.d          a0,     a0,    16
+    bge             a2,     t8,    .WIDTH8
+    bge             a2,     t7,    .WIDTH4
+    blt             zero,   a2,    .WIDTH
+    b               .END
+
+.WIDTH4:
+    vld             vr8,    a5,    0
+    vld             vr9,    a6,    0
+    vld             vr10,   a7,    0
+    vilvl.b         vr11,   vr7,   vr8
+    vilvl.b         vr12,   vr7,   vr9
+    vilvl.b         vr13,   vr7,   vr10
+    vilvl.h         vr14,   vr7,   vr11
+    vilvl.h         vr15,   vr7,   vr12
+    vilvl.h         vr16,   vr7,   vr13
+    vmul.w          vr17,   vr1,   vr16
+    vmadd.w         vr17,   vr2,   vr14
+    vmadd.w         vr17,   vr3,   vr15
+    vadd.w          vr17,   vr17,  vr5
+    vsra.w          vr17,   vr17,  vr4
+    vpickev.h       vr17,   vr17,  vr17
+    vstelm.d        vr17,   a0,    0,    0
+    addi.d          a2,     a2,    -4
+    addi.d          a5,     a5,    4
+    addi.d          a6,     a6,    4
+    addi.d          a7,     a7,    4
+    addi.d          a0,     a0,    8
+    bge             a2,     t7,    .WIDTH4
+    blt             zero,   a2,    .WIDTH
+    b               .END
+
+.WIDTH:
+    ld.bu           t0,     a5,    0
+    ld.bu           t4,     a6,    0
+    ld.bu           t6,     a7,    0
+    mul.w           t8,     t6,    t1
+    mul.w           t7,     t0,    t2
+    add.w           t8,     t8,    t7
+    mul.w           t7,     t4,    t3
+    add.w           t8,     t8,    t7
+    add.w           t8,     t8,    t5
+    srai.w          t8,     t8,    9
+    st.h            t8,     a0,    0
+    addi.d          a2,     a2,    -1
+    addi.d          a5,     a5,    1
+    addi.d          a6,     a6,    1
+    addi.d          a7,     a7,    1
+    addi.d          a0,     a0,    2
+    blt             zero,   a2,    .WIDTH
+.END:
+endfunc
+
+/* void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+ *                           int width, int32_t *rgb2yuv)
+ */
+function planar_rgb_to_uv_lsx
+    addi.d          sp,     sp,    -24
+    st.d            s1,     sp,    0
+    st.d            s2,     sp,    8
+    st.d            s3,     sp,    16
+
+    ld.d            a5,     a2,    0
+    ld.d            a6,     a2,    8
+    ld.d            a7,     a2,    16
+    ld.w            t1,     a4,    12     // ru
+    ld.w            t2,     a4,    16     // gu
+    ld.w            t3,     a4,    20     // bu
+    ld.w            s1,     a4,    24     // rv
+    ld.w            s2,     a4,    28     // gv
+    ld.w            s3,     a4,    32     // bv
+    li.w            t4,     9
+    li.w            t5,     4194560
+    li.w            t7,     4
+    li.w            t8,     8
+    vldi            vr0,    0
+    vreplgr2vr.w    vr1,    t1
+    vreplgr2vr.w    vr2,    t2
+    vreplgr2vr.w    vr3,    t3
+    vreplgr2vr.w    vr4,    s1
+    vreplgr2vr.w    vr5,    s2
+    vreplgr2vr.w    vr6,    s3
+    vreplgr2vr.w    vr7,    t4
+    vreplgr2vr.w    vr8,    t5
+    bge             a3,     t8,    .LOOP_WIDTH8
+    bge             a3,     t7,    .LOOP_WIDTH4
+    blt             zero,   a3,    .LOOP_WIDTH
+    b               .LOOP_END
+
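+/* 8 U and 8 V outputs per iteration: zero-extend 8 bytes per plane, weight
+ * them with the U coefficients (vr1-vr3) and V coefficients (vr4-vr6), add
+ * the rounding constant (vr8), shift right by vr7 and pack to int16. */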
+.LOOP_WIDTH8:
+    vld             vr9,    a5,    0
+    vld             vr10,   a6,    0
+    vld             vr11,   a7,    0
+    vilvl.b         vr9,    vr0,   vr9
+    vilvl.b         vr10,   vr0,   vr10
+    vilvl.b         vr11,   vr0,   vr11
+    vilvl.h         vr12,   vr0,   vr9
+    vilvl.h         vr13,   vr0,   vr10
+    vilvl.h         vr14,   vr0,   vr11
+    vilvh.h         vr15,   vr0,   vr9
+    vilvh.h         vr16,   vr0,   vr10
+    vilvh.h         vr17,   vr0,   vr11
+    vmul.w          vr18,   vr1,   vr14
+    vmul.w          vr19,   vr1,   vr17
+    vmul.w          vr20,   vr4,   vr14
+    vmul.w          vr21,   vr4,   vr17
+    vmadd.w         vr18,   vr2,   vr12
+    vmadd.w         vr18,   vr3,   vr13
+    vmadd.w         vr19,   vr2,   vr15
+    vmadd.w         vr19,   vr3,   vr16
+    vmadd.w         vr20,   vr5,   vr12
+    vmadd.w         vr20,   vr6,   vr13
+    vmadd.w         vr21,   vr5,   vr15
+    vmadd.w         vr21,   vr6,   vr16
+    vadd.w          vr18,   vr18,  vr8
+    vadd.w          vr19,   vr19,  vr8
+    vadd.w          vr20,   vr20,  vr8
+    vadd.w          vr21,   vr21,  vr8
+    vsra.w          vr18,   vr18,  vr7
+    vsra.w          vr19,   vr19,  vr7
+    vsra.w          vr20,   vr20,  vr7
+    vsra.w          vr21,   vr21,  vr7
+    vpickev.h       vr18,   vr19,  vr18
+    vpickev.h       vr20,   vr21,  vr20
+    vst             vr18,   a0,    0
+    vst             vr20,   a1,    0
+    addi.d          a3,     a3,    -8
+    addi.d          a5,     a5,    8
+    addi.d          a6,     a6,    8
+    addi.d          a7,     a7,    8
+    addi.d          a0,     a0,    16
+    addi.d          a1,     a1,    16
+    bge             a3,     t8,    .LOOP_WIDTH8
+    bge             a3,     t7,    .LOOP_WIDTH4
+    blt             zero,   a3,    .LOOP_WIDTH
+    b               .LOOP_END
+
+.LOOP_WIDTH4:
+    vld             vr9,    a5,    0
+    vld             vr10,   a6,    0
+    vld             vr11,   a7,    0
+    vilvl.b         vr9,    vr0,   vr9
+    vilvl.b         vr10,   vr0,   vr10
+    vilvl.b         vr11,   vr0,   vr11
+    vilvl.h         vr12,   vr0,   vr9
+    vilvl.h         vr13,   vr0,   vr10
+    vilvl.h         vr14,   vr0,   vr11
+    vmul.w          vr18,   vr1,   vr14
+    vmul.w          vr19,   vr4,   vr14
+    vmadd.w         vr18,   vr2,   vr12
+    vmadd.w         vr18,   vr3,   vr13
+    vmadd.w         vr19,   vr5,   vr12
+    vmadd.w         vr19,   vr6,   vr13
+    vadd.w          vr18,   vr18,  vr8
+    vadd.w          vr19,   vr19,  vr8
+    vsra.w          vr18,   vr18,  vr7
+    vsra.w          vr19,   vr19,  vr7
+    vpickev.h       vr18,   vr18,  vr18
+    vpickev.h       vr19,   vr19,  vr19
+    vstelm.d        vr18,   a0,    0,    0
+    vstelm.d        vr19,   a1,    0,    0
+    addi.d          a3,     a3,    -4
+    addi.d          a5,     a5,    4
+    addi.d          a6,     a6,    4
+    addi.d          a7,     a7,    4
+    addi.d          a0,     a0,    8
+    addi.d          a1,     a1,    8
+    bge             a3,     t7,    .LOOP_WIDTH4
+    blt             zero,   a3,    .LOOP_WIDTH
+    b               .LOOP_END
+
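+/* Scalar tail: one U and one V per pixel, each a weighted sum of the three
+ * planes plus the rounding constant 0x400100, shifted right by 9. */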
+.LOOP_WIDTH:
+    ld.bu           t0,     a5,    0
+    ld.bu           t4,     a6,    0
+    ld.bu           t6,     a7,    0
+    mul.w           t8,     t6,    t1
+    mul.w           t7,     t0,    t2
+    add.w           t8,     t8,    t7
+    mul.w           t7,     t4,    t3
+    add.w           t8,     t8,    t7
+    add.w           t8,     t8,    t5
+    srai.w          t8,     t8,    9
+    st.h            t8,     a0,    0
+    mul.w           t8,     t6,    s1
+    mul.w           t7,     t0,    s2
+    add.w           t8,     t8,    t7
+    mul.w           t7,     t4,    s3
+    add.w           t8,     t8,    t7
+    add.w           t8,     t8,    t5
+    srai.w          t8,     t8,    9
+    st.h            t8,     a1,    0
+    addi.d          a3,     a3,    -1
+    addi.d          a5,     a5,    1
+    addi.d          a6,     a6,    1
+    addi.d          a7,     a7,    1
+    addi.d          a0,     a0,    2
+    addi.d          a1,     a1,    2
+    blt             zero,   a3,    .LOOP_WIDTH
+
+.LOOP_END:
+    ld.d            s1,     sp,    0
+    ld.d            s2,     sp,    8
+    ld.d            s3,     sp,    16
+    addi.d          sp,     sp,    24
+endfunc
diff --git a/libswscale/loongarch/output.S b/libswscale/loongarch/output.S
new file mode 100644
index 0000000000..b44bac502a
--- /dev/null
+++ b/libswscale/loongarch/output.S
@@ -0,0 +1,138 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+ *                                 const int16_t **src, uint8_t *dest, int dstW,
+ *                                 const uint8_t *dither, int offset)
+ */
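+/* a0: filter, a1: filterSize, a2: src, a3: dest, a4: dstW, a5: dither,
+ * a6: offset.  vr12/vr13 hold the dither values for even/odd output
+ * positions, taken from dither[(offset + i) & 7] and widened to 32 bits;
+ * they seed the accumulators below. */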
+function ff_yuv2planeX_8_lsx
+    addi.w          t1,     a6,     1
+    addi.w          t2,     a6,     2
+    addi.w          t3,     a6,     3
+    addi.w          t4,     a6,     4
+    addi.w          t5,     a6,     5
+    addi.w          t6,     a6,     6
+    addi.w          t7,     a6,     7
+    andi            t0,     a6,     7
+    andi            t1,     t1,     7
+    andi            t2,     t2,     7
+    andi            t3,     t3,     7
+    andi            t4,     t4,     7
+    andi            t5,     t5,     7
+    andi            t6,     t6,     7
+    andi            t7,     t7,     7
+    ldx.bu          t0,     a5,     t0
+    ldx.bu          t1,     a5,     t1
+    ldx.bu          t2,     a5,     t2
+    ldx.bu          t3,     a5,     t3
+    ldx.bu          t4,     a5,     t4
+    ldx.bu          t5,     a5,     t5
+    ldx.bu          t6,     a5,     t6
+    ldx.bu          t7,     a5,     t7
+    vreplgr2vr.w    vr0,    t0
+    vreplgr2vr.w    vr1,    t1
+    vreplgr2vr.w    vr2,    t2
+    vreplgr2vr.w    vr3,    t3
+    vreplgr2vr.w    vr4,    t4
+    vreplgr2vr.w    vr5,    t5
+    vreplgr2vr.w    vr6,    t6
+    vreplgr2vr.w    vr7,    t7
+    vilvl.w         vr0,    vr2,    vr0
+    vilvl.w         vr4,    vr6,    vr4
+    vilvl.w         vr1,    vr3,    vr1
+    vilvl.w         vr5,    vr7,    vr5
+    vilvl.d         vr12,   vr4,    vr0
+    vilvl.d         vr13,   vr5,    vr1
+    li.w            t5,     0
+    li.w            t8,     8
+    bge             a4,     t8,     .WIDTH8
+    blt             zero,   a4,     .WIDTH
+    b               .END
+
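+/* 8 output pixels per iteration: the accumulators start at dither << 12,
+ * .FILTERSIZE8 adds filter[j] * src[j][x] into even/odd 32-bit lanes, then
+ * the sums are shifted right by 19, clamped to 0..255 and stored as 8 bytes. */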
+.WIDTH8:
+    li.d            t1,     0
+    li.d            t4,     0
+    vslli.w         vr2,    vr12,   12
+    vslli.w         vr3,    vr13,   12
+    move            t3,     a0
+
+.FILTERSIZE8:
+    ldx.d           t2,     a2,     t1
+    vldx            vr4,    t2,     t5
+    vldrepl.h       vr5,    t3,     0
+    vmaddwev.w.h    vr2,    vr4,    vr5
+    vmaddwod.w.h    vr3,    vr4,    vr5
+    addi.d          t1,     t1,     8
+    addi.d          t3,     t3,     2
+    addi.d          t4,     t4,     1
+    blt             t4,     a1,     .FILTERSIZE8
+    vsrai.w         vr2,    vr2,    19
+    vsrai.w         vr3,    vr3,    19
+    vclip255.w      vr2,    vr2
+    vclip255.w      vr3,    vr3
+    vpickev.h       vr2,    vr3,    vr2
+    vpickev.b       vr2,    vr2,    vr2
+    vbsrl.v         vr3,    vr2,    4
+    vilvl.b         vr2,    vr3,    vr2
+    fst.d           f2,     a3,     0
+    addi.d          t5,     t5,     16
+    addi.d          a4,     a4,     -8
+    addi.d          a3,     a3,     8
+    bge             a4,     t8,     .WIDTH8
+    blt             zero,   a4,     .WIDTH
+    b               .END
+
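+/* Tail for the remaining (dstW % 8) pixels: same computation as above,
+ * with .DEST storing the surviving bytes one at a time. */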
+.WIDTH:
+    li.d            t1,     0
+    li.d            t4,     0
+    vslli.w         vr2,    vr12,   12
+    vslli.w         vr3,    vr13,   12
+.FILTERSIZE:
+    ldx.d           t2,     a2,     t1
+    vldx            vr4,    t2,     t5
+    vldrepl.h       vr5,    a0,     0
+    vmaddwev.w.h    vr2,    vr4,    vr5
+    vmaddwod.w.h    vr3,    vr4,    vr5
+    addi.d          t1,     t1,     8
+    addi.d          a0,     a0,     2
+    addi.d          t4,     t4,     1
+    blt             t4,     a1,     .FILTERSIZE
+    vsrai.w         vr2,    vr2,    19
+    vsrai.w         vr3,    vr3,    19
+    vclip255.w      vr2,    vr2
+    vclip255.w      vr3,    vr3
+    vpickev.h       vr2,    vr3,    vr2
+    vpickev.b       vr2,    vr2,    vr2
+    vbsrl.v         vr3,    vr2,    4
+    vilvl.b         vr2,    vr3,    vr2
+
+.DEST:
+    vstelm.b        vr2,    a3,     0,    0
+    vbsrl.v         vr2,    vr2,    1
+    addi.d          a4,     a4,     -1
+    addi.d          a3,     a3,     1
+    blt             zero,   a4,     .DEST
+.END:
+endfunc
diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
index 36a4c4503b..277d7063e6 100644
--- a/libswscale/loongarch/output_lasx.c
+++ b/libswscale/loongarch/output_lasx.c
@@ -1773,11 +1773,9 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full,  AV_PIX_FMT_BGR4_BYTE, 0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full,  AV_PIX_FMT_RGB4_BYTE, 0)
 YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)
-#undef yuvTorgb
-#undef yuvTorgb_setup
 
 
-av_cold void ff_sws_init_output_loongarch(SwsContext *c)
+av_cold void ff_sws_init_output_lasx(SwsContext *c)
 {
 
     if(c->flags & SWS_FULL_CHR_H_INT) {
diff --git a/libswscale/loongarch/output_lsx.c b/libswscale/loongarch/output_lsx.c
new file mode 100644
index 0000000000..768cc3abc6
--- /dev/null
+++ b/libswscale/loongarch/output_lsx.c
@@ -0,0 +1,1828 @@
+/*
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+
+/* Copied from libswscale/output.c */
+static av_always_inline void
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
+              unsigned A1, unsigned A2,
+              const void *_r, const void *_g, const void *_b, int y,
+              enum AVPixelFormat target, int hasAlpha)
+{
+    if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
+        target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
+        uint32_t *dest = (uint32_t *) _dest;
+        const uint32_t *r = (const uint32_t *) _r;
+        const uint32_t *g = (const uint32_t *) _g;
+        const uint32_t *b = (const uint32_t *) _b;
+
+#if CONFIG_SMALL
+        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#else
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
+        int sh = (target == AV_PIX_FMT_RGB32_1 ||
+                  target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
+        av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
+#endif
+        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#endif
+    } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
+        uint8_t *dest = (uint8_t *) _dest;
+        const uint8_t *r = (const uint8_t *) _r;
+        const uint8_t *g = (const uint8_t *) _g;
+        const uint8_t *b = (const uint8_t *) _b;
+
+#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
+#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
+
+        dest[i * 6 + 0] = r_b[Y1];
+        dest[i * 6 + 1] =   g[Y1];
+        dest[i * 6 + 2] = b_r[Y1];
+        dest[i * 6 + 3] = r_b[Y2];
+        dest[i * 6 + 4] =   g[Y2];
+        dest[i * 6 + 5] = b_r[Y2];
+#undef r_b
+#undef b_r
+    } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
+               target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
+               target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
+        uint16_t *dest = (uint16_t *) _dest;
+        const uint16_t *r = (const uint16_t *) _r;
+        const uint16_t *g = (const uint16_t *) _g;
+        const uint16_t *b = (const uint16_t *) _b;
+        int dr1, dg1, db1, dr2, dg2, db2;
+
+        if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
+            dr1 = ff_dither_2x2_8[ y & 1     ][0];
+            dg1 = ff_dither_2x2_4[ y & 1     ][0];
+            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+            dr2 = ff_dither_2x2_8[ y & 1     ][1];
+            dg2 = ff_dither_2x2_4[ y & 1     ][1];
+            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+        } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
+            dr1 = ff_dither_2x2_8[ y & 1     ][0];
+            dg1 = ff_dither_2x2_8[ y & 1     ][1];
+            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+            dr2 = ff_dither_2x2_8[ y & 1     ][1];
+            dg2 = ff_dither_2x2_8[ y & 1     ][0];
+            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+        } else {
+            dr1 = ff_dither_4x4_16[ y & 3     ][0];
+            dg1 = ff_dither_4x4_16[ y & 3     ][1];
+            db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
+            dr2 = ff_dither_4x4_16[ y & 3     ][1];
+            dg2 = ff_dither_4x4_16[ y & 3     ][0];
+            db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
+        }
+
+        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+    } else { /* 8/4 bits */
+        uint8_t *dest = (uint8_t *) _dest;
+        const uint8_t *r = (const uint8_t *) _r;
+        const uint8_t *g = (const uint8_t *) _g;
+        const uint8_t *b = (const uint8_t *) _b;
+        int dr1, dg1, db1, dr2, dg2, db2;
+
+        if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
+            const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
+            const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
+            dr1 = dg1 = d32[(i * 2 + 0) & 7];
+            db1 =       d64[(i * 2 + 0) & 7];
+            dr2 = dg2 = d32[(i * 2 + 1) & 7];
+            db2 =       d64[(i * 2 + 1) & 7];
+        } else {
+            const uint8_t * const d64  = ff_dither_8x8_73 [y & 7];
+            const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
+            dr1 = db1 = d128[(i * 2 + 0) & 7];
+            dg1 =        d64[(i * 2 + 0) & 7];
+            dr2 = db2 = d128[(i * 2 + 1) & 7];
+            dg2 =        d64[(i * 2 + 1) & 7];
+        }
+
+        if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
+            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
+                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
+        } else {
+            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+        }
+    }
+}
+
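+/* Extract Y1/Y2 from lanes t1/t2 and U/V from lanes t3/t4, look up the RGB
+ * tables and emit one pixel pair through yuv2rgb_write(). */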
+#define WRITE_YUV2RGB_LSX(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
+{                                                                       \
+    Y1 = __lsx_vpickve2gr_w(vec_y1, t1);                                \
+    Y2 = __lsx_vpickve2gr_w(vec_y2, t2);                                \
+    U  = __lsx_vpickve2gr_w(vec_u, t3);                                 \
+    V  = __lsx_vpickve2gr_w(vec_v, t4);                                 \
+    r  =  c->table_rV[V];                                               \
+    g  = (c->table_gU[U] + c->table_gV[V]);                             \
+    b  =  c->table_bU[U];                                               \
+    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                            \
+                  r, g, b, y, target, 0);                               \
+    count++;                                                            \
+}
+
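+/* Vertical filtering + packed RGB output. Each main-loop iteration filters
+ * 32 luma and 16+16 chroma samples in even/odd 32-bit lanes, shifts by 19,
+ * adds YUVRGB_TABLE_HEADROOM and emits 32 pixels; 16-, 8-, 4- and 2-pixel
+ * blocks plus a scalar loop handle the remainder. */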
+static void
+yuv2rgb_X_template_lsx(SwsContext *c, const int16_t *lumFilter,
+                       const int16_t **lumSrc, int lumFilterSize,
+                       const int16_t *chrFilter, const int16_t **chrUSrc,
+                       const int16_t **chrVSrc, int chrFilterSize,
+                       const int16_t **alpSrc, uint8_t *dest, int dstW,
+                       int y, enum AVPixelFormat target, int hasAlpha)
+{
+    int i, j;
+    int count = 0;
+    int t     = 1 << 18;
+    int len   = dstW >> 5;
+    int res   = dstW & 31;
+    int len_count = (dstW + 1) >> 1;
+    const void *r, *g, *b;
+    int head = YUVRGB_TABLE_HEADROOM;
+    __m128i headroom  = __lsx_vreplgr2vr_w(head);
+
+    for (i = 0; i < len; i++) {
+        int Y1, Y2, U, V, count_lum = count << 1;
+        __m128i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
+        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2, yh_ev1, yh_ev2, yh_od1, yh_od2;
+        __m128i u_ev1, u_ev2, u_od1, u_od2, v_ev1, v_ev2, v_od1, v_od2, temp;
+
+        yl_ev  = __lsx_vldrepl_w(&t, 0);
+        yl_ev1 = yl_ev;
+        yl_od1 = yl_ev;
+        yh_ev1 = yl_ev;
+        yh_od1 = yl_ev;
+        u_ev1  = yl_ev;
+        v_ev1  = yl_ev;
+        u_od1  = yl_ev;
+        v_od1  = yl_ev;
+        yl_ev2 = yl_ev;
+        yl_od2 = yl_ev;
+        yh_ev2 = yl_ev;
+        yh_od2 = yl_ev;
+        u_ev2  = yl_ev;
+        v_ev2  = yl_ev;
+        u_od2  = yl_ev;
+        v_od2  = yl_ev;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
+            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
+                      16, l_src1, l_src2);
+            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 32, lumSrc[j] + count_lum,
+                      48, l_src3, l_src4);
+            yl_ev1  = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
+            yl_od1  = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
+            yh_ev1  = __lsx_vmaddwev_w_h(yh_ev1, temp, l_src3);
+            yh_od1  = __lsx_vmaddwod_w_h(yh_od1, temp, l_src3);
+            yl_ev2  = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
+            yl_od2  = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
+            yh_ev2  = __lsx_vmaddwev_w_h(yh_ev2, temp, l_src4);
+            yh_od2  = __lsx_vmaddwod_w_h(yh_od2, temp, l_src4);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+                      u_src1, v_src1);
+            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 16, chrVSrc[j] + count, 16,
+                      u_src2, v_src2);
+            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
+            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
+            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
+            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
+            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
+            u_ev2 = __lsx_vmaddwev_w_h(u_ev2, temp, u_src2);
+            u_od2 = __lsx_vmaddwod_w_h(u_od2, temp, u_src2);
+            v_ev2 = __lsx_vmaddwev_w_h(v_ev2, temp, v_src2);
+            v_od2 = __lsx_vmaddwod_w_h(v_od2, temp, v_src2);
+        }
+        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
+        yh_ev1 = __lsx_vsrai_w(yh_ev1, 19);
+        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
+        yh_od1 = __lsx_vsrai_w(yh_od1, 19);
+        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
+        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
+        u_od1  = __lsx_vsrai_w(u_od1, 19);
+        v_od1  = __lsx_vsrai_w(v_od1, 19);
+        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
+        yh_ev2 = __lsx_vsrai_w(yh_ev2, 19);
+        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
+        yh_od2 = __lsx_vsrai_w(yh_od2, 19);
+        u_ev2  = __lsx_vsrai_w(u_ev2, 19);
+        v_ev2  = __lsx_vsrai_w(v_ev2, 19);
+        u_od2  = __lsx_vsrai_w(u_od2, 19);
+        v_od2  = __lsx_vsrai_w(v_od2, 19);
+        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
+        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
+        u_od1  = __lsx_vadd_w(u_od1, headroom);
+        v_od1  = __lsx_vadd_w(v_od1, headroom);
+        u_ev2  = __lsx_vadd_w(u_ev2, headroom);
+        v_ev2  = __lsx_vadd_w(v_ev2, headroom);
+        u_od2  = __lsx_vadd_w(u_od2, headroom);
+        v_od2  = __lsx_vadd_w(v_od2, headroom);
+
+        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 0, 0, 0, 0);
+        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 1, 1, 0, 0);
+        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 2, 2, 1, 1);
+        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 3, 3, 1, 1);
+        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 0, 0, 2, 2);
+        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 1, 1, 2, 2);
+        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 2, 2, 3, 3);
+        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 3, 3, 3, 3);
+        WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_ev2, v_ev2, 0, 0, 0, 0);
+        WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_od2, v_od2, 1, 1, 0, 0);
+        WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_ev2, v_ev2, 2, 2, 1, 1);
+        WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_od2, v_od2, 3, 3, 1, 1);
+        WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_ev2, v_ev2, 0, 0, 2, 2);
+        WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_od2, v_od2, 1, 1, 2, 2);
+        WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_ev2, v_ev2, 2, 2, 3, 3);
+        WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_od2, v_od2, 3, 3, 3, 3);
+    }
+
+    if (res >= 16) {
+        int Y1, Y2, U, V, count_lum = count << 1;
+        __m128i l_src1, l_src2, u_src1, v_src1;
+        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2;
+        __m128i u_ev1, u_od1, v_ev1, v_od1, temp;
+
+        yl_ev  = __lsx_vldrepl_w(&t, 0);
+        yl_ev1 = yl_ev;
+        yl_od1 = yl_ev;
+        u_ev1  = yl_ev;
+        v_ev1  = yl_ev;
+        u_od1  = yl_ev;
+        v_od1  = yl_ev;
+        yl_ev2 = yl_ev;
+        yl_od2 = yl_ev;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
+            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
+                      16, l_src1, l_src2);
+            yl_ev1  = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
+            yl_od1  = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
+            yl_ev2  = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
+            yl_od2  = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+                      u_src1, v_src1);
+            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
+            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
+            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
+            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
+            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
+        }
+        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
+        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
+        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
+        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
+        u_od1  = __lsx_vsrai_w(u_od1, 19);
+        v_od1  = __lsx_vsrai_w(v_od1, 19);
+        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
+        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
+        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
+        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
+        u_od1  = __lsx_vadd_w(u_od1, headroom);
+        v_od1  = __lsx_vadd_w(v_od1, headroom);
+
+        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 0, 0, 0, 0);
+        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 1, 1, 0, 0);
+        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 2, 2, 1, 1);
+        WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 3, 3, 1, 1);
+        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 0, 0, 2, 2);
+        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 1, 1, 2, 2);
+        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 2, 2, 3, 3);
+        WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 3, 3, 3, 3);
+        res -= 16;
+    }
+
+    if (res >= 8) {
+        int Y1, Y2, U, V, count_lum = count << 1;
+        __m128i l_src1, u_src, v_src;
+        __m128i yl_ev, yl_od;
+        __m128i u_ev, u_od, v_ev, v_od, temp;
+
+        yl_ev = __lsx_vldrepl_w(&t, 0);
+        yl_od = yl_ev;
+        u_ev  = yl_ev;
+        v_ev  = yl_ev;
+        u_od  = yl_ev;
+        v_od  = yl_ev;
+        for (j = 0; j < lumFilterSize; j++) {
+            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
+            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+                      u_src, v_src);
+            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
+            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+        }
+        yl_ev = __lsx_vsrai_w(yl_ev, 19);
+        yl_od = __lsx_vsrai_w(yl_od, 19);
+        u_ev  = __lsx_vsrai_w(u_ev, 19);
+        v_ev  = __lsx_vsrai_w(v_ev, 19);
+        u_od  = __lsx_vsrai_w(u_od, 19);
+        v_od  = __lsx_vsrai_w(v_od, 19);
+        u_ev  = __lsx_vadd_w(u_ev, headroom);
+        v_ev  = __lsx_vadd_w(v_ev, headroom);
+        u_od  = __lsx_vadd_w(u_od, headroom);
+        v_od  = __lsx_vadd_w(v_od, headroom);
+        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
+        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
+        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
+        res -= 8;
+    }
+
+    if (res >= 4) {
+        int Y1, Y2, U, V, count_lum = count << 1;
+        __m128i l_src1, u_src, v_src;
+        __m128i yl_ev, yl_od;
+        __m128i u_ev, u_od, v_ev, v_od, temp;
+
+        yl_ev = __lsx_vldrepl_w(&t, 0);
+        yl_od = yl_ev;
+        u_ev  = yl_ev;
+        v_ev  = yl_ev;
+        u_od  = yl_ev;
+        v_od  = yl_ev;
+        for (j = 0; j < lumFilterSize; j++) {
+            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
+            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+                      u_src, v_src);
+            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
+            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+        }
+        yl_ev = __lsx_vsrai_w(yl_ev, 19);
+        yl_od = __lsx_vsrai_w(yl_od, 19);
+        u_ev  = __lsx_vsrai_w(u_ev, 19);
+        v_ev  = __lsx_vsrai_w(v_ev, 19);
+        u_od  = __lsx_vsrai_w(u_od, 19);
+        v_od  = __lsx_vsrai_w(v_od, 19);
+        u_ev  = __lsx_vadd_w(u_ev, headroom);
+        v_ev  = __lsx_vadd_w(v_ev, headroom);
+        u_od  = __lsx_vadd_w(u_od, headroom);
+        v_od  = __lsx_vadd_w(v_od, headroom);
+        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
+        res -= 4;
+    }
+
+    if (res >= 2) {
+        int Y1, Y2, U, V, count_lum = count << 1;
+        __m128i l_src1, u_src, v_src;
+        __m128i yl_ev, yl_od;
+        __m128i u_ev, u_od, v_ev, v_od, temp;
+
+        yl_ev = __lsx_vldrepl_w(&t, 0);
+        yl_od = yl_ev;
+        u_ev  = yl_ev;
+        v_ev  = yl_ev;
+        u_od  = yl_ev;
+        v_od  = yl_ev;
+        for (j = 0; j < lumFilterSize; j++) {
+            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
+            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+                      u_src, v_src);
+            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
+            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+        }
+        yl_ev = __lsx_vsrai_w(yl_ev, 19);
+        yl_od = __lsx_vsrai_w(yl_od, 19);
+        u_ev  = __lsx_vsrai_w(u_ev, 19);
+        v_ev  = __lsx_vsrai_w(v_ev, 19);
+        u_od  = __lsx_vsrai_w(u_od, 19);
+        v_od  = __lsx_vsrai_w(v_od, 19);
+        u_ev  = __lsx_vadd_w(u_ev, headroom);
+        v_ev  = __lsx_vadd_w(v_ev, headroom);
+        u_od  = __lsx_vadd_w(u_od, headroom);
+        v_od  = __lsx_vadd_w(v_od, headroom);
+        WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+        res -= 2;
+    }
+
+    for (; count < len_count; count++) {
+        int Y1 = 1 << 18;
+        int Y2 = Y1;
+        int U  = Y1;
+        int V  = Y1;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y1 += lumSrc[j][count * 2]     * lumFilter[j];
+            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            U += chrUSrc[j][count] * chrFilter[j];
+            V += chrVSrc[j][count] * chrFilter[j];
+        }
+        Y1 >>= 19;
+        Y2 >>= 19;
+        U  >>= 19;
+        V  >>= 19;
+        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+             c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+                      r, g, b, y, target, 0);
+    }
+}
+
+static void
+yuv2rgb_2_template_lsx(SwsContext *c, const int16_t *buf[2],
+                       const int16_t *ubuf[2], const int16_t *vbuf[2],
+                       const int16_t *abuf[2], uint8_t *dest, int dstW,
+                       int yalpha, int uvalpha, int y,
+                       enum AVPixelFormat target, int hasAlpha)
+{
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+    int yalpha1   = 4096 - yalpha;
+    int uvalpha1  = 4096 - uvalpha;
+    int i, count  = 0;
+    int len       = dstW - 7;
+    int len_count = (dstW + 1) >> 1;
+    const void *r, *g, *b;
+    int head  = YUVRGB_TABLE_HEADROOM;
+    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
+    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
+    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
+    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
+    __m128i headroom   = __lsx_vreplgr2vr_w(head);
+    __m128i zero       = __lsx_vldi(0);
+
+    for (i = 0; i < len; i += 8) {
+        int Y1, Y2, U, V;
+        int i_dex = i << 1;
+        int c_dex = count << 1;
+        __m128i y0_h, y0_l, y0, u0, v0;
+        __m128i y1_h, y1_l, y1, u1, v1;
+        __m128i y_l, y_h, u, v;
+
+        DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+                  buf1, i_dex, y0, u0, v0, y1);
+        DUP2_ARG2(__lsx_vldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
+        DUP2_ARG2(__lsx_vsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
+        DUP2_ARG1(__lsx_vexth_w_h, y0, y1, y0_h, y1_h);
+        DUP4_ARG2(__lsx_vilvl_h, zero, u0, zero, u1, zero, v0, zero, v1,
+                  u0, u1, v0, v1);
+        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
+        u0   = __lsx_vmul_w(u0, v_uvalpha1);
+        v0   = __lsx_vmul_w(v0, v_uvalpha1);
+        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
+        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
+        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
+        y_l  = __lsx_vsrai_w(y_l, 19);
+        y_h  = __lsx_vsrai_w(y_h, 19);
+        u    = __lsx_vsrai_w(u, 19);
+        v    = __lsx_vsrai_w(v, 19);
+        u    = __lsx_vadd_w(u, headroom);
+        v    = __lsx_vadd_w(v, headroom);
+        WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+        WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+        WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 0, 1, 2, 2);
+        WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 2, 3, 3, 3);
+    }
+    if (dstW - i >= 4) {
+        int Y1, Y2, U, V;
+        int i_dex = i << 1;
+        __m128i y0_l, y0, u0, v0;
+        __m128i y1_l, y1, u1, v1;
+        __m128i y_l, u, v;
+
+        y0   = __lsx_vldx(buf0, i_dex);
+        u0   = __lsx_vldrepl_d((ubuf0 + count), 0);
+        v0   = __lsx_vldrepl_d((vbuf0 + count), 0);
+        y1   = __lsx_vldx(buf1, i_dex);
+        u1   = __lsx_vldrepl_d((ubuf1 + count), 0);
+        v1   = __lsx_vldrepl_d((vbuf1 + count), 0);
+        DUP2_ARG2(__lsx_vilvl_h, zero, y0, zero, y1, y0_l, y1_l);
+        DUP4_ARG2(__lsx_vilvl_h, zero, u0, zero, u1, zero, v0, zero, v1,
+                  u0, u1, v0, v1);
+        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+        u0   = __lsx_vmul_w(u0, v_uvalpha1);
+        v0   = __lsx_vmul_w(v0, v_uvalpha1);
+        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
+        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
+        y_l  = __lsx_vsrai_w(y_l, 19);
+        u    = __lsx_vsrai_w(u, 19);
+        v    = __lsx_vsrai_w(v, 19);
+        u    = __lsx_vadd_w(u, headroom);
+        v    = __lsx_vadd_w(v, headroom);
+        WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+        WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+        i += 4;
+    }
+    for (; count < len_count; count++) {
+        int Y1 = (buf0[count * 2]     * yalpha1  +
+                  buf1[count * 2]     * yalpha)  >> 19;
+        int Y2 = (buf0[count * 2 + 1] * yalpha1  +
+                  buf1[count * 2 + 1] * yalpha) >> 19;
+        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
+        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
+
+        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+             c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+                      r, g, b, y, target, 0);
+    }
+}
+
+static void
+yuv2rgb_1_template_lsx(SwsContext *c, const int16_t *buf0,
+                       const int16_t *ubuf[2], const int16_t *vbuf[2],
+                       const int16_t *abuf0, uint8_t *dest, int dstW,
+                       int uvalpha, int y, enum AVPixelFormat target,
+                       int hasAlpha)
+{
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+    int i;
+    int len       = (dstW - 7);
+    int len_count = (dstW + 1) >> 1;
+    const void *r, *g, *b;
+
+    if (uvalpha < 2048) {
+        int count    = 0;
+        int head = YUVRGB_TABLE_HEADROOM;
+        __m128i headroom  = __lsx_vreplgr2vr_h(head);
+
+        for (i = 0; i < len; i += 8) {
+            int Y1, Y2, U, V;
+            int i_dex = i << 1;
+            int c_dex = count << 1;
+            __m128i src_y, src_u, src_v;
+            __m128i u, v, uv, y_l, y_h;
+
+            src_y = __lsx_vldx(buf0, i_dex);
+            DUP2_ARG2(__lsx_vldx, ubuf0, c_dex, vbuf0, c_dex, src_u, src_v);
+            src_y = __lsx_vsrari_h(src_y, 7);
+            src_u = __lsx_vsrari_h(src_u, 7);
+            src_v = __lsx_vsrari_h(src_v, 7);
+            y_l   = __lsx_vsllwil_w_h(src_y, 0);
+            y_h   = __lsx_vexth_w_h(src_y);
+            uv    = __lsx_vilvl_h(src_v, src_u);
+            u     = __lsx_vaddwev_w_h(uv, headroom);
+            v     = __lsx_vaddwod_w_h(uv, headroom);
+            WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+            WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+            WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 0, 1, 2, 2);
+            WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 2, 3, 3, 3);
+        }
+        if (dstW - i >= 4){
+            int Y1, Y2, U, V;
+            int i_dex = i << 1;
+            __m128i src_y, src_u, src_v;
+            __m128i y_l, u, v, uv;
+
+            src_y  = __lsx_vldx(buf0, i_dex);
+            src_u  = __lsx_vldrepl_d((ubuf0 + count), 0);
+            src_v  = __lsx_vldrepl_d((vbuf0 + count), 0);
+            y_l    = __lsx_vsrari_h(src_y, 7);
+            y_l    = __lsx_vsllwil_w_h(y_l, 0);
+            uv     = __lsx_vilvl_h(src_v, src_u);
+            uv     = __lsx_vsrari_h(uv, 7);
+            u      = __lsx_vaddwev_w_h(uv, headroom);
+            v      = __lsx_vaddwod_w_h(uv, headroom);
+            WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+            WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+            i += 4;
+        }
+        for (; count < len_count; count++) {
+            int Y1 = (buf0[count * 2    ] + 64) >> 7;
+            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
+            int U  = (ubuf0[count]        + 64) >> 7;
+            int V  = (vbuf0[count]        + 64) >> 7;
+
+            r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+            b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+                          r, g, b, y, target, 0);
+        }
+    } else {
+        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+        int count = 0;
+        int HEADROOM = YUVRGB_TABLE_HEADROOM;
+        __m128i headroom    = __lsx_vreplgr2vr_w(HEADROOM);
+
+        for (i = 0; i < len; i += 8) {
+            int Y1, Y2, U, V;
+            int i_dex = i << 1;
+            int c_dex = count << 1;
+            __m128i src_y, src_u0, src_v0, src_u1, src_v1;
+            __m128i y_l, y_h, u1, u2, v1, v2;
+
+            DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
+            src_v1 = __lsx_vldx(vbuf1, c_dex);
+            src_y  = __lsx_vsrari_h(src_y, 7);
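+            /* u1/v1: sums of the even/odd-indexed U samples of the two rows,
+             * u2/v2: the corresponding even/odd-indexed V sums. */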
+            u1      = __lsx_vaddwev_w_h(src_u0, src_u1);
+            v1      = __lsx_vaddwod_w_h(src_u0, src_u1);
+            u2      = __lsx_vaddwev_w_h(src_v0, src_v1);
+            v2      = __lsx_vaddwod_w_h(src_v0, src_v1);
+            y_l     = __lsx_vsllwil_w_h(src_y, 0);
+            y_h     = __lsx_vexth_w_h(src_y);
+            u1      = __lsx_vsrari_w(u1, 8);
+            v1      = __lsx_vsrari_w(v1, 8);
+            u2      = __lsx_vsrari_w(u2, 8);
+            v2      = __lsx_vsrari_w(v2, 8);
+            u1      = __lsx_vadd_w(u1, headroom);
+            v1      = __lsx_vadd_w(v1, headroom);
+            u2      = __lsx_vadd_w(u2, headroom);
+            v2      = __lsx_vadd_w(v2, headroom);
+            WRITE_YUV2RGB_LSX(y_l, y_l, u1, u2, 0, 1, 0, 0);
+            WRITE_YUV2RGB_LSX(y_l, y_l, v1, v2, 2, 3, 0, 0);
+            WRITE_YUV2RGB_LSX(y_h, y_h, u1, u2, 0, 1, 1, 1);
+            WRITE_YUV2RGB_LSX(y_h, y_h, v1, v2, 2, 3, 1, 1);
+        }
+        if (dstW - i >= 4) {
+            int Y1, Y2, U, V;
+            int i_dex = i << 1;
+            __m128i src_y, src_u0, src_v0, src_u1, src_v1;
+            __m128i uv;
+
+            src_y  = __lsx_vldx(buf0, i_dex);
+            src_u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
+            src_v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
+            src_u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
+            src_v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
+
+            src_u0 = __lsx_vilvl_h(src_u1, src_u0);
+            src_v0 = __lsx_vilvl_h(src_v1, src_v0);
+            src_y  = __lsx_vsrari_h(src_y, 7);
+            src_y  = __lsx_vsllwil_w_h(src_y, 0);
+            uv     = __lsx_vilvl_w(src_v0, src_u0);
+            uv     = __lsx_vhaddw_w_h(uv, uv);
+            uv     = __lsx_vsrari_w(uv, 8);
+            uv     = __lsx_vadd_w(uv, headroom);
+            WRITE_YUV2RGB_LSX(src_y, src_y, uv, uv, 0, 1, 0, 1);
+            WRITE_YUV2RGB_LSX(src_y, src_y, uv, uv, 2, 3, 2, 3);
+            i += 4;
+        }
+        for (; count < len_count; count++) {
+            int Y1 = (buf0[count * 2    ]         +  64) >> 7;
+            int Y2 = (buf0[count * 2 + 1]         +  64) >> 7;
+            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
+            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;
+
+            r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+            b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+                          r, g, b, y, target, 0);
+        }
+    }
+}
+
+#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                               \
+static void name ## ext ## _X_lsx(SwsContext *c, const int16_t *lumFilter,            \
+                                  const int16_t **lumSrc, int lumFilterSize,          \
+                                  const int16_t *chrFilter, const int16_t **chrUSrc,  \
+                                  const int16_t **chrVSrc, int chrFilterSize,         \
+                                  const int16_t **alpSrc, uint8_t *dest, int dstW,    \
+                                  int y)                                              \
+{                                                                                     \
+    name ## base ## _X_template_lsx(c, lumFilter, lumSrc, lumFilterSize,              \
+                                    chrFilter, chrUSrc, chrVSrc, chrFilterSize,       \
+                                    alpSrc, dest, dstW, y, fmt, hasAlpha);            \
+}
+
+#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                              \
+YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                       \
+static void name ## ext ## _2_lsx(SwsContext *c, const int16_t *buf[2],               \
+                                  const int16_t *ubuf[2], const int16_t *vbuf[2],     \
+                                  const int16_t *abuf[2], uint8_t *dest, int dstW,    \
+                                  int yalpha, int uvalpha, int y)                     \
+{                                                                                     \
+    name ## base ## _2_template_lsx(c, buf, ubuf, vbuf, abuf, dest,                   \
+                                    dstW, yalpha, uvalpha, y, fmt, hasAlpha);         \
+}
+
+#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                                \
+YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                                      \
+static void name ## ext ## _1_lsx(SwsContext *c, const int16_t *buf0,                 \
+                                  const int16_t *ubuf[2], const int16_t *vbuf[2],     \
+                                  const int16_t *abuf0, uint8_t *dest, int dstW,      \
+                                  int uvalpha, int y)                                 \
+{                                                                                     \
+    name ## base ## _1_template_lsx(c, buf0, ubuf, vbuf, abuf0, dest,                 \
+                                    dstW, uvalpha, y, fmt, hasAlpha);                 \
+}
+
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+#endif
+YUV2RGBWRAPPER(yuv2rgb,, x32_1,  AV_PIX_FMT_RGB32_1, 0)
+YUV2RGBWRAPPER(yuv2rgb,, x32,    AV_PIX_FMT_RGB32,   0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24,     0)
+YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24,     0)
+YUV2RGBWRAPPER(yuv2rgb,,  16,    AV_PIX_FMT_RGB565,    0)
+YUV2RGBWRAPPER(yuv2rgb,,  15,    AV_PIX_FMT_RGB555,    0)
+YUV2RGBWRAPPER(yuv2rgb,,  12,    AV_PIX_FMT_RGB444,    0)
+YUV2RGBWRAPPER(yuv2rgb,,  8,     AV_PIX_FMT_RGB8,      0)
+YUV2RGBWRAPPER(yuv2rgb,,  4,     AV_PIX_FMT_RGB4,      0)
+YUV2RGBWRAPPER(yuv2rgb,,  4b,    AV_PIX_FMT_RGB4_BYTE, 0)
+
+// This function is copied from libswscale/output.c
+static av_always_inline void yuv2rgb_write_full(SwsContext *c,
+    uint8_t *dest, int i, int R, int A, int G, int B,
+    int y, enum AVPixelFormat target, int hasAlpha, int err[4])
+{
+    int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
+
+    if ((R | G | B) & 0xC0000000) {
+        R = av_clip_uintp2(R, 30);
+        G = av_clip_uintp2(G, 30);
+        B = av_clip_uintp2(B, 30);
+    }
+
+    switch(target) {
+    case AV_PIX_FMT_ARGB:
+        dest[0] = hasAlpha ? A : 255;
+        dest[1] = R >> 22;
+        dest[2] = G >> 22;
+        dest[3] = B >> 22;
+        break;
+    case AV_PIX_FMT_RGB24:
+        dest[0] = R >> 22;
+        dest[1] = G >> 22;
+        dest[2] = B >> 22;
+        break;
+    case AV_PIX_FMT_RGBA:
+        dest[0] = R >> 22;
+        dest[1] = G >> 22;
+        dest[2] = B >> 22;
+        dest[3] = hasAlpha ? A : 255;
+        break;
+    case AV_PIX_FMT_ABGR:
+        dest[0] = hasAlpha ? A : 255;
+        dest[1] = B >> 22;
+        dest[2] = G >> 22;
+        dest[3] = R >> 22;
+        break;
+    case AV_PIX_FMT_BGR24:
+        dest[0] = B >> 22;
+        dest[1] = G >> 22;
+        dest[2] = R >> 22;
+        break;
+    case AV_PIX_FMT_BGRA:
+        dest[0] = B >> 22;
+        dest[1] = G >> 22;
+        dest[2] = R >> 22;
+        dest[3] = hasAlpha ? A : 255;
+        break;
+    case AV_PIX_FMT_BGR4_BYTE:
+    case AV_PIX_FMT_RGB4_BYTE:
+    case AV_PIX_FMT_BGR8:
+    case AV_PIX_FMT_RGB8:
+    {
+        int r,g,b;
+
+        switch (c->dither) {
+        default:
+        case SWS_DITHER_AUTO:
+        case SWS_DITHER_ED:
+            R >>= 22;
+            G >>= 22;
+            B >>= 22;
+            R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
+            G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
+            B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
+            c->dither_error[0][i] = err[0];
+            c->dither_error[1][i] = err[1];
+            c->dither_error[2][i] = err[2];
+            r = R >> (isrgb8 ? 5 : 7);
+            g = G >> (isrgb8 ? 5 : 6);
+            b = B >> (isrgb8 ? 6 : 7);
+            r = av_clip(r, 0, isrgb8 ? 7 : 1);
+            g = av_clip(g, 0, isrgb8 ? 7 : 3);
+            b = av_clip(b, 0, isrgb8 ? 3 : 1);
+            err[0] = R - r*(isrgb8 ? 36 : 255);
+            err[1] = G - g*(isrgb8 ? 36 : 85);
+            err[2] = B - b*(isrgb8 ? 85 : 255);
+            break;
+        case SWS_DITHER_A_DITHER:
+            if (isrgb8) {
+  /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))
+                r = (((R >> 19) + A_DITHER(i,y)  -96)>>8);
+                g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
+                b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
+                r = av_clip_uintp2(r, 3);
+                g = av_clip_uintp2(g, 3);
+                b = av_clip_uintp2(b, 2);
+            } else {
+                r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
+                g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
+                b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
+                r = av_clip_uintp2(r, 1);
+                g = av_clip_uintp2(g, 2);
+                b = av_clip_uintp2(b, 1);
+            }
+            break;
+        case SWS_DITHER_X_DITHER:
+            if (isrgb8) {
+  /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
+                r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
+                g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
+                b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
+                r = av_clip_uintp2(r, 3);
+                g = av_clip_uintp2(g, 3);
+                b = av_clip_uintp2(b, 2);
+            } else {
+                r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
+                g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
+                b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
+                r = av_clip_uintp2(r, 1);
+                g = av_clip_uintp2(g, 2);
+                b = av_clip_uintp2(b, 1);
+            }
+
+            break;
+        }
+
+        if(target == AV_PIX_FMT_BGR4_BYTE) {
+            dest[0] = r + 2*g + 8*b;
+        } else if(target == AV_PIX_FMT_RGB4_BYTE) {
+            dest[0] = b + 2*g + 8*r;
+        } else if(target == AV_PIX_FMT_BGR8) {
+            dest[0] = r + 8*g + 64*b;
+        } else if(target == AV_PIX_FMT_RGB8) {
+            dest[0] = b + 4*g + 32*r;
+        } else
+            av_assert2(0);
+        break; }
+    }
+}
+
+#define YUVTORGB_SETUP_LSX                                   \
+    int y_offset   = c->yuv2rgb_y_offset;                    \
+    int y_coeff    = c->yuv2rgb_y_coeff;                     \
+    int v2r_coe    = c->yuv2rgb_v2r_coeff;                   \
+    int v2g_coe    = c->yuv2rgb_v2g_coeff;                   \
+    int u2g_coe    = c->yuv2rgb_u2g_coeff;                   \
+    int u2b_coe    = c->yuv2rgb_u2b_coeff;                   \
+    __m128i offset = __lsx_vreplgr2vr_w(y_offset);           \
+    __m128i coeff  = __lsx_vreplgr2vr_w(y_coeff);            \
+    __m128i v2r    = __lsx_vreplgr2vr_w(v2r_coe);            \
+    __m128i v2g    = __lsx_vreplgr2vr_w(v2g_coe);            \
+    __m128i u2g    = __lsx_vreplgr2vr_w(u2g_coe);            \
+    __m128i u2b    = __lsx_vreplgr2vr_w(u2b_coe);            \
+
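+/* Same math as the scalar full-range path: y is offset and scaled by the luma
+ * coefficient, then R = y + v*v2r, G = y + v*v2g + u*u2g, B = y + u*u2b;
+ * the final >> 22 and clipping happen in yuv2rgb_write_full(). */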
+#define YUVTORGB_LSX(y, u, v, R, G, B, offset, coeff,        \
+                     y_temp, v2r, v2g, u2g, u2b)             \
+{                                                            \
+     y = __lsx_vsub_w(y, offset);                            \
+     y = __lsx_vmul_w(y, coeff);                             \
+     y = __lsx_vadd_w(y, y_temp);                            \
+     R = __lsx_vmadd_w(y, v, v2r);                           \
+     v = __lsx_vmadd_w(y, v, v2g);                           \
+     G = __lsx_vmadd_w(v, u, u2g);                           \
+     B = __lsx_vmadd_w(y, u, u2b);                           \
+}
+
+#define WRITE_FULL_A_LSX(r, g, b, a, t1, s)                                  \
+{                                                                            \
+    R = __lsx_vpickve2gr_w(r, t1);                                           \
+    G = __lsx_vpickve2gr_w(g, t1);                                           \
+    B = __lsx_vpickve2gr_w(b, t1);                                           \
+    A = __lsx_vpickve2gr_w(a, t1);                                           \
+    if (A & 0x100)                                                           \
+        A = av_clip_uint8(A);                                                \
+    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\
+    dest += step;                                                            \
+}
+
+#define WRITE_FULL_LSX(r, g, b, t1, s)                                        \
+{                                                                             \
+    R = __lsx_vpickve2gr_w(r, t1);                                            \
+    G = __lsx_vpickve2gr_w(g, t1);                                            \
+    B = __lsx_vpickve2gr_w(b, t1);                                            \
+    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
+    dest += step;                                                             \
+}
+
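+/* Full-chroma-resolution vertical filtering: luma and chroma are filtered
+ * per output pixel (no 2:1 pairing), the accumulators are shifted right by
+ * 10 and passed through YUVTORGB_LSX before yuv2rgb_write_full() emits each
+ * pixel. */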
+static void
+yuv2rgb_full_X_template_lsx(SwsContext *c, const int16_t *lumFilter,
+                            const int16_t **lumSrc, int lumFilterSize,
+                            const int16_t *chrFilter, const int16_t **chrUSrc,
+                            const int16_t **chrVSrc, int chrFilterSize,
+                            const int16_t **alpSrc, uint8_t *dest,
+                            int dstW, int y, enum AVPixelFormat target,
+                            int hasAlpha)
+{
+    int i, j, B, G, R, A;
+    int step       = (target == AV_PIX_FMT_RGB24 ||
+                      target == AV_PIX_FMT_BGR24) ? 3 : 4;
+    int err[4]     = {0};
+    int a_temp     = 1 << 18;
+    int templ      = 1 << 9;
+    int tempc      = templ - (128 << 19);
+    int ytemp      = 1 << 21;
+    int len        = dstW - 7;
+    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
+    YUVTORGB_SETUP_LSX
+
+    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
+        step = 1;
+
+    for (i = 0; i < len; i += 8) {
+        __m128i l_src, u_src, v_src;
+        __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
+        __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+        int n = i << 1;
+
+        y_ev = y_od = __lsx_vreplgr2vr_w(templ);
+        u_ev = u_od = v_ev = v_od = __lsx_vreplgr2vr_w(tempc);
+        for (j = 0; j < lumFilterSize; j++) {
+            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
+            l_src = __lsx_vldx(lumSrc[j], n);
+            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
+            y_od  = __lsx_vmaddwod_w_h(y_od, l_src, temp);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
+            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n,
+                      u_src, v_src);
+            DUP2_ARG3(__lsx_vmaddwev_w_h, u_ev, u_src, temp, v_ev,
+                      v_src, temp, u_ev, v_ev);
+            DUP2_ARG3(__lsx_vmaddwod_w_h, u_od, u_src, temp, v_od,
+                      v_src, temp, u_od, v_od);
+        }
+        y_ev = __lsx_vsrai_w(y_ev, 10);
+        y_od = __lsx_vsrai_w(y_od, 10);
+        u_ev = __lsx_vsrai_w(u_ev, 10);
+        u_od = __lsx_vsrai_w(u_od, 10);
+        v_ev = __lsx_vsrai_w(v_ev, 10);
+        v_od = __lsx_vsrai_w(v_od, 10);
+        YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+        YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+
+        if (hasAlpha) {
+            __m128i a_src, a_ev, a_od;
+
+            a_ev = a_od = __lsx_vreplgr2vr_w(a_temp);
+            for (j = 0; j < lumFilterSize; j++) {
+                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
+                a_src = __lsx_vldx(alpSrc[j], n);
+                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
+                a_od  = __lsx_vmaddwod_w_h(a_od, a_src, temp);
+            }
+            a_ev = __lsx_vsrai_w(a_ev, 19);
+            a_od = __lsx_vsrai_w(a_od, 19);
+            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+            WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 0, 1);
+            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 2);
+            WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 1, 3);
+            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 4);
+            WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 2, 5);
+            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 6);
+            WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 3, 7);
+        } else {
+            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+            WRITE_FULL_LSX(R_od, G_od, B_od, 0, 1);
+            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 2);
+            WRITE_FULL_LSX(R_od, G_od, B_od, 1, 3);
+            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 4);
+            WRITE_FULL_LSX(R_od, G_od, B_od, 2, 5);
+            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 6);
+            WRITE_FULL_LSX(R_od, G_od, B_od, 3, 7);
+        }
+    }
+    if (dstW - i >= 4) {
+        __m128i l_src, u_src, v_src;
+        __m128i y_ev, u_ev, v_ev, uv, temp;
+        __m128i R_ev, G_ev, B_ev;
+        int n = i << 1;
+
+        y_ev = __lsx_vreplgr2vr_w(templ);
+        u_ev = v_ev = __lsx_vreplgr2vr_w(tempc);
+        for (j = 0; j < lumFilterSize; j++) {
+            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
+            l_src = __lsx_vldx(lumSrc[j], n);
+            l_src = __lsx_vilvl_h(l_src, l_src);
+            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
+            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
+            uv    = __lsx_vilvl_h(v_src, u_src);
+            u_ev  = __lsx_vmaddwev_w_h(u_ev, uv, temp);
+            v_ev  = __lsx_vmaddwod_w_h(v_ev, uv, temp);
+        }
+        y_ev = __lsx_vsrai_w(y_ev, 10);
+        u_ev = __lsx_vsrai_w(u_ev, 10);
+        v_ev = __lsx_vsrai_w(v_ev, 10);
+        YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+
+        if (hasAlpha) {
+            __m128i a_src, a_ev;
+
+            a_ev = __lsx_vreplgr2vr_w(a_temp);
+            for (j = 0; j < lumFilterSize; j++) {
+                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
+                a_src = __lsx_vldx(alpSrc[j], n);
+                a_src = __lsx_vilvl_h(a_src, a_src);
+                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
+            }
+            a_ev = __lsx_vsrai_w(a_ev, 19);
+            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 1);
+            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 2);
+            WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 3);
+        } else {
+            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 1);
+            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 2);
+            WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 3);
+        }
+        i += 4;
+    }
+    for (; i < dstW; i++) {
+        int Y = templ;
+        int U = tempc;
+        int V = tempc;
+
+        A = 0;
+        for (j = 0; j < lumFilterSize; j++) {
+            Y += lumSrc[j][i] * lumFilter[j];
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            U += chrUSrc[j][i] * chrFilter[j];
+            V += chrVSrc[j][i] * chrFilter[j];
+        }
+        Y >>= 10;
+        U >>= 10;
+        V >>= 10;
+        if (hasAlpha) {
+            A = 1 << 18;
+            for (j = 0; j < lumFilterSize; j++) {
+                A += alpSrc[j][i] * lumFilter[j];
+            }
+            A >>= 19;
+            if (A & 0x100)
+                A = av_clip_uint8(A);
+        }
+        Y -= y_offset;
+        Y *= y_coeff;
+        Y += ytemp;
+        R  = (unsigned)Y + V * v2r_coe;
+        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+        B  = (unsigned)Y + U * u2b_coe;
+        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+        dest += step;
+    }
+    c->dither_error[0][i] = err[0];
+    c->dither_error[1][i] = err[1];
+    c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_2_template_lsx(SwsContext *c, const int16_t *buf[2],
+                            const int16_t *ubuf[2], const int16_t *vbuf[2],
+                            const int16_t *abuf[2], uint8_t *dest, int dstW,
+                            int yalpha, int uvalpha, int y,
+                            enum AVPixelFormat target, int hasAlpha)
+{
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+                  *abuf0 = hasAlpha ? abuf[0] : NULL,
+                  *abuf1 = hasAlpha ? abuf[1] : NULL;
+    int yalpha1  = 4096 - yalpha;
+    int uvalpha1 = 4096 - uvalpha;
+    int uvtemp   = 128 << 19;
+    int atemp    = 1 << 18;
+    int err[4]   = {0};
+    int ytemp    = 1 << 21;
+    int len      = dstW - 7;
+    int i, R, G, B, A;
+    int step = (target == AV_PIX_FMT_RGB24 ||
+                target == AV_PIX_FMT_BGR24) ? 3 : 4;
+    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
+    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
+    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
+    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
+    __m128i uv         = __lsx_vreplgr2vr_w(uvtemp);
+    __m128i a_bias     = __lsx_vreplgr2vr_w(atemp);
+    __m128i y_temp     = __lsx_vreplgr2vr_w(ytemp);
+    YUVTORGB_SETUP_LSX
+
+    av_assert2(yalpha  <= 4096U);
+    av_assert2(uvalpha <= 4096U);
+
+    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
+        step = 1;
+
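+    /* Bilinear blend of the two input lines: line0 * (4096 - alpha) +
+     * line1 * alpha, evaluated in 32 bits and scaled back with >> 10. */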
+    for (i = 0; i < len; i += 8) {
+        __m128i b0, b1, ub0, ub1, vb0, vb1;
+        __m128i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
+        __m128i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
+        __m128i y_l, y_h, v_l, v_h, u_l, u_h;
+        __m128i R_l, R_h, G_l, G_h, B_l, B_h;
+        int n = i << 1;
+
+        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0,
+                  n, ubuf1, n, b0, b1, ub0, ub1);
+        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);
+        DUP2_ARG2(__lsx_vsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
+        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
+                  u0_l, u1_l, v0_l, v1_l);
+        DUP2_ARG1(__lsx_vexth_w_h, b0, b1, y0_h, y1_h);
+        DUP4_ARG1(__lsx_vexth_w_h, ub0, ub1, vb0, vb1,
+                  u0_h, u1_h, v0_h, v1_h);
+        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
+        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
+        u0_h = __lsx_vmul_w(u0_h, v_uvalpha1);
+        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
+        v0_h = __lsx_vmul_w(v0_h, v_uvalpha1);
+        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
+        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
+        u_h  = __lsx_vmadd_w(u0_h, v_uvalpha, u1_h);
+        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
+        v_h  = __lsx_vmadd_w(v0_h, v_uvalpha, v1_h);
+        u_l  = __lsx_vsub_w(u_l, uv);
+        u_h  = __lsx_vsub_w(u_h, uv);
+        v_l  = __lsx_vsub_w(v_l, uv);
+        v_h  = __lsx_vsub_w(v_h, uv);
+        y_l  = __lsx_vsrai_w(y_l, 10);
+        y_h  = __lsx_vsrai_w(y_h, 10);
+        u_l  = __lsx_vsrai_w(u_l, 10);
+        u_h  = __lsx_vsrai_w(u_h, 10);
+        v_l  = __lsx_vsrai_w(v_l, 10);
+        v_h  = __lsx_vsrai_w(v_h, 10);
+        YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+        YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+
+        if (hasAlpha) {
+            __m128i a0, a1, a0_l, a0_h;
+            __m128i a_l, a_h, a1_l, a1_h;
+
+            DUP2_ARG2(__lsx_vldx, abuf0, n, abuf1, n, a0, a1);
+            DUP2_ARG2(__lsx_vsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
+            DUP2_ARG1(__lsx_vexth_w_h, a0, a1, a0_h, a1_h);
+            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
+            a_h = __lsx_vmadd_w(a_bias, a0_h, v_yalpha1);
+            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
+            a_h = __lsx_vmadd_w(a_h, v_yalpha, a1_h);
+            a_l = __lsx_vsrai_w(a_l, 19);
+            a_h = __lsx_vsrai_w(a_h, 19);
+            WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+            WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+            WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+            WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+            WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 0, 4);
+            WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 1, 5);
+            WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 2, 6);
+            WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 3, 7);
+        } else {
+            WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+            WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+            WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+            WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+            WRITE_FULL_LSX(R_h, G_h, B_h, 0, 4);
+            WRITE_FULL_LSX(R_h, G_h, B_h, 1, 5);
+            WRITE_FULL_LSX(R_h, G_h, B_h, 2, 6);
+            WRITE_FULL_LSX(R_h, G_h, B_h, 3, 7);
+        }
+    }
+    if (dstW - i >= 4) {
+        __m128i b0, b1, ub0, ub1, vb0, vb1;
+        __m128i y0_l, y1_l, u0_l;
+        __m128i v0_l, u1_l, v1_l;
+        __m128i y_l, u_l, v_l;
+        __m128i R_l, G_l, B_l;
+        int n = i << 1;
+
+        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0, n,
+                  ubuf1, n, b0, b1, ub0, ub1);
+        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);
+        DUP2_ARG2(__lsx_vsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
+        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
+                  u0_l, u1_l, v0_l, v1_l);
+        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
+        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
+        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
+        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
+        u_l  = __lsx_vsub_w(u_l, uv);
+        v_l  = __lsx_vsub_w(v_l, uv);
+        y_l  = __lsx_vsrai_w(y_l, 10);
+        u_l  = __lsx_vsrai_w(u_l, 10);
+        v_l  = __lsx_vsrai_w(v_l, 10);
+        YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+
+        if (hasAlpha) {
+            __m128i a0, a1, a0_l;
+            __m128i a_l, a1_l;
+
+            DUP2_ARG2(__lsx_vldx, abuf0, n, abuf1, n, a0, a1);
+            DUP2_ARG2(__lsx_vsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
+            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
+            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
+            a_l = __lsx_vsrai_w(a_l, 19);
+            WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+            WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+            WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+            WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+        } else {
+            WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+            WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+            WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+            WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+        }
+        i += 4;
+    }
+    for (; i < dstW; i++) {
+        int Y = ( buf0[i] * yalpha1  +  buf1[i] * yalpha          ) >> 10;
+        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
+        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;
+
+        A = 0;
+        if (hasAlpha) {
+            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
+            if (A & 0x100)
+                A = av_clip_uint8(A);
+        }
+
+        Y -= y_offset;
+        Y *= y_coeff;
+        Y += ytemp;
+        R  = (unsigned)Y + V * v2r_coe;
+        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+        B  = (unsigned)Y + U * u2b_coe;
+        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+        dest += step;
+    }
+    c->dither_error[0][i] = err[0];
+    c->dither_error[1][i] = err[1];
+    c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_1_template_lsx(SwsContext *c, const int16_t *buf0,
+                            const int16_t *ubuf[2], const int16_t *vbuf[2],
+                            const int16_t *abuf0, uint8_t *dest, int dstW,
+                            int uvalpha, int y, enum AVPixelFormat target,
+                            int hasAlpha)
+{
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+    int i, B, G, R, A;
+    int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
+    int err[4]     = {0};
+    int ytemp      = 1 << 21;
+    int bias_int   = 64;
+    int len        = dstW - 7;
+    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
+    YUVTORGB_SETUP_LSX
+
+    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
+        step = 1;
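+
+    /* As in the C template: with uvalpha < 2048 only the first chroma line
+     * is used, otherwise the two chroma lines are averaged. */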
+    if (uvalpha < 2048) {
+        int uvtemp   = 128 << 7;
+        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
+        __m128i bias = __lsx_vreplgr2vr_w(bias_int);
+
+        for (i = 0; i < len; i += 8) {
+            __m128i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
+            __m128i y_l, y_h, u_l, u_h, v_l, v_h;
+            __m128i R_l, R_h, G_l, G_h, B_l, B_h;
+            int n = i << 1;
+
+            DUP2_ARG2(__lsx_vldx, buf0, n, ubuf0, n, b, ub);
+            vb  = __lsx_vldx(vbuf0, n);
+            y_l = __lsx_vsllwil_w_h(b, 2);
+            y_h = __lsx_vexth_w_h(b);
+            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
+            DUP2_ARG1(__lsx_vexth_w_h, ub, vb, ub_h, vb_h);
+            y_h = __lsx_vslli_w(y_h, 2);
+            u_l = __lsx_vsub_w(ub_l, uv);
+            u_h = __lsx_vsub_w(ub_h, uv);
+            v_l = __lsx_vsub_w(vb_l, uv);
+            v_h = __lsx_vsub_w(vb_h, uv);
+            u_l = __lsx_vslli_w(u_l, 2);
+            u_h = __lsx_vslli_w(u_h, 2);
+            v_l = __lsx_vslli_w(v_l, 2);
+            v_h = __lsx_vslli_w(v_h, 2);
+            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                         y_temp, v2r, v2g, u2g, u2b);
+            YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+                         y_temp, v2r, v2g, u2g, u2b);
+
+            if (hasAlpha) {
+                __m128i a_src;
+                __m128i a_l, a_h;
+
+                a_src = __lsx_vld(abuf0 + i, 0);
+                a_l   = __lsx_vsllwil_w_h(a_src, 0);
+                a_h   = __lsx_vexth_w_h(a_src);
+                a_l   = __lsx_vadd_w(a_l, bias);
+                a_h   = __lsx_vadd_w(a_h, bias);
+                a_l   = __lsx_vsrai_w(a_l, 7);
+                a_h   = __lsx_vsrai_w(a_h, 7);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+                WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 0, 4);
+                WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 1, 5);
+                WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 2, 6);
+                WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 3, 7);
+            } else {
+                WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+                WRITE_FULL_LSX(R_h, G_h, B_h, 0, 4);
+                WRITE_FULL_LSX(R_h, G_h, B_h, 1, 5);
+                WRITE_FULL_LSX(R_h, G_h, B_h, 2, 6);
+                WRITE_FULL_LSX(R_h, G_h, B_h, 3, 7);
+            }
+        }
+        if (dstW - i >= 4) {
+            __m128i b, ub, vb, ub_l, vb_l;
+            __m128i y_l, u_l, v_l;
+            __m128i R_l, G_l, B_l;
+            int n = i << 1;
+
+            DUP2_ARG2(__lsx_vldx, buf0, n, ubuf0, n, b, ub);
+            vb  = __lsx_vldx(vbuf0, n);
+            y_l = __lsx_vsllwil_w_h(b, 0);
+            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
+            y_l = __lsx_vslli_w(y_l, 2);
+            u_l = __lsx_vsub_w(ub_l, uv);
+            v_l = __lsx_vsub_w(vb_l, uv);
+            u_l = __lsx_vslli_w(u_l, 2);
+            v_l = __lsx_vslli_w(v_l, 2);
+            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                         y_temp, v2r, v2g, u2g, u2b);
+
+            if (hasAlpha) {
+                __m128i a_src, a_l;
+
+                a_src = __lsx_vldx(abuf0, n);
+                a_src = __lsx_vsllwil_w_h(a_src, 0);
+                a_l   = __lsx_vadd_w(bias, a_src);
+                a_l   = __lsx_vsrai_w(a_l, 7);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+            } else {
+                WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+            }
+            i += 4;
+        }
+        for (; i < dstW; i++) {
+            int Y = buf0[i] << 2;
+            int U = (ubuf0[i] - uvtemp) << 2;
+            int V = (vbuf0[i] - uvtemp) << 2;
+
+            A = 0;
+            if (hasAlpha) {
+                A = (abuf0[i] + 64) >> 7;
+                if (A & 0x100)
+                    A = av_clip_uint8(A);
+            }
+            Y -= y_offset;
+            Y *= y_coeff;
+            Y += ytemp;
+            R  = (unsigned)Y + V * v2r_coe;
+            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+            B  = (unsigned)Y + U * u2b_coe;
+            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+            dest += step;
+        }
+    } else {
+        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+        int uvtemp   = 128 << 8;
+        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
+        __m128i zero = __lsx_vldi(0);
+        __m128i bias = __lsx_vreplgr2vr_h(bias_int);
+
+        for (i = 0; i < len; i += 8) {
+            __m128i b, ub0, ub1, vb0, vb1;
+            __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od;
+            __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+            int n = i << 1;
+
+            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
+                      ubuf1, n, b, ub0, vb0, ub1);
+            vb1 = __lsx_vldx(vbuf1, n);
+            y_ev = __lsx_vaddwev_w_h(b, zero);
+            y_od = __lsx_vaddwod_w_h(b, zero);
+            DUP2_ARG2(__lsx_vaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
+            DUP2_ARG2(__lsx_vaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
+            DUP2_ARG2(__lsx_vslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
+            DUP4_ARG2(__lsx_vsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
+                      u_ev, u_od, v_ev, v_od);
+            DUP4_ARG2(__lsx_vslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
+                      u_ev, u_od, v_ev, v_od);
+            YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+                         y_temp, v2r, v2g, u2g, u2b);
+            YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+                         y_temp, v2r, v2g, u2g, u2b);
+
+            if (hasAlpha) {
+                __m128i a_src;
+                __m128i a_ev, a_od;
+
+                a_src = __lsx_vld(abuf0 + i, 0);
+                a_ev  = __lsx_vaddwev_w_h(bias, a_src);
+                a_od  = __lsx_vaddwod_w_h(bias, a_src);
+                a_ev  = __lsx_vsrai_w(a_ev, 7);
+                a_od  = __lsx_vsrai_w(a_od, 7);
+                WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+                WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 0, 1);
+                WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 2);
+                WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 1, 3);
+                WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 4);
+                WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 2, 5);
+                WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 6);
+                WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 3, 7);
+            } else {
+                WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+                WRITE_FULL_LSX(R_od, G_od, B_od, 0, 1);
+                WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 2);
+                WRITE_FULL_LSX(R_od, G_od, B_od, 1, 3);
+                WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 4);
+                WRITE_FULL_LSX(R_od, G_od, B_od, 2, 5);
+                WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 6);
+                WRITE_FULL_LSX(R_od, G_od, B_od, 3, 7);
+            }
+        }
+        if (dstW - i >= 4) {
+            __m128i b, ub0, ub1, vb0, vb1;
+            __m128i y_l, u_l, v_l;
+            __m128i R_l, G_l, B_l;
+            int n = i << 1;
+
+            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
+                      ubuf1, n, b, ub0, vb0, ub1);
+            vb1 = __lsx_vldx(vbuf1, n);
+            y_l = __lsx_vsllwil_w_h(b, 0);
+            y_l = __lsx_vslli_w(y_l, 2);
+            DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, vb0, 0, ub1, 0, vb1, 0,
+                      ub0, vb0, ub1, vb1);
+            DUP2_ARG2(__lsx_vadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
+            u_l = __lsx_vsub_w(u_l, uv);
+            v_l = __lsx_vsub_w(v_l, uv);
+            u_l = __lsx_vslli_w(u_l, 1);
+            v_l = __lsx_vslli_w(v_l, 1);
+            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                         y_temp, v2r, v2g, u2g, u2b);
+
+            if (hasAlpha) {
+                __m128i a_src;
+                __m128i a_l;
+
+                a_src  = __lsx_vld(abuf0 + i, 0);
+                a_src  = __lsx_vilvl_h(a_src, a_src);
+                a_l    = __lsx_vaddwev_w_h(bias, a_src);
+                a_l    = __lsx_vsrai_w(a_l, 7);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+                WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+            } else {
+                WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+                WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+            }
+            i += 4;
+        }
+        for (; i < dstW; i++) {
+            int Y = buf0[i] << 2;
+            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
+            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
+
+            A = 0;
+            if (hasAlpha) {
+                A = (abuf0[i] + 64) >> 7;
+                if (A & 0x100)
+                    A = av_clip_uint8(A);
+            }
+            Y -= y_offset;
+            Y *= y_coeff;
+            Y += ytemp;
+            R  = (unsigned)Y + V * v2r_coe;
+            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+            B  = (unsigned)Y + U * u2b_coe;
+            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+            dest += step;
+        }
+    }
+    c->dither_error[0][i] = err[0];
+    c->dither_error[1][i] = err[1];
+    c->dither_error[2][i] = err[2];
+}
+
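+/*
+ * Instantiate the per-format writer functions.  The alpha-carrying variants
+ * are only built with CONFIG_SWSCALE_ALPHA (or selected at run time via
+ * c->needAlpha when CONFIG_SMALL is set).
+ */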
+#if CONFIG_SMALL
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
+               CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
+               CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
+               CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
+               CONFIG_SWSCALE_ALPHA && c->needAlpha)
+#else
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,  1)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,  1)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,  1)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,  1)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA,  0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR,  0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA,  0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB,  0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full,  AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full,  AV_PIX_FMT_RGB24, 0)
+
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full,  AV_PIX_FMT_BGR4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full,  AV_PIX_FMT_RGB4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)
+
+
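+/*
+ * Hook the LSX writers into the SwsContext.  Only the combinations that
+ * have an LSX implementation are overridden; every other format keeps its
+ * previously selected function pointers.
+ */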
+av_cold void ff_sws_init_output_lsx(SwsContext *c)
+{
+    if (c->flags & SWS_FULL_CHR_H_INT) {
+        switch (c->dstFormat) {
+        case AV_PIX_FMT_RGBA:
+#if CONFIG_SMALL
+            c->yuv2packedX = yuv2rgba32_full_X_lsx;
+            c->yuv2packed2 = yuv2rgba32_full_2_lsx;
+            c->yuv2packed1 = yuv2rgba32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+                c->yuv2packedX = yuv2rgba32_full_X_lsx;
+                c->yuv2packed2 = yuv2rgba32_full_2_lsx;
+                c->yuv2packed1 = yuv2rgba32_full_1_lsx;
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packedX = yuv2rgbx32_full_X_lsx;
+                c->yuv2packed2 = yuv2rgbx32_full_2_lsx;
+                c->yuv2packed1 = yuv2rgbx32_full_1_lsx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_ARGB:
+#if CONFIG_SMALL
+            c->yuv2packedX = yuv2argb32_full_X_lsx;
+            c->yuv2packed2 = yuv2argb32_full_2_lsx;
+            c->yuv2packed1 = yuv2argb32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+                c->yuv2packedX = yuv2argb32_full_X_lsx;
+                c->yuv2packed2 = yuv2argb32_full_2_lsx;
+                c->yuv2packed1 = yuv2argb32_full_1_lsx;
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packedX = yuv2xrgb32_full_X_lsx;
+                c->yuv2packed2 = yuv2xrgb32_full_2_lsx;
+                c->yuv2packed1 = yuv2xrgb32_full_1_lsx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_BGRA:
+#if CONFIG_SMALL
+            c->yuv2packedX = yuv2bgra32_full_X_lsx;
+            c->yuv2packed2 = yuv2bgra32_full_2_lsx;
+            c->yuv2packed1 = yuv2bgra32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+                c->yuv2packedX = yuv2bgra32_full_X_lsx;
+                c->yuv2packed2 = yuv2bgra32_full_2_lsx;
+                c->yuv2packed1 = yuv2bgra32_full_1_lsx;
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packedX = yuv2bgrx32_full_X_lsx;
+                c->yuv2packed2 = yuv2bgrx32_full_2_lsx;
+                c->yuv2packed1 = yuv2bgrx32_full_1_lsx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_ABGR:
+#if CONFIG_SMALL
+            c->yuv2packedX = yuv2abgr32_full_X_lsx;
+            c->yuv2packed2 = yuv2abgr32_full_2_lsx;
+            c->yuv2packed1 = yuv2abgr32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+                c->yuv2packedX = yuv2abgr32_full_X_lsx;
+                c->yuv2packed2 = yuv2abgr32_full_2_lsx;
+                c->yuv2packed1 = yuv2abgr32_full_1_lsx;
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packedX = yuv2xbgr32_full_X_lsx;
+                c->yuv2packed2 = yuv2xbgr32_full_2_lsx;
+                c->yuv2packed1 = yuv2xbgr32_full_1_lsx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_RGB24:
+            c->yuv2packedX = yuv2rgb24_full_X_lsx;
+            c->yuv2packed2 = yuv2rgb24_full_2_lsx;
+            c->yuv2packed1 = yuv2rgb24_full_1_lsx;
+            break;
+        case AV_PIX_FMT_BGR24:
+            c->yuv2packedX = yuv2bgr24_full_X_lsx;
+            c->yuv2packed2 = yuv2bgr24_full_2_lsx;
+            c->yuv2packed1 = yuv2bgr24_full_1_lsx;
+            break;
+        case AV_PIX_FMT_BGR4_BYTE:
+            c->yuv2packedX = yuv2bgr4_byte_full_X_lsx;
+            c->yuv2packed2 = yuv2bgr4_byte_full_2_lsx;
+            c->yuv2packed1 = yuv2bgr4_byte_full_1_lsx;
+            break;
+        case AV_PIX_FMT_RGB4_BYTE:
+            c->yuv2packedX = yuv2rgb4_byte_full_X_lsx;
+            c->yuv2packed2 = yuv2rgb4_byte_full_2_lsx;
+            c->yuv2packed1 = yuv2rgb4_byte_full_1_lsx;
+            break;
+        case AV_PIX_FMT_BGR8:
+            c->yuv2packedX = yuv2bgr8_full_X_lsx;
+            c->yuv2packed2 = yuv2bgr8_full_2_lsx;
+            c->yuv2packed1 = yuv2bgr8_full_1_lsx;
+            break;
+        case AV_PIX_FMT_RGB8:
+            c->yuv2packedX = yuv2rgb8_full_X_lsx;
+            c->yuv2packed2 = yuv2rgb8_full_2_lsx;
+            c->yuv2packed1 = yuv2rgb8_full_1_lsx;
+            break;
+        }
+    } else {
+        switch (c->dstFormat) {
+        case AV_PIX_FMT_RGB32:
+        case AV_PIX_FMT_BGR32:
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packed1 = yuv2rgbx32_1_lsx;
+                c->yuv2packed2 = yuv2rgbx32_2_lsx;
+                c->yuv2packedX = yuv2rgbx32_X_lsx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_RGB32_1:
+        case AV_PIX_FMT_BGR32_1:
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packed1 = yuv2rgbx32_1_1_lsx;
+                c->yuv2packed2 = yuv2rgbx32_1_2_lsx;
+                c->yuv2packedX = yuv2rgbx32_1_X_lsx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_RGB24:
+            c->yuv2packed1 = yuv2rgb24_1_lsx;
+            c->yuv2packed2 = yuv2rgb24_2_lsx;
+            c->yuv2packedX = yuv2rgb24_X_lsx;
+            break;
+        case AV_PIX_FMT_BGR24:
+            c->yuv2packed1 = yuv2bgr24_1_lsx;
+            c->yuv2packed2 = yuv2bgr24_2_lsx;
+            c->yuv2packedX = yuv2bgr24_X_lsx;
+            break;
+        case AV_PIX_FMT_RGB565LE:
+        case AV_PIX_FMT_RGB565BE:
+        case AV_PIX_FMT_BGR565LE:
+        case AV_PIX_FMT_BGR565BE:
+            c->yuv2packed1 = yuv2rgb16_1_lsx;
+            c->yuv2packed2 = yuv2rgb16_2_lsx;
+            c->yuv2packedX = yuv2rgb16_X_lsx;
+            break;
+        case AV_PIX_FMT_RGB555LE:
+        case AV_PIX_FMT_RGB555BE:
+        case AV_PIX_FMT_BGR555LE:
+        case AV_PIX_FMT_BGR555BE:
+            c->yuv2packed1 = yuv2rgb15_1_lsx;
+            c->yuv2packed2 = yuv2rgb15_2_lsx;
+            c->yuv2packedX = yuv2rgb15_X_lsx;
+            break;
+        case AV_PIX_FMT_RGB444LE:
+        case AV_PIX_FMT_RGB444BE:
+        case AV_PIX_FMT_BGR444LE:
+        case AV_PIX_FMT_BGR444BE:
+            c->yuv2packed1 = yuv2rgb12_1_lsx;
+            c->yuv2packed2 = yuv2rgb12_2_lsx;
+            c->yuv2packedX = yuv2rgb12_X_lsx;
+            break;
+        case AV_PIX_FMT_RGB8:
+        case AV_PIX_FMT_BGR8:
+            c->yuv2packed1 = yuv2rgb8_1_lsx;
+            c->yuv2packed2 = yuv2rgb8_2_lsx;
+            c->yuv2packedX = yuv2rgb8_X_lsx;
+            break;
+        case AV_PIX_FMT_RGB4:
+        case AV_PIX_FMT_BGR4:
+            c->yuv2packed1 = yuv2rgb4_1_lsx;
+            c->yuv2packed2 = yuv2rgb4_2_lsx;
+            c->yuv2packedX = yuv2rgb4_X_lsx;
+            break;
+        case AV_PIX_FMT_RGB4_BYTE:
+        case AV_PIX_FMT_BGR4_BYTE:
+            c->yuv2packed1 = yuv2rgb4b_1_lsx;
+            c->yuv2packed2 = yuv2rgb4b_2_lsx;
+            c->yuv2packedX = yuv2rgb4b_X_lsx;
+            break;
+        }
+    }
+}
diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
new file mode 100644
index 0000000000..aa4c5cbe28
--- /dev/null
+++ b/libswscale/loongarch/swscale.S
@@ -0,0 +1,1868 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
+ *                            const uint8_t *src, const int16_t *filter,
+ *                            const int32_t *filterPos, int filterSize)
+ */
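+/* LoongArch64 argument registers: a0 = c (not referenced), a1 = dst,
+ * a2 = dstW, a3 = src, a4 = filter, a5 = filterPos, a6 = filterSize.
+ * filterSize == 4, == 8 and > 8 take vectorized paths; any other size
+ * falls through to the scalar loop at .END_DSTW4. */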
+function ff_hscale_8_to_15_lsx
+    addi.d           sp,      sp,     -72
+    st.d             s0,      sp,     0
+    st.d             s1,      sp,     8
+    st.d             s2,      sp,     16
+    st.d             s3,      sp,     24
+    st.d             s4,      sp,     32
+    st.d             s5,      sp,     40
+    st.d             s6,      sp,     48
+    st.d             s7,      sp,     56
+    st.d             s8,      sp,     64
+    li.w             t0,      32767
+    li.w             t8,      8
+    li.w             t7,      4
+    vldi             vr0,     0
+    vreplgr2vr.w     vr20,    t0
+    beq              a6,      t7,     .LOOP_DSTW4
+    beq              a6,      t8,     .LOOP_DSTW8
+    blt              t8,      a6,     .LOOP_START
+    b                .END_DSTW4
+
+.LOOP_START:
+    li.w             t1,      0
+    li.w             s1,      0
+    li.w             s2,      0
+    li.w             s3,      0
+    li.w             s4,      0
+    li.w             s5,      0
+    vldi             vr22,    0
+    addi.w           s0,      a6,     -7
+    slli.w           s7,      a6,     1
+    slli.w           s8,      a6,     2
+    add.w            t6,      s7,     s8
+.LOOP_DSTW:
+    ld.w             t2,      a5,     0
+    ld.w             t3,      a5,     4
+    ld.w             t4,      a5,     8
+    ld.w             t5,      a5,     12
+    fldx.d           f1,      a3,     t2
+    fldx.d           f2,      a3,     t3
+    fldx.d           f3,      a3,     t4
+    fldx.d           f4,      a3,     t5
+    vld              vr9,     a4,     0
+    vldx             vr10,    a4,     s7
+    vldx             vr11,    a4,     s8
+    vldx             vr12,    a4,     t6
+    vilvl.b          vr1,     vr0,    vr1
+    vilvl.b          vr2,     vr0,    vr2
+    vilvl.b          vr3,     vr0,    vr3
+    vilvl.b          vr4,     vr0,    vr4
+    vdp2.w.h         vr17,    vr1,    vr9
+    vdp2.w.h         vr18,    vr2,    vr10
+    vdp2.w.h         vr19,    vr3,    vr11
+    vdp2.w.h         vr21,    vr4,    vr12
+    vhaddw.d.w       vr1,     vr17,   vr17
+    vhaddw.d.w       vr2,     vr18,   vr18
+    vhaddw.d.w       vr3,     vr19,   vr19
+    vhaddw.d.w       vr4,     vr21,   vr21
+    vhaddw.q.d       vr1,     vr1,    vr1
+    vhaddw.q.d       vr2,     vr2,    vr2
+    vhaddw.q.d       vr3,     vr3,    vr3
+    vhaddw.q.d       vr4,     vr4,    vr4
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.d          vr1,     vr3,    vr1
+    vadd.w           vr22,    vr22,   vr1
+    addi.w           s1,      s1,     8
+    addi.d           a3,      a3,     8
+    addi.d           a4,      a4,     16
+    blt              s1,      s0,     .LOOP_DSTW
+    blt              s1,      a6,     .DSTWA
+    b                .END_FILTER
+.DSTWA:
+    ld.w             t2,      a5,     0
+    li.w             t3,      0
+    move             s6,      s1
+.FILTERSIZEA:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s2,      s2,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .FILTERSIZEA
+
+    ld.w             t2,      a5,     4
+    li.w             t3,      0
+    move             s6,      s1
+    addi.w           t1,      t1,     1
+.FILTERSIZEB:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s3,      s3,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .FILTERSIZEB
+    ld.w             t2,      a5,     8
+    addi.w           t1,      t1,     1
+    li.w             t3,      0
+    move             s6,      s1
+.FILTERSIZEC:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s4,      s4,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .FILTERSIZEC
+    ld.w             t2,      a5,     12
+    addi.w           t1,      t1,     1
+    move             s6,      s1
+    li.w             t3,      0
+.FILTERSIZED:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s5,      s5,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .FILTERSIZED
+.END_FILTER:
+    vpickve2gr.w     t1,      vr22,   0
+    vpickve2gr.w     t2,      vr22,   1
+    vpickve2gr.w     t3,      vr22,   2
+    vpickve2gr.w     t4,      vr22,   3
+    add.w            s2,      s2,     t1
+    add.w            s3,      s3,     t2
+    add.w            s4,      s4,     t3
+    add.w            s5,      s5,     t4
+    srai.w           s2,      s2,     7
+    srai.w           s3,      s3,     7
+    srai.w           s4,      s4,     7
+    srai.w           s5,      s5,     7
+    slt              t1,      s2,     t0
+    slt              t2,      s3,     t0
+    slt              t3,      s4,     t0
+    slt              t4,      s5,     t0
+    maskeqz          s2,      s2,     t1
+    maskeqz          s3,      s3,     t2
+    maskeqz          s4,      s4,     t3
+    maskeqz          s5,      s5,     t4
+    masknez          t1,      t0,     t1
+    masknez          t2,      t0,     t2
+    masknez          t3,      t0,     t3
+    masknez          t4,      t0,     t4
+    or               s2,      s2,     t1
+    or               s3,      s3,     t2
+    or               s4,      s4,     t3
+    or               s5,      s5,     t4
+    st.h             s2,      a1,     0
+    st.h             s3,      a1,     2
+    st.h             s4,      a1,     4
+    st.h             s5,      a1,     6
+
+    addi.d           a1,      a1,     8
+    sub.d            a3,      a3,     s1
+    addi.d           a5,      a5,     16
+    slli.d           t3,      a6,     3
+    add.d            a4,      a4,     t3
+    sub.d            a4,      a4,     s1
+    sub.d            a4,      a4,     s1
+    addi.d           a2,      a2,     -4
+    bge              a2,      t7,     .LOOP_START
+    blt              zero,    a2,     .RES
+    b                .END_LOOP
+.RES:
+    li.w             t1,      0
+.DSTW:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.FILTERSIZE:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .FILTERSIZE
+    srai.w           t8,      t8,     7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     1
+    stx.h            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .DSTW
+    b                .END_LOOP
+
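+/* filterSize == 8: each 128-bit filter load holds the eight taps of one
+ * output pixel; eight output pixels are produced per iteration. */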
+.LOOP_DSTW8:
+    ld.w             t1,      a5,     0
+    ld.w             t2,      a5,     4
+    ld.w             t3,      a5,     8
+    ld.w             t4,      a5,     12
+    fldx.d           f1,      a3,     t1
+    fldx.d           f2,      a3,     t2
+    fldx.d           f3,      a3,     t3
+    fldx.d           f4,      a3,     t4
+    ld.w             t1,      a5,     16
+    ld.w             t2,      a5,     20
+    ld.w             t3,      a5,     24
+    ld.w             t4,      a5,     28
+    fldx.d           f5,      a3,     t1
+    fldx.d           f6,      a3,     t2
+    fldx.d           f7,      a3,     t3
+    fldx.d           f8,      a3,     t4
+    vld              vr9,     a4,     0
+    vld              vr10,    a4,     16
+    vld              vr11,    a4,     32
+    vld              vr12,    a4,     48
+    vld              vr13,    a4,     64
+    vld              vr14,    a4,     80
+    vld              vr15,    a4,     96
+    vld              vr16,    a4,     112
+    vilvl.b          vr1,     vr0,    vr1
+    vilvl.b          vr2,     vr0,    vr2
+    vilvl.b          vr3,     vr0,    vr3
+    vilvl.b          vr4,     vr0,    vr4
+    vilvl.b          vr5,     vr0,    vr5
+    vilvl.b          vr6,     vr0,    vr6
+    vilvl.b          vr7,     vr0,    vr7
+    vilvl.b          vr8,     vr0,    vr8
+
+    vdp2.w.h         vr17,    vr1,    vr9
+    vdp2.w.h         vr18,    vr2,    vr10
+    vdp2.w.h         vr19,    vr3,    vr11
+    vdp2.w.h         vr21,    vr4,    vr12
+    vdp2.w.h         vr1,     vr5,    vr13
+    vdp2.w.h         vr2,     vr6,    vr14
+    vdp2.w.h         vr3,     vr7,    vr15
+    vdp2.w.h         vr4,     vr8,    vr16
+    vhaddw.d.w       vr5,     vr1,    vr1
+    vhaddw.d.w       vr6,     vr2,    vr2
+    vhaddw.d.w       vr7,     vr3,    vr3
+    vhaddw.d.w       vr8,     vr4,    vr4
+    vhaddw.d.w       vr1,     vr17,   vr17
+    vhaddw.d.w       vr2,     vr18,   vr18
+    vhaddw.d.w       vr3,     vr19,   vr19
+    vhaddw.d.w       vr4,     vr21,   vr21
+    vhaddw.q.d       vr1,     vr1,    vr1
+    vhaddw.q.d       vr2,     vr2,    vr2
+    vhaddw.q.d       vr3,     vr3,    vr3
+    vhaddw.q.d       vr4,     vr4,    vr4
+    vhaddw.q.d       vr5,     vr5,    vr5
+    vhaddw.q.d       vr6,     vr6,    vr6
+    vhaddw.q.d       vr7,     vr7,    vr7
+    vhaddw.q.d       vr8,     vr8,    vr8
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.w          vr5,     vr6,    vr5
+    vilvl.w          vr7,     vr8,    vr7
+    vilvl.d          vr1,     vr3,    vr1
+    vilvl.d          vr5,     vr7,    vr5
+    vsrai.w          vr1,     vr1,    7
+    vsrai.w          vr5,     vr5,    7
+    vmin.w           vr1,     vr1,    vr20
+    vmin.w           vr5,     vr5,    vr20
+
+    vpickev.h        vr1,     vr5,    vr1
+    vst              vr1,     a1,     0
+    addi.d           a1,      a1,     16
+    addi.d           a5,      a5,     32
+    addi.d           a4,      a4,     128
+    addi.d           a2,      a2,     -8
+    bge              a2,      t8,     .LOOP_DSTW8
+    blt              zero,    a2,     .RES8
+    b                .END_LOOP
+.RES8:
+    li.w             t1,      0
+.DSTW8:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.FILTERSIZE8:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .FILTERSIZE8
+    srai.w           t8,      t8,     7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     1
+    stx.h            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .DSTW8
+    b                .END_LOOP
+
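+/* filterSize == 4: each 128-bit filter load covers the taps of two output
+ * pixels; eight output pixels are produced per iteration. */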
+.LOOP_DSTW4:
+    ld.w             t1,      a5,     0
+    ld.w             t2,      a5,     4
+    ld.w             t3,      a5,     8
+    ld.w             t4,      a5,     12
+    fldx.s           f1,      a3,     t1
+    fldx.s           f2,      a3,     t2
+    fldx.s           f3,      a3,     t3
+    fldx.s           f4,      a3,     t4
+    ld.w             t1,      a5,     16
+    ld.w             t2,      a5,     20
+    ld.w             t3,      a5,     24
+    ld.w             t4,      a5,     28
+    fldx.s           f5,      a3,     t1
+    fldx.s           f6,      a3,     t2
+    fldx.s           f7,      a3,     t3
+    fldx.s           f8,      a3,     t4
+    vld              vr9,     a4,     0
+    vld              vr10,    a4,     16
+    vld              vr11,    a4,     32
+    vld              vr12,    a4,     48
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.w          vr5,     vr6,    vr5
+    vilvl.w          vr7,     vr8,    vr7
+    vilvl.b          vr1,     vr0,    vr1
+    vilvl.b          vr3,     vr0,    vr3
+    vilvl.b          vr5,     vr0,    vr5
+    vilvl.b          vr7,     vr0,    vr7
+
+    vdp2.w.h         vr13,    vr1,    vr9
+    vdp2.w.h         vr14,    vr3,    vr10
+    vdp2.w.h         vr15,    vr5,    vr11
+    vdp2.w.h         vr16,    vr7,    vr12
+    vhaddw.d.w       vr13,    vr13,   vr13
+    vhaddw.d.w       vr14,    vr14,   vr14
+    vhaddw.d.w       vr15,    vr15,   vr15
+    vhaddw.d.w       vr16,    vr16,   vr16
+    vpickev.w        vr13,    vr14,   vr13
+    vpickev.w        vr15,    vr16,   vr15
+    vsrai.w          vr13,    vr13,   7
+    vsrai.w          vr15,    vr15,   7
+    vmin.w           vr13,    vr13,   vr20
+    vmin.w           vr15,    vr15,   vr20
+
+    vpickev.h        vr13,    vr15,   vr13
+    vst              vr13,    a1,     0
+    addi.d           a1,      a1,     16
+    addi.d           a5,      a5,     32
+    addi.d           a4,      a4,     64
+    addi.d           a2,      a2,     -8
+    bge              a2,      t8,     .LOOP_DSTW4
+    blt              zero,    a2,     .RES4
+    b                .END_LOOP
+.RES4:
+    li.w             t1,      0
+.DSTW4:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.FILTERSIZE4:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .FILTERSIZE4
+    srai.w           t8,      t8,     7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     1
+    stx.h            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .DSTW4
+    b                .END_LOOP
+.END_DSTW4:
+
+    li.w             t1,      0
+.LOOP_DSTW1:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.FILTERSIZE1:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .FILTERSIZE1
+    srai.w           t8,      t8,     7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     1
+    stx.h            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .LOOP_DSTW1
+    b                .END_LOOP
+.END_LOOP:
+
+    ld.d             s0,      sp,     0
+    ld.d             s1,      sp,     8
+    ld.d             s2,      sp,     16
+    ld.d             s3,      sp,     24
+    ld.d             s4,      sp,     32
+    ld.d             s5,      sp,     40
+    ld.d             s6,      sp,     48
+    ld.d             s7,      sp,     56
+    ld.d             s8,      sp,     64
+    addi.d           sp,      sp,     72
+endfunc
+
+/* void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *dst, int dstW,
+ *                            const uint8_t *src, const int16_t *filter,
+ *                            const int32_t *filterPos, int filterSize)
+ */
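+/* Same layout as ff_hscale_8_to_15_lsx above, but the accumulated sums are
+ * shifted right by 3 instead of 7, clipped to the 19-bit maximum (524287)
+ * and stored as 32-bit words. */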
+function ff_hscale_8_to_19_lsx
+    addi.d           sp,      sp,     -72
+    st.d             s0,      sp,     0
+    st.d             s1,      sp,     8
+    st.d             s2,      sp,     16
+    st.d             s3,      sp,     24
+    st.d             s4,      sp,     32
+    st.d             s5,      sp,     40
+    st.d             s6,      sp,     48
+    st.d             s7,      sp,     56
+    st.d             s8,      sp,     64
+    li.w             t0,      524287
+    li.w             t8,      8
+    li.w             t7,      4
+    vldi             vr0,     0
+    vreplgr2vr.w     vr20,    t0
+    beq              a6,      t7,     .LOOP_DST4
+    beq              a6,      t8,     .LOOP_DST8
+    blt              t8,      a6,     .LOOP
+    b                .END_DST4
+
+.LOOP:
+    li.w             t1,      0
+    li.w             s1,      0
+    li.w             s2,      0
+    li.w             s3,      0
+    li.w             s4,      0
+    li.w             s5,      0
+    vldi             vr22,    0
+    addi.w           s0,      a6,     -7
+    slli.w           s7,      a6,     1
+    slli.w           s8,      a6,     2
+    add.w            t6,      s7,     s8
+.LOOP_DST:
+    ld.w             t2,      a5,     0
+    ld.w             t3,      a5,     4
+    ld.w             t4,      a5,     8
+    ld.w             t5,      a5,     12
+    fldx.d           f1,      a3,     t2
+    fldx.d           f2,      a3,     t3
+    fldx.d           f3,      a3,     t4
+    fldx.d           f4,      a3,     t5
+    vld              vr9,     a4,     0
+    vldx             vr10,    a4,     s7
+    vldx             vr11,    a4,     s8
+    vldx             vr12,    a4,     t6
+    vilvl.b          vr1,     vr0,    vr1
+    vilvl.b          vr2,     vr0,    vr2
+    vilvl.b          vr3,     vr0,    vr3
+    vilvl.b          vr4,     vr0,    vr4
+    vdp2.w.h         vr17,    vr1,    vr9
+    vdp2.w.h         vr18,    vr2,    vr10
+    vdp2.w.h         vr19,    vr3,    vr11
+    vdp2.w.h         vr21,    vr4,    vr12
+    vhaddw.d.w       vr1,     vr17,   vr17
+    vhaddw.d.w       vr2,     vr18,   vr18
+    vhaddw.d.w       vr3,     vr19,   vr19
+    vhaddw.d.w       vr4,     vr21,   vr21
+    vhaddw.q.d       vr1,     vr1,    vr1
+    vhaddw.q.d       vr2,     vr2,    vr2
+    vhaddw.q.d       vr3,     vr3,    vr3
+    vhaddw.q.d       vr4,     vr4,    vr4
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.d          vr1,     vr3,    vr1
+    vadd.w           vr22,    vr22,   vr1
+    addi.w           s1,      s1,     8
+    addi.d           a3,      a3,     8
+    addi.d           a4,      a4,     16
+    blt              s1,      s0,     .LOOP_DST
+    blt              s1,      a6,     .DSTA
+    b                .END_FILTERA
+.DSTA:
+    ld.w             t2,      a5,     0
+    li.w             t3,      0
+    move             s6,      s1
+.FILTERA:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s2,      s2,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .FILTERA
+
+    ld.w             t2,      a5,     4
+    li.w             t3,      0
+    move             s6,      s1
+    addi.w           t1,      t1,     1
+.FILTERB:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s3,      s3,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .FILTERB
+    ld.w             t2,      a5,     8
+    addi.w           t1,      t1,     1
+    li.w             t3,      0
+    move             s6,      s1
+.FILTERC:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s4,      s4,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .FILTERC
+    ld.w             t2,      a5,     12
+    addi.w           t1,      t1,     1
+    move             s6,      s1
+    li.w             t3,      0
+.FILTERD:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s5,      s5,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .FILTERD
+.END_FILTERA:
+    vpickve2gr.w     t1,      vr22,   0
+    vpickve2gr.w     t2,      vr22,   1
+    vpickve2gr.w     t3,      vr22,   2
+    vpickve2gr.w     t4,      vr22,   3
+    add.w            s2,      s2,     t1
+    add.w            s3,      s3,     t2
+    add.w            s4,      s4,     t3
+    add.w            s5,      s5,     t4
+    srai.w           s2,      s2,     3
+    srai.w           s3,      s3,     3
+    srai.w           s4,      s4,     3
+    srai.w           s5,      s5,     3
+    slt              t1,      s2,     t0
+    slt              t2,      s3,     t0
+    slt              t3,      s4,     t0
+    slt              t4,      s5,     t0
+    maskeqz          s2,      s2,     t1
+    maskeqz          s3,      s3,     t2
+    maskeqz          s4,      s4,     t3
+    maskeqz          s5,      s5,     t4
+    masknez          t1,      t0,     t1
+    masknez          t2,      t0,     t2
+    masknez          t3,      t0,     t3
+    masknez          t4,      t0,     t4
+    or               s2,      s2,     t1
+    or               s3,      s3,     t2
+    or               s4,      s4,     t3
+    or               s5,      s5,     t4
+    st.w             s2,      a1,     0
+    st.w             s3,      a1,     4
+    st.w             s4,      a1,     8
+    st.w             s5,      a1,     12
+
+    addi.d           a1,      a1,     16
+    sub.d            a3,      a3,     s1
+    addi.d           a5,      a5,     16
+    slli.d           t3,      a6,     3
+    add.d            a4,      a4,     t3
+    sub.d            a4,      a4,     s1
+    sub.d            a4,      a4,     s1
+    addi.d           a2,      a2,     -4
+    bge              a2,      t7,     .LOOP
+    blt              zero,    a2,     .RESA
+    b                .END
+.RESA:
+    li.w             t1,      0
+.DST:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.FILTER:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .FILTER
+    srai.w           t8,      t8,     3
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     2
+    stx.w            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .DST
+    b                .END
+
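+/* filterSize == 8 path: 8 output pixels per iteration, 8 filter taps each,
+ * results stored as 32-bit words; leftover pixels fall through to the scalar
+ * .REST8/.DST8 code. */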
+.LOOP_DST8:
+    ld.w             t1,      a5,     0
+    ld.w             t2,      a5,     4
+    ld.w             t3,      a5,     8
+    ld.w             t4,      a5,     12
+    fldx.d           f1,      a3,     t1
+    fldx.d           f2,      a3,     t2
+    fldx.d           f3,      a3,     t3
+    fldx.d           f4,      a3,     t4
+    ld.w             t1,      a5,     16
+    ld.w             t2,      a5,     20
+    ld.w             t3,      a5,     24
+    ld.w             t4,      a5,     28
+    fldx.d           f5,      a3,     t1
+    fldx.d           f6,      a3,     t2
+    fldx.d           f7,      a3,     t3
+    fldx.d           f8,      a3,     t4
+    vld              vr9,     a4,     0
+    vld              vr10,    a4,     16
+    vld              vr11,    a4,     32
+    vld              vr12,    a4,     48
+    vld              vr13,    a4,     64
+    vld              vr14,    a4,     80
+    vld              vr15,    a4,     96
+    vld              vr16,    a4,     112
+    vilvl.b          vr1,     vr0,    vr1
+    vilvl.b          vr2,     vr0,    vr2
+    vilvl.b          vr3,     vr0,    vr3
+    vilvl.b          vr4,     vr0,    vr4
+    vilvl.b          vr5,     vr0,    vr5
+    vilvl.b          vr6,     vr0,    vr6
+    vilvl.b          vr7,     vr0,    vr7
+    vilvl.b          vr8,     vr0,    vr8
+
+    vdp2.w.h         vr17,    vr1,    vr9
+    vdp2.w.h         vr18,    vr2,    vr10
+    vdp2.w.h         vr19,    vr3,    vr11
+    vdp2.w.h         vr21,    vr4,    vr12
+    vdp2.w.h         vr1,     vr5,    vr13
+    vdp2.w.h         vr2,     vr6,    vr14
+    vdp2.w.h         vr3,     vr7,    vr15
+    vdp2.w.h         vr4,     vr8,    vr16
+    vhaddw.d.w       vr5,     vr1,    vr1
+    vhaddw.d.w       vr6,     vr2,    vr2
+    vhaddw.d.w       vr7,     vr3,    vr3
+    vhaddw.d.w       vr8,     vr4,    vr4
+    vhaddw.d.w       vr1,     vr17,   vr17
+    vhaddw.d.w       vr2,     vr18,   vr18
+    vhaddw.d.w       vr3,     vr19,   vr19
+    vhaddw.d.w       vr4,     vr21,   vr21
+    vhaddw.q.d       vr1,     vr1,    vr1
+    vhaddw.q.d       vr2,     vr2,    vr2
+    vhaddw.q.d       vr3,     vr3,    vr3
+    vhaddw.q.d       vr4,     vr4,    vr4
+    vhaddw.q.d       vr5,     vr5,    vr5
+    vhaddw.q.d       vr6,     vr6,    vr6
+    vhaddw.q.d       vr7,     vr7,    vr7
+    vhaddw.q.d       vr8,     vr8,    vr8
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.w          vr5,     vr6,    vr5
+    vilvl.w          vr7,     vr8,    vr7
+    vilvl.d          vr1,     vr3,    vr1
+    vilvl.d          vr5,     vr7,    vr5
+    vsrai.w          vr1,     vr1,    3
+    vsrai.w          vr5,     vr5,    3
+    vmin.w           vr1,     vr1,    vr20
+    vmin.w           vr5,     vr5,    vr20
+
+    vst              vr1,     a1,     0
+    vst              vr5,     a1,     16
+    addi.d           a1,      a1,     32
+    addi.d           a5,      a5,     32
+    addi.d           a4,      a4,     128
+    addi.d           a2,      a2,     -8
+    bge              a2,      t8,     .LOOP_DST8
+    blt              zero,    a2,     .REST8
+    b                .END
+.REST8:
+    li.w             t1,      0
+.DST8:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.FILTER8:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .FILTER8
+    srai.w           t8,      t8,     3
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     2
+    stx.w            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .DST8
+    b                .END
+
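+/* filterSize == 4 path: 8 output pixels per iteration, 4 filter taps each;
+ * leftover pixels fall through to the scalar .REST4/.DST4 code. */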
+.LOOP_DST4:
+    ld.w             t1,      a5,     0
+    ld.w             t2,      a5,     4
+    ld.w             t3,      a5,     8
+    ld.w             t4,      a5,     12
+    fldx.s           f1,      a3,     t1
+    fldx.s           f2,      a3,     t2
+    fldx.s           f3,      a3,     t3
+    fldx.s           f4,      a3,     t4
+    ld.w             t1,      a5,     16
+    ld.w             t2,      a5,     20
+    ld.w             t3,      a5,     24
+    ld.w             t4,      a5,     28
+    fldx.s           f5,      a3,     t1
+    fldx.s           f6,      a3,     t2
+    fldx.s           f7,      a3,     t3
+    fldx.s           f8,      a3,     t4
+    vld              vr9,     a4,     0
+    vld              vr10,    a4,     16
+    vld              vr11,    a4,     32
+    vld              vr12,    a4,     48
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.w          vr5,     vr6,    vr5
+    vilvl.w          vr7,     vr8,    vr7
+    vilvl.b          vr1,     vr0,    vr1
+    vilvl.b          vr3,     vr0,    vr3
+    vilvl.b          vr5,     vr0,    vr5
+    vilvl.b          vr7,     vr0,    vr7
+
+    vdp2.w.h         vr13,    vr1,    vr9
+    vdp2.w.h         vr14,    vr3,    vr10
+    vdp2.w.h         vr15,    vr5,    vr11
+    vdp2.w.h         vr16,    vr7,    vr12
+    vhaddw.d.w       vr13,    vr13,   vr13
+    vhaddw.d.w       vr14,    vr14,   vr14
+    vhaddw.d.w       vr15,    vr15,   vr15
+    vhaddw.d.w       vr16,    vr16,   vr16
+    vpickev.w        vr13,    vr14,   vr13
+    vpickev.w        vr15,    vr16,   vr15
+    vsrai.w          vr13,    vr13,   3
+    vsrai.w          vr15,    vr15,   3
+    vmin.w           vr13,    vr13,   vr20
+    vmin.w           vr15,    vr15,   vr20
+
+    vst              vr13,    a1,     0
+    vst              vr15,    a1,     16
+    addi.d           a1,      a1,     32
+    addi.d           a5,      a5,     32
+    addi.d           a4,      a4,     64
+    addi.d           a2,      a2,     -8
+    bge              a2,      t8,     .LOOP_DST4
+    blt              zero,    a2,     .REST4
+    b                .END
+.REST4:
+    li.w             t1,      0
+.DST4:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.FILTER4:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .FILTER4
+    srai.w           t8,      t8,     3
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     2
+    stx.w            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .DST4
+    b                .END
+.END_DST4:
+
+    li.w             t1,      0
+.LOOP_DST1:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.FILTER1:
+    add.w            t4,      t2,     t3
+    ldx.bu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .FILTER1
+    srai.w           t8,      t8,     3
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     2
+    stx.w            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .LOOP_DST1
+    b                .END
+.END:
+
+    ld.d             s0,      sp,     0
+    ld.d             s1,      sp,     8
+    ld.d             s2,      sp,     16
+    ld.d             s3,      sp,     24
+    ld.d             s4,      sp,     32
+    ld.d             s5,      sp,     40
+    ld.d             s6,      sp,     48
+    ld.d             s7,      sp,     56
+    ld.d             s8,      sp,     64
+    addi.d           sp,      sp,     72
+endfunc
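+
+/* What the function above computes, as a rough C sketch for readability (not
+ * the exact FFmpeg C source); 'max' stands for the clamp value preloaded into
+ * t0/vr20 before these loops:
+ *
+ *     for (i = 0; i < dstW; i++) {
+ *         int j, val = 0;
+ *         for (j = 0; j < filterSize; j++)
+ *             val += src[filterPos[i] + j] * filter[filterSize * i + j];
+ *         dst[i] = FFMIN(val >> 3, max);
+ *     }
+ *
+ * The slt/maskeqz/masknez/or sequences are a branchless FFMIN(val, max): slt
+ * yields the compare mask, maskeqz/masknez select val or max, or merges them. */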
+
+/* void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
+ *                                 const uint8_t *src, const int16_t *filter,
+ *                                 const int32_t *filterPos, int filterSize, int sh)
+ */
+function ff_hscale_16_to_15_sub_lsx
+    addi.d           sp,      sp,     -72
+    st.d             s0,      sp,     0
+    st.d             s1,      sp,     8
+    st.d             s2,      sp,     16
+    st.d             s3,      sp,     24
+    st.d             s4,      sp,     32
+    st.d             s5,      sp,     40
+    st.d             s6,      sp,     48
+    st.d             s7,      sp,     56
+    st.d             s8,      sp,     64
+    li.w             t0,      32767
+    li.w             t8,      8
+    li.w             t7,      4
+    vreplgr2vr.w     vr20,    t0
+    vreplgr2vr.w     vr0,     a7
+    beq              a6,      t7,     .LOOP_HS15_DST4
+    beq              a6,      t8,     .LOOP_HS15_DST8
+    blt              t8,      a6,     .LOOP_HS15
+    b                .END_HS15_DST4
+
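+/* filterSize > 8: 4 output pixels per pass; .LOOP_HS15_DST accumulates 8 taps
+ * at a time with LSX, .HS15_FILTERA-.HS15_FILTERD finish the remaining taps
+ * scalarly. */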
+.LOOP_HS15:
+    li.w             t1,      0
+    li.w             s1,      0
+    li.w             s2,      0
+    li.w             s3,      0
+    li.w             s4,      0
+    li.w             s5,      0
+    vldi             vr22,    0
+    addi.w           s0,      a6,     -7
+    slli.w           s7,      a6,     1
+    slli.w           s8,      a6,     2
+    add.w            t6,      s7,     s8
+.LOOP_HS15_DST:
+    ld.w             t2,      a5,     0
+    ld.w             t3,      a5,     4
+    ld.w             t4,      a5,     8
+    ld.w             t5,      a5,     12
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    slli.w           t5,      t5,     1
+    vldx             vr1,     a3,     t2
+    vldx             vr2,     a3,     t3
+    vldx             vr3,     a3,     t4
+    vldx             vr4,     a3,     t5
+    vld              vr9,     a4,     0
+    vldx             vr10,    a4,     s7
+    vldx             vr11,    a4,     s8
+    vldx             vr12,    a4,     t6
+    vmulwev.w.hu.h   vr17,    vr1,    vr9
+    vmulwev.w.hu.h   vr18,    vr2,    vr10
+    vmulwev.w.hu.h   vr19,    vr3,    vr11
+    vmulwev.w.hu.h   vr21,    vr4,    vr12
+    vmaddwod.w.hu.h  vr17,    vr1,    vr9
+    vmaddwod.w.hu.h  vr18,    vr2,    vr10
+    vmaddwod.w.hu.h  vr19,    vr3,    vr11
+    vmaddwod.w.hu.h  vr21,    vr4,    vr12
+    vhaddw.d.w       vr1,     vr17,   vr17
+    vhaddw.d.w       vr2,     vr18,   vr18
+    vhaddw.d.w       vr3,     vr19,   vr19
+    vhaddw.d.w       vr4,     vr21,   vr21
+    vhaddw.q.d       vr1,     vr1,    vr1
+    vhaddw.q.d       vr2,     vr2,    vr2
+    vhaddw.q.d       vr3,     vr3,    vr3
+    vhaddw.q.d       vr4,     vr4,    vr4
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.d          vr1,     vr3,    vr1
+    vadd.w           vr22,    vr22,   vr1
+    addi.w           s1,      s1,     8
+    addi.d           a3,      a3,     16
+    addi.d           a4,      a4,     16
+    blt              s1,      s0,     .LOOP_HS15_DST
+    blt              s1,      a6,     .HS15_DSTA
+    b                .END_HS15_FILTERA
+.HS15_DSTA:
+    ld.w             t2,      a5,     0
+    li.w             t3,      0
+    move             s6,      s1
+.HS15_FILTERA:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s2,      s2,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .HS15_FILTERA
+
+    ld.w             t2,      a5,     4
+    li.w             t3,      0
+    move             s6,      s1
+    addi.w           t1,      t1,     1
+.HS15_FILTERB:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s3,      s3,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .HS15_FILTERB
+    ld.w             t2,      a5,     8
+    addi.w           t1,      t1,     1
+    li.w             t3,      0
+    move             s6,      s1
+.HS15_FILTERC:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s4,      s4,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .HS15_FILTERC
+    ld.w             t2,      a5,     12
+    addi.w           t1,      t1,     1
+    move             s6,      s1
+    li.w             t3,      0
+.HS15_FILTERD:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s5,      s5,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .HS15_FILTERD
+.END_HS15_FILTERA:
+    vpickve2gr.w     t1,      vr22,   0
+    vpickve2gr.w     t2,      vr22,   1
+    vpickve2gr.w     t3,      vr22,   2
+    vpickve2gr.w     t4,      vr22,   3
+    add.w            s2,      s2,     t1
+    add.w            s3,      s3,     t2
+    add.w            s4,      s4,     t3
+    add.w            s5,      s5,     t4
+    sra.w            s2,      s2,     a7
+    sra.w            s3,      s3,     a7
+    sra.w            s4,      s4,     a7
+    sra.w            s5,      s5,     a7
+    slt              t1,      s2,     t0
+    slt              t2,      s3,     t0
+    slt              t3,      s4,     t0
+    slt              t4,      s5,     t0
+    maskeqz          s2,      s2,     t1
+    maskeqz          s3,      s3,     t2
+    maskeqz          s4,      s4,     t3
+    maskeqz          s5,      s5,     t4
+    masknez          t1,      t0,     t1
+    masknez          t2,      t0,     t2
+    masknez          t3,      t0,     t3
+    masknez          t4,      t0,     t4
+    or               s2,      s2,     t1
+    or               s3,      s3,     t2
+    or               s4,      s4,     t3
+    or               s5,      s5,     t4
+    st.h             s2,      a1,     0
+    st.h             s3,      a1,     2
+    st.h             s4,      a1,     4
+    st.h             s5,      a1,     6
+
+    addi.d           a1,      a1,     8
+    sub.d            a3,      a3,     s1
+    sub.d            a3,      a3,     s1
+    addi.d           a5,      a5,     16
+    slli.d           t3,      a6,     3
+    add.d            a4,      a4,     t3
+    sub.d            a4,      a4,     s1
+    sub.d            a4,      a4,     s1
+    addi.d           a2,      a2,     -4
+    bge              a2,      t7,     .LOOP_HS15
+    blt              zero,    a2,     .HS15_RESA
+    b                .HS15_END
+.HS15_RESA:
+    li.w             t1,      0
+.HS15_DST:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.HS15_FILTER:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .HS15_FILTER
+    sra.w            t8,      t8,     a7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     1
+    stx.h            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .HS15_DST
+    b                .HS15_END
+
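+/* filterSize == 8: 8 output pixels per iteration; any remaining pixels are
+ * handled by the scalar .HS15_REST8/.HS15_DST8 code. */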
+.LOOP_HS15_DST8:
+    ld.w             t1,      a5,     0
+    ld.w             t2,      a5,     4
+    ld.w             t3,      a5,     8
+    ld.w             t4,      a5,     12
+    slli.w           t1,      t1,     1
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    vldx             vr1,     a3,     t1
+    vldx             vr2,     a3,     t2
+    vldx             vr3,     a3,     t3
+    vldx             vr4,     a3,     t4
+    ld.w             t1,      a5,     16
+    ld.w             t2,      a5,     20
+    ld.w             t3,      a5,     24
+    ld.w             t4,      a5,     28
+    slli.w           t1,      t1,     1
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    vldx             vr5,     a3,     t1
+    vldx             vr6,     a3,     t2
+    vldx             vr7,     a3,     t3
+    vldx             vr8,     a3,     t4
+    vld              vr9,     a4,     0
+    vld              vr10,    a4,     16
+    vld              vr11,    a4,     32
+    vld              vr12,    a4,     48
+    vld              vr13,    a4,     64
+    vld              vr14,    a4,     80
+    vld              vr15,    a4,     96
+    vld              vr16,    a4,     112
+
+    vmulwev.w.hu.h   vr17,    vr1,    vr9
+    vmulwev.w.hu.h   vr18,    vr2,    vr10
+    vmulwev.w.hu.h   vr19,    vr3,    vr11
+    vmulwev.w.hu.h   vr21,    vr4,    vr12
+    vmaddwod.w.hu.h  vr17,    vr1,    vr9
+    vmaddwod.w.hu.h  vr18,    vr2,    vr10
+    vmaddwod.w.hu.h  vr19,    vr3,    vr11
+    vmaddwod.w.hu.h  vr21,    vr4,    vr12
+    vmulwev.w.hu.h   vr1,     vr5,    vr13
+    vmulwev.w.hu.h   vr2,     vr6,    vr14
+    vmulwev.w.hu.h   vr3,     vr7,    vr15
+    vmulwev.w.hu.h   vr4,     vr8,    vr16
+    vmaddwod.w.hu.h  vr1,     vr5,    vr13
+    vmaddwod.w.hu.h  vr2,     vr6,    vr14
+    vmaddwod.w.hu.h  vr3,     vr7,    vr15
+    vmaddwod.w.hu.h  vr4,     vr8,    vr16
+    vhaddw.d.w       vr5,     vr1,    vr1
+    vhaddw.d.w       vr6,     vr2,    vr2
+    vhaddw.d.w       vr7,     vr3,    vr3
+    vhaddw.d.w       vr8,     vr4,    vr4
+    vhaddw.d.w       vr1,     vr17,   vr17
+    vhaddw.d.w       vr2,     vr18,   vr18
+    vhaddw.d.w       vr3,     vr19,   vr19
+    vhaddw.d.w       vr4,     vr21,   vr21
+    vhaddw.q.d       vr1,     vr1,    vr1
+    vhaddw.q.d       vr2,     vr2,    vr2
+    vhaddw.q.d       vr3,     vr3,    vr3
+    vhaddw.q.d       vr4,     vr4,    vr4
+    vhaddw.q.d       vr5,     vr5,    vr5
+    vhaddw.q.d       vr6,     vr6,    vr6
+    vhaddw.q.d       vr7,     vr7,    vr7
+    vhaddw.q.d       vr8,     vr8,    vr8
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.w          vr5,     vr6,    vr5
+    vilvl.w          vr7,     vr8,    vr7
+    vilvl.d          vr1,     vr3,    vr1
+    vilvl.d          vr5,     vr7,    vr5
+    vsra.w           vr1,     vr1,    vr0
+    vsra.w           vr5,     vr5,    vr0
+    vmin.w           vr1,     vr1,    vr20
+    vmin.w           vr5,     vr5,    vr20
+
+    vpickev.h        vr1,     vr5,    vr1
+    vst              vr1,     a1,     0
+    addi.d           a1,      a1,     16
+    addi.d           a5,      a5,     32
+    addi.d           a4,      a4,     128
+    addi.d           a2,      a2,     -8
+    bge              a2,      t8,     .LOOP_HS15_DST8
+    blt              zero,    a2,     .HS15_REST8
+    b                .HS15_END
+.HS15_REST8:
+    li.w             t1,      0
+.HS15_DST8:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.HS15_FILTER8:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .HS15_FILTER8
+    sra.w            t8,      t8,     a7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     1
+    stx.h            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .HS15_DST8
+    b                .HS15_END
+
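+/* filterSize == 4: 8 output pixels per iteration; any remaining pixels are
+ * handled by the scalar .HS15_REST4/.HS15_DST4 code. */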
+.LOOP_HS15_DST4:
+    ld.w             t1,      a5,     0
+    ld.w             t2,      a5,     4
+    ld.w             t3,      a5,     8
+    ld.w             t4,      a5,     12
+    slli.w           t1,      t1,     1
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    fldx.d           f1,      a3,     t1
+    fldx.d           f2,      a3,     t2
+    fldx.d           f3,      a3,     t3
+    fldx.d           f4,      a3,     t4
+    ld.w             t1,      a5,     16
+    ld.w             t2,      a5,     20
+    ld.w             t3,      a5,     24
+    ld.w             t4,      a5,     28
+    slli.w           t1,      t1,     1
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    fldx.d           f5,      a3,     t1
+    fldx.d           f6,      a3,     t2
+    fldx.d           f7,      a3,     t3
+    fldx.d           f8,      a3,     t4
+    vld              vr9,     a4,     0
+    vld              vr10,    a4,     16
+    vld              vr11,    a4,     32
+    vld              vr12,    a4,     48
+    vilvl.d          vr1,     vr2,    vr1
+    vilvl.d          vr3,     vr4,    vr3
+    vilvl.d          vr5,     vr6,    vr5
+    vilvl.d          vr7,     vr8,    vr7
+    vmulwev.w.hu.h   vr13,    vr1,    vr9
+    vmulwev.w.hu.h   vr14,    vr3,    vr10
+    vmulwev.w.hu.h   vr15,    vr5,    vr11
+    vmulwev.w.hu.h   vr16,    vr7,    vr12
+    vmaddwod.w.hu.h  vr13,    vr1,    vr9
+    vmaddwod.w.hu.h  vr14,    vr3,    vr10
+    vmaddwod.w.hu.h  vr15,    vr5,    vr11
+    vmaddwod.w.hu.h  vr16,    vr7,    vr12
+    vhaddw.d.w       vr13,    vr13,   vr13
+    vhaddw.d.w       vr14,    vr14,   vr14
+    vhaddw.d.w       vr15,    vr15,   vr15
+    vhaddw.d.w       vr16,    vr16,   vr16
+    vpickev.w        vr13,    vr14,   vr13
+    vpickev.w        vr15,    vr16,   vr15
+    vsra.w           vr13,    vr13,   vr0
+    vsra.w           vr15,    vr15,   vr0
+    vmin.w           vr13,    vr13,   vr20
+    vmin.w           vr15,    vr15,   vr20
+
+    vpickev.h        vr13,    vr15,   vr13
+    vst              vr13,    a1,     0
+    addi.d           a1,      a1,     16
+    addi.d           a5,      a5,     32
+    addi.d           a4,      a4,     64
+    addi.d           a2,      a2,     -8
+    bge              a2,      t8,     .LOOP_HS15_DST4
+    blt              zero,    a2,     .HS15_REST4
+    b                .HS15_END
+.HS15_REST4:
+    li.w             t1,      0
+.HS15_DST4:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.HS15_FILTER4:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .HS15_FILTER4
+    sra.w            t8,      t8,     a7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     1
+    stx.h            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .HS15_DST4
+    b                .HS15_END
+.END_HS15_DST4:
+
+    li.w             t1,      0
+.LOOP_HS15_DST1:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.HS15_FILTER1:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .HS15_FILTER1
+    sra.w            t8,      t8,     a7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     1
+    stx.h            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .LOOP_HS15_DST1
+    b                .HS15_END
+.HS15_END:
+
+    ld.d             s0,      sp,     0
+    ld.d             s1,      sp,     8
+    ld.d             s2,      sp,     16
+    ld.d             s3,      sp,     24
+    ld.d             s4,      sp,     32
+    ld.d             s5,      sp,     40
+    ld.d             s6,      sp,     48
+    ld.d             s7,      sp,     56
+    ld.d             s8,      sp,     64
+    addi.d           sp,      sp,     72
+endfunc
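+
+/* What ff_hscale_16_to_15_sub_lsx computes, as a rough C sketch for
+ * readability (not the exact FFmpeg C source):
+ *
+ *     const uint16_t *s = (const uint16_t *) src;
+ *     for (i = 0; i < dstW; i++) {
+ *         int j, val = 0;
+ *         for (j = 0; j < filterSize; j++)
+ *             val += s[filterPos[i] + j] * filter[filterSize * i + j];
+ *         dst[i] = FFMIN(val >> sh, 32767);
+ *     }
+ *
+ * vmulwev.w.hu.h/vmaddwod.w.hu.h form the u16*i16 products of the even and odd
+ * lanes, and the vhaddw chains reduce each row of products to a single sum. */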
+
+/* void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
+ *                                 const uint8_t *src, const int16_t *filter,
+ *                                 const int32_t *filterPos, int filterSize, int sh)
+ */
+function ff_hscale_16_to_19_sub_lsx
+    addi.d           sp,      sp,     -72
+    st.d             s0,      sp,     0
+    st.d             s1,      sp,     8
+    st.d             s2,      sp,     16
+    st.d             s3,      sp,     24
+    st.d             s4,      sp,     32
+    st.d             s5,      sp,     40
+    st.d             s6,      sp,     48
+    st.d             s7,      sp,     56
+    st.d             s8,      sp,     64
+
+    li.w             t0,      524287
+    li.w             t8,      8
+    li.w             t7,      4
+    vreplgr2vr.w     vr20,    t0
+    vreplgr2vr.w     vr0,     a7
+    beq              a6,      t7,     .LOOP_HS19_DST4
+    beq              a6,      t8,     .LOOP_HS19_DST8
+    blt              t8,      a6,     .LOOP_HS19
+    b                .END_HS19_DST4
+
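+/* The .LOOP_HS19* paths below mirror ff_hscale_16_to_15_sub_lsx above: same
+ * loop structure, but values are clamped to 524287 ((1 << 19) - 1) and stored
+ * as 32-bit words. */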
+.LOOP_HS19:
+    li.w             t1,      0
+    li.w             s1,      0
+    li.w             s2,      0
+    li.w             s3,      0
+    li.w             s4,      0
+    li.w             s5,      0
+    vldi             vr22,    0
+    addi.w           s0,      a6,     -7
+    slli.w           s7,      a6,     1
+    slli.w           s8,      a6,     2
+    add.w            t6,      s7,     s8
+.LOOP_HS19_DST:
+    ld.w             t2,      a5,     0
+    ld.w             t3,      a5,     4
+    ld.w             t4,      a5,     8
+    ld.w             t5,      a5,     12
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    slli.w           t5,      t5,     1
+    vldx             vr1,     a3,     t2
+    vldx             vr2,     a3,     t3
+    vldx             vr3,     a3,     t4
+    vldx             vr4,     a3,     t5
+    vld              vr9,     a4,     0
+    vldx             vr10,    a4,     s7
+    vldx             vr11,    a4,     s8
+    vldx             vr12,    a4,     t6
+    vmulwev.w.hu.h   vr17,    vr1,    vr9
+    vmulwev.w.hu.h   vr18,    vr2,    vr10
+    vmulwev.w.hu.h   vr19,    vr3,    vr11
+    vmulwev.w.hu.h   vr21,    vr4,    vr12
+    vmaddwod.w.hu.h  vr17,    vr1,    vr9
+    vmaddwod.w.hu.h  vr18,    vr2,    vr10
+    vmaddwod.w.hu.h  vr19,    vr3,    vr11
+    vmaddwod.w.hu.h  vr21,    vr4,    vr12
+    vhaddw.d.w       vr1,     vr17,   vr17
+    vhaddw.d.w       vr2,     vr18,   vr18
+    vhaddw.d.w       vr3,     vr19,   vr19
+    vhaddw.d.w       vr4,     vr21,   vr21
+    vhaddw.q.d       vr1,     vr1,    vr1
+    vhaddw.q.d       vr2,     vr2,    vr2
+    vhaddw.q.d       vr3,     vr3,    vr3
+    vhaddw.q.d       vr4,     vr4,    vr4
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.d          vr1,     vr3,    vr1
+    vadd.w           vr22,    vr22,   vr1
+    addi.w           s1,      s1,     8
+    addi.d           a3,      a3,     16
+    addi.d           a4,      a4,     16
+    blt              s1,      s0,     .LOOP_HS19_DST
+    blt              s1,      a6,     .HS19_DSTA
+    b                .END_HS19_FILTERA
+.HS19_DSTA:
+    ld.w             t2,      a5,     0
+    li.w             t3,      0
+    move             s6,      s1
+.HS19_FILTERA:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s2,      s2,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .HS19_FILTERA
+
+    ld.w             t2,      a5,     4
+    li.w             t3,      0
+    move             s6,      s1
+    addi.w           t1,      t1,     1
+.HS19_FILTERB:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s3,      s3,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .HS19_FILTERB
+    ld.w             t2,      a5,     8
+    addi.w           t1,      t1,     1
+    li.w             t3,      0
+    move             s6,      s1
+.HS19_FILTERC:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s4,      s4,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .HS19_FILTERC
+    ld.w             t2,      a5,     12
+    addi.w           t1,      t1,     1
+    move             s6,      s1
+    li.w             t3,      0
+.HS19_FILTERD:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t6,      t6,     1
+    ldx.h            t6,      a4,     t6
+    mul.w            t6,      t5,     t6
+    add.w            s5,      s5,     t6
+    addi.w           t3,      t3,     1
+    addi.w           s6,      s6,     1
+    blt              s6,      a6,     .HS19_FILTERD
+.END_HS19_FILTERA:
+    vpickve2gr.w     t1,      vr22,   0
+    vpickve2gr.w     t2,      vr22,   1
+    vpickve2gr.w     t3,      vr22,   2
+    vpickve2gr.w     t4,      vr22,   3
+    add.w            s2,      s2,     t1
+    add.w            s3,      s3,     t2
+    add.w            s4,      s4,     t3
+    add.w            s5,      s5,     t4
+    sra.w            s2,      s2,     a7
+    sra.w            s3,      s3,     a7
+    sra.w            s4,      s4,     a7
+    sra.w            s5,      s5,     a7
+    slt              t1,      s2,     t0
+    slt              t2,      s3,     t0
+    slt              t3,      s4,     t0
+    slt              t4,      s5,     t0
+    maskeqz          s2,      s2,     t1
+    maskeqz          s3,      s3,     t2
+    maskeqz          s4,      s4,     t3
+    maskeqz          s5,      s5,     t4
+    masknez          t1,      t0,     t1
+    masknez          t2,      t0,     t2
+    masknez          t3,      t0,     t3
+    masknez          t4,      t0,     t4
+    or               s2,      s2,     t1
+    or               s3,      s3,     t2
+    or               s4,      s4,     t3
+    or               s5,      s5,     t4
+    st.w             s2,      a1,     0
+    st.w             s3,      a1,     4
+    st.w             s4,      a1,     8
+    st.w             s5,      a1,     12
+
+    addi.d           a1,      a1,     16
+    sub.d            a3,      a3,     s1
+    sub.d            a3,      a3,     s1
+    addi.d           a5,      a5,     16
+    slli.d           t3,      a6,     3
+    add.d            a4,      a4,     t3
+    sub.d            a4,      a4,     s1
+    sub.d            a4,      a4,     s1
+    addi.d           a2,      a2,     -4
+    bge              a2,      t7,     .LOOP_HS19
+    blt              zero,    a2,     .HS19_RESA
+    b                .HS19_END
+.HS19_RESA:
+    li.w             t1,      0
+.HS19_DST:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.HS19_FILTER:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .HS19_FILTER
+    sra.w            t8,      t8,     a7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     2
+    stx.w            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .HS19_DST
+    b                .HS19_END
+
+.LOOP_HS19_DST8:
+    ld.w             t1,      a5,     0
+    ld.w             t2,      a5,     4
+    ld.w             t3,      a5,     8
+    ld.w             t4,      a5,     12
+    slli.w           t1,      t1,     1
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    vldx             vr1,     a3,     t1
+    vldx             vr2,     a3,     t2
+    vldx             vr3,     a3,     t3
+    vldx             vr4,     a3,     t4
+    ld.w             t1,      a5,     16
+    ld.w             t2,      a5,     20
+    ld.w             t3,      a5,     24
+    ld.w             t4,      a5,     28
+    slli.w           t1,      t1,     1
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    vldx             vr5,     a3,     t1
+    vldx             vr6,     a3,     t2
+    vldx             vr7,     a3,     t3
+    vldx             vr8,     a3,     t4
+    vld              vr9,     a4,     0
+    vld              vr10,    a4,     16
+    vld              vr11,    a4,     32
+    vld              vr12,    a4,     48
+    vld              vr13,    a4,     64
+    vld              vr14,    a4,     80
+    vld              vr15,    a4,     96
+    vld              vr16,    a4,     112
+    vmulwev.w.hu.h   vr17,    vr1,    vr9
+    vmulwev.w.hu.h   vr18,    vr2,    vr10
+    vmulwev.w.hu.h   vr19,    vr3,    vr11
+    vmulwev.w.hu.h   vr21,    vr4,    vr12
+    vmaddwod.w.hu.h  vr17,    vr1,    vr9
+    vmaddwod.w.hu.h  vr18,    vr2,    vr10
+    vmaddwod.w.hu.h  vr19,    vr3,    vr11
+    vmaddwod.w.hu.h  vr21,    vr4,    vr12
+    vmulwev.w.hu.h   vr1,     vr5,    vr13
+    vmulwev.w.hu.h   vr2,     vr6,    vr14
+    vmulwev.w.hu.h   vr3,     vr7,    vr15
+    vmulwev.w.hu.h   vr4,     vr8,    vr16
+    vmaddwod.w.hu.h  vr1,     vr5,    vr13
+    vmaddwod.w.hu.h  vr2,     vr6,    vr14
+    vmaddwod.w.hu.h  vr3,     vr7,    vr15
+    vmaddwod.w.hu.h  vr4,     vr8,    vr16
+    vhaddw.d.w       vr5,     vr1,    vr1
+    vhaddw.d.w       vr6,     vr2,    vr2
+    vhaddw.d.w       vr7,     vr3,    vr3
+    vhaddw.d.w       vr8,     vr4,    vr4
+    vhaddw.d.w       vr1,     vr17,   vr17
+    vhaddw.d.w       vr2,     vr18,   vr18
+    vhaddw.d.w       vr3,     vr19,   vr19
+    vhaddw.d.w       vr4,     vr21,   vr21
+    vhaddw.q.d       vr1,     vr1,    vr1
+    vhaddw.q.d       vr2,     vr2,    vr2
+    vhaddw.q.d       vr3,     vr3,    vr3
+    vhaddw.q.d       vr4,     vr4,    vr4
+    vhaddw.q.d       vr5,     vr5,    vr5
+    vhaddw.q.d       vr6,     vr6,    vr6
+    vhaddw.q.d       vr7,     vr7,    vr7
+    vhaddw.q.d       vr8,     vr8,    vr8
+    vilvl.w          vr1,     vr2,    vr1
+    vilvl.w          vr3,     vr4,    vr3
+    vilvl.w          vr5,     vr6,    vr5
+    vilvl.w          vr7,     vr8,    vr7
+    vilvl.d          vr1,     vr3,    vr1
+    vilvl.d          vr5,     vr7,    vr5
+    vsra.w           vr1,     vr1,    vr0
+    vsra.w           vr5,     vr5,    vr0
+    vmin.w           vr1,     vr1,    vr20
+    vmin.w           vr5,     vr5,    vr20
+
+    vst              vr1,     a1,     0
+    vst              vr5,     a1,     16
+    addi.d           a1,      a1,     32
+    addi.d           a5,      a5,     32
+    addi.d           a4,      a4,     128
+    addi.d           a2,      a2,     -8
+    bge              a2,      t8,     .LOOP_HS19_DST8
+    blt              zero,    a2,     .HS19_REST8
+    b                .HS19_END
+.HS19_REST8:
+    li.w             t1,      0
+.HS19_DST8:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.HS19_FILTER8:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .HS19_FILTER8
+    sra.w            t8,      t8,     a7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     2
+    stx.w            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .HS19_DST8
+    b                .HS19_END
+
+.LOOP_HS19_DST4:
+    ld.w             t1,      a5,     0
+    ld.w             t2,      a5,     4
+    ld.w             t3,      a5,     8
+    ld.w             t4,      a5,     12
+    slli.w           t1,      t1,     1
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    fldx.d           f1,      a3,     t1
+    fldx.d           f2,      a3,     t2
+    fldx.d           f3,      a3,     t3
+    fldx.d           f4,      a3,     t4
+    ld.w             t1,      a5,     16
+    ld.w             t2,      a5,     20
+    ld.w             t3,      a5,     24
+    ld.w             t4,      a5,     28
+    slli.w           t1,      t1,     1
+    slli.w           t2,      t2,     1
+    slli.w           t3,      t3,     1
+    slli.w           t4,      t4,     1
+    fldx.d           f5,      a3,     t1
+    fldx.d           f6,      a3,     t2
+    fldx.d           f7,      a3,     t3
+    fldx.d           f8,      a3,     t4
+    vld              vr9,     a4,     0
+    vld              vr10,    a4,     16
+    vld              vr11,    a4,     32
+    vld              vr12,    a4,     48
+    vilvl.d          vr1,     vr2,    vr1
+    vilvl.d          vr3,     vr4,    vr3
+    vilvl.d          vr5,     vr6,    vr5
+    vilvl.d          vr7,     vr8,    vr7
+    vmulwev.w.hu.h   vr13,    vr1,    vr9
+    vmulwev.w.hu.h   vr14,    vr3,    vr10
+    vmulwev.w.hu.h   vr15,    vr5,    vr11
+    vmulwev.w.hu.h   vr16,    vr7,    vr12
+    vmaddwod.w.hu.h  vr13,    vr1,    vr9
+    vmaddwod.w.hu.h  vr14,    vr3,    vr10
+    vmaddwod.w.hu.h  vr15,    vr5,    vr11
+    vmaddwod.w.hu.h  vr16,    vr7,    vr12
+    vhaddw.d.w       vr13,    vr13,   vr13
+    vhaddw.d.w       vr14,    vr14,   vr14
+    vhaddw.d.w       vr15,    vr15,   vr15
+    vhaddw.d.w       vr16,    vr16,   vr16
+    vpickev.w        vr13,    vr14,   vr13
+    vpickev.w        vr15,    vr16,   vr15
+    vsra.w           vr13,    vr13,   vr0
+    vsra.w           vr15,    vr15,   vr0
+    vmin.w           vr13,    vr13,   vr20
+    vmin.w           vr15,    vr15,   vr20
+
+    vst              vr13,    a1,     0
+    vst              vr15,    a1,     16
+    addi.d           a1,      a1,     32
+    addi.d           a5,      a5,     32
+    addi.d           a4,      a4,     64
+    addi.d           a2,      a2,     -8
+    bge              a2,      t8,     .LOOP_HS19_DST4
+    blt              zero,    a2,     .HS19_REST4
+    b                .HS19_END
+.HS19_REST4:
+    li.w             t1,      0
+.HS19_DST4:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.HS19_FILTER4:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .HS19_FILTER4
+    sra.w            t8,      t8,     a7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     2
+    stx.w            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .HS19_DST4
+    b                .HS19_END
+.END_HS19_DST4:
+
+    li.w             t1,      0
+.LOOP_HS19_DST1:
+    slli.w           t2,      t1,     2
+    ldx.w            t2,      a5,     t2
+    li.w             t3,      0
+    li.w             t8,      0
+.HS19_FILTER1:
+    add.w            t4,      t2,     t3
+    slli.w           t4,      t4,     1
+    ldx.hu           t5,      a3,     t4
+    mul.w            t6,      a6,     t1
+    add.w            t6,      t6,     t3
+    slli.w           t7,      t6,     1
+    ldx.h            t7,      a4,     t7
+    mul.w            t7,      t5,     t7
+    add.w            t8,      t8,     t7
+    addi.w           t3,      t3,     1
+    blt              t3,      a6,     .HS19_FILTER1
+    sra.w            t8,      t8,     a7
+    slt              t5,      t8,     t0
+    maskeqz          t8,      t8,     t5
+    masknez          t5,      t0,     t5
+    or               t8,      t8,     t5
+    slli.w           t4,      t1,     2
+    stx.w            t8,      a1,     t4
+    addi.w           t1,      t1,     1
+    blt              t1,      a2,     .LOOP_HS19_DST1
+    b                .HS19_END
+.HS19_END:
+
+    ld.d             s0,      sp,     0
+    ld.d             s1,      sp,     8
+    ld.d             s2,      sp,     16
+    ld.d             s3,      sp,     24
+    ld.d             s4,      sp,     32
+    ld.d             s5,      sp,     40
+    ld.d             s6,      sp,     48
+    ld.d             s7,      sp,     56
+    ld.d             s8,      sp,     64
+    addi.d           sp,      sp,     72
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 97fe947e2e..c13a1662ec 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -27,8 +27,33 @@
 av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
+    if (have_lsx(cpu_flags)) {
+        ff_sws_init_output_lsx(c);
+        if (c->srcBpc == 8) {
+            if (c->dstBpc <= 14) {
+                c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
+            } else {
+                c->hyScale = c->hcScale = ff_hscale_8_to_19_lsx;
+            }
+        } else {
+            c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
+                                                     : ff_hscale_16_to_15_lsx;
+        }
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_GBRAP:
+        case AV_PIX_FMT_GBRP:
+            {
+                c->readChrPlanar = planar_rgb_to_uv_lsx;
+                c->readLumPlanar = planar_rgb_to_y_lsx;
+            }
+            break;
+        }
+        if (c->dstBpc == 8)
+            c->yuv2planeX = ff_yuv2planeX_8_lsx;
+    }
+#if HAVE_LASX
     if (have_lasx(cpu_flags)) {
-        ff_sws_init_output_loongarch(c);
+        ff_sws_init_output_lasx(c);
         if (c->srcBpc == 8) {
             if (c->dstBpc <= 14) {
                 c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -51,17 +76,21 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
         if (c->dstBpc == 8)
             c->yuv2planeX = ff_yuv2planeX_8_lasx;
     }
+#endif // #if HAVE_LASX
 }
 
 av_cold void rgb2rgb_init_loongarch(void)
 {
+#if HAVE_LASX
     int cpu_flags = av_get_cpu_flags();
     if (have_lasx(cpu_flags))
         interleaveBytes = ff_interleave_bytes_lasx;
+#endif // #if HAVE_LASX
 }
 
 av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
 {
+#if HAVE_LASX
     int cpu_flags = av_get_cpu_flags();
     if (have_lasx(cpu_flags)) {
         switch (c->dstFormat) {
@@ -91,5 +120,6 @@ av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
                     return yuv420_abgr32_lasx;
         }
     }
+#endif // #if HAVE_LASX
     return NULL;
 }
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index c52eb1016b..bc29913ac6 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -24,7 +24,45 @@
 
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
+#include "config.h"
 
+void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
+                           const uint8_t *src, const int16_t *filter,
+                           const int32_t *filterPos, int filterSize);
+
+void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                           const uint8_t *src, const int16_t *filter,
+                           const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                                const uint8_t *_src, const int16_t *filter,
+                                const int32_t *filterPos, int filterSize, int sh);
+
+void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                                const uint8_t *_src, const int16_t *filter,
+                                const int32_t *filterPos, int filterSize, int sh);
+
+void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+                          int width, int32_t *rgb2yuv, void *opq);
+
+void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
+                         int32_t *rgb2yuv, void *opq);
+
+void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+                         const int16_t **src, uint8_t *dest, int dstW,
+                         const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c);
+
+#if HAVE_LASX
 void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
                             const uint8_t *src, const int16_t *filter,
                             const int32_t *filterPos, int filterSize);
@@ -69,10 +107,11 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
                               uint8_t *dest, int width, int height,
                               int src1Stride, int src2Stride, int dstStride);
 
-av_cold void ff_sws_init_output_loongarch(SwsContext *c);
-
 void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset);
 
+av_cold void ff_sws_init_output_lasx(SwsContext *c);
+#endif // #if HAVE_LASX
+
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/loongarch/swscale_lsx.c b/libswscale/loongarch/swscale_lsx.c
new file mode 100644
index 0000000000..da8eabfca3
--- /dev/null
+++ b/libswscale/loongarch/swscale_lsx.c
@@ -0,0 +1,57 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+
+void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int sh              = desc->comp[0].depth - 1;
+
+    if (sh < 15) {
+        sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
+                      (desc->comp[0].depth - 1);
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
+        sh = 15;
+    }
+    ff_hscale_16_to_15_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
+}
+
+void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *_src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int bits            = desc->comp[0].depth - 1;
+    int sh              = bits - 4;
+
+    if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) {
+
+        sh = 9;
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
+        sh = 16 - 1 - 4;
+    }
+    ff_hscale_16_to_19_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
+}
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 925c536bf1..b02e6cdc64 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -653,7 +653,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
             filterAlign = 1;
     }
 
-    if (have_lasx(cpu_flags)) {
+    if (have_lasx(cpu_flags) || have_lsx(cpu_flags)) {
         int reNum = minFilterSize & (0x07);
 
         if (minFilterSize < 5)
@@ -1806,6 +1806,7 @@ static av_cold int sws_init_single_context(SwsContext *c, SwsFilter *srcFilter,
             const int filterAlign = X86_MMX(cpu_flags)     ? 4 :
                                     PPC_ALTIVEC(cpu_flags) ? 8 :
                                     have_neon(cpu_flags)   ? 4 :
+                                    have_lsx(cpu_flags)    ? 8 :
                                     have_lasx(cpu_flags)   ? 8 : 1;
 
             if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread
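
For orientation, the 'sh' selected by the wrappers above is consumed by ff_hscale_16_to_15_sub_lsx, which vectorizes roughly the following scalar loop (a sketch modelled on swscale's C fallback hScale16To15_c; the function name here is illustrative and not part of the patch):

    #include <stdint.h>

    /* Scalar reference for one horizontally scaled output sample: a
     * filterSize-tap FIR over 16-bit input, normalized by the shift 'sh'
     * chosen from the source bit depth, then clipped to 15-bit range. */
    static void hscale_16_to_15_ref(int16_t *dst, int dstW, const uint16_t *src,
                                    const int16_t *filter, const int32_t *filterPos,
                                    int filterSize, int sh)
    {
        for (int i = 0; i < dstW; i++) {
            int val = 0;
            for (int j = 0; j < filterSize; j++)
                val += src[filterPos[i] + j] * filter[filterSize * i + j];
            val >>= sh;
            dst[i] = val > 32767 ? 32767 : val;   /* FFMIN(val, (1 << 15) - 1) */
        }
    }

The 16-to-19 path is the same loop with a smaller shift and a 19-bit clip.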

* [FFmpeg-devel] [PATCH v1 6/6] swscale/la: Add following builtin optimized functions
  2023-05-04  8:49 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
                   ` (4 preceding siblings ...)
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 5/6] swscale/la: Optimize the functions of the swscale series with lsx Hao Chen
@ 2023-05-04  8:49 ` Hao Chen
  5 siblings, 0 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-04  8:49 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Jin Bo

From: Jin Bo <jinbo@loongson.cn>

yuv420_rgb24_lsx
yuv420_bgr24_lsx
yuv420_rgba32_lsx
yuv420_argb32_lsx
yuv420_bgra32_lsx
yuv420_abgr32_lsx
./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo
-pix_fmt rgb24 -y /dev/null -an
before: 184fps
after:  207fps
---
 libswscale/loongarch/Makefile                 |   3 +-
 libswscale/loongarch/swscale_init_loongarch.c |  30 +-
 libswscale/loongarch/swscale_loongarch.h      |  18 +
 libswscale/loongarch/yuv2rgb_lsx.c            | 361 ++++++++++++++++++
 4 files changed, 410 insertions(+), 2 deletions(-)
 create mode 100644 libswscale/loongarch/yuv2rgb_lsx.c

diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index c0b6a449c0..c35ba309a4 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -8,4 +8,5 @@ LSX-OBJS-$(CONFIG_SWSCALE)  += loongarch/swscale.o \
                                loongarch/swscale_lsx.o \
                                loongarch/input.o   \
                                loongarch/output.o  \
-                               loongarch/output_lsx.o
+                               loongarch/output_lsx.o  \
+                               loongarch/yuv2rgb_lsx.o
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index c13a1662ec..53e4f970b6 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -90,8 +90,8 @@ av_cold void rgb2rgb_init_loongarch(void)
 
 av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
 {
-#if HAVE_LASX
     int cpu_flags = av_get_cpu_flags();
+#if HAVE_LASX
     if (have_lasx(cpu_flags)) {
         switch (c->dstFormat) {
             case AV_PIX_FMT_RGB24:
@@ -121,5 +121,33 @@ av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
         }
     }
 #endif // #if HAVE_LASX
+    if (have_lsx(cpu_flags)) {
+        switch (c->dstFormat) {
+            case AV_PIX_FMT_RGB24:
+                return yuv420_rgb24_lsx;
+            case AV_PIX_FMT_BGR24:
+                return yuv420_bgr24_lsx;
+            case AV_PIX_FMT_RGBA:
+                if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+                    break;
+                } else
+                    return yuv420_rgba32_lsx;
+            case AV_PIX_FMT_ARGB:
+                if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+                    break;
+                } else
+                    return yuv420_argb32_lsx;
+            case AV_PIX_FMT_BGRA:
+                if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+                    break;
+                } else
+                    return yuv420_bgra32_lsx;
+            case AV_PIX_FMT_ABGR:
+                if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+                    break;
+                } else
+                    return yuv420_abgr32_lsx;
+        }
+    }
     return NULL;
 }
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index bc29913ac6..0514abae21 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -62,6 +62,24 @@ void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
 
 av_cold void ff_sws_init_output_lsx(SwsContext *c);
 
+int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                     int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgr24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                     int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_rgba32_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgra32_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_argb32_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_abgr32_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
 #if HAVE_LASX
 void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
                             const uint8_t *src, const int16_t *filter,
diff --git a/libswscale/loongarch/yuv2rgb_lsx.c b/libswscale/loongarch/yuv2rgb_lsx.c
new file mode 100644
index 0000000000..11cd2f79d9
--- /dev/null
+++ b/libswscale/loongarch/yuv2rgb_lsx.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) 2023 Loongson Technology Co. Ltd.
+ * Contributed by Bo Jin(jinbo@loongson.cn)
+ * All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+#define YUV2RGB_LOAD_COE                               \
+    /* Load x_offset */                                \
+    __m128i y_offset = __lsx_vreplgr2vr_d(c->yOffset); \
+    __m128i u_offset = __lsx_vreplgr2vr_d(c->uOffset); \
+    __m128i v_offset = __lsx_vreplgr2vr_d(c->vOffset); \
+    /* Load x_coeff  */                                \
+    __m128i ug_coeff = __lsx_vreplgr2vr_d(c->ugCoeff); \
+    __m128i vg_coeff = __lsx_vreplgr2vr_d(c->vgCoeff); \
+    __m128i y_coeff  = __lsx_vreplgr2vr_d(c->yCoeff);  \
+    __m128i ub_coeff = __lsx_vreplgr2vr_d(c->ubCoeff); \
+    __m128i vr_coeff = __lsx_vreplgr2vr_d(c->vrCoeff); \
+
+#define LOAD_YUV_16                                                   \
+    m_y1 = __lsx_vld(py_1, 0);                                        \
+    m_y2 = __lsx_vld(py_2, 0);                                        \
+    m_u  = __lsx_vldrepl_d(pu, 0);                                    \
+    m_v  = __lsx_vldrepl_d(pv, 0);                                    \
+    DUP2_ARG2(__lsx_vilvl_b, m_u, m_u, m_v, m_v, m_u, m_v);           \
+    DUP2_ARG2(__lsx_vilvh_b, zero, m_u, zero, m_v, m_u_h, m_v_h);     \
+    DUP2_ARG2(__lsx_vilvl_b, zero, m_u, zero, m_v, m_u, m_v);         \
+    DUP2_ARG2(__lsx_vilvh_b, zero, m_y1, zero, m_y2, m_y1_h, m_y2_h); \
+    DUP2_ARG2(__lsx_vilvl_b, zero, m_y1, zero, m_y2, m_y1, m_y2);     \
+
+/* YUV2RGB method
+ * The conversion method is as follows:
+ * R = Y' * y_coeff + V' * vr_coeff
+ * G = Y' * y_coeff + V' * vg_coeff + U' * ug_coeff
+ * B = Y' * y_coeff + U' * ub_coeff
+ *
+ * where X' = X * 8 - x_offset
+ *
+ */
+
+#define YUV2RGB(y1, y2, u, v, r1, g1, b1, r2, g2, b2)               \
+{                                                                   \
+    y1  = __lsx_vslli_h(y1, 3);                                     \
+    y2  = __lsx_vslli_h(y2, 3);                                     \
+    u   = __lsx_vslli_h(u, 3);                                      \
+    v   = __lsx_vslli_h(v, 3);                                      \
+    y1  = __lsx_vsub_h(y1, y_offset);                               \
+    y2  = __lsx_vsub_h(y2, y_offset);                               \
+    u   = __lsx_vsub_h(u, u_offset);                                \
+    v   = __lsx_vsub_h(v, v_offset);                                \
+    y_1 = __lsx_vmuh_h(y1, y_coeff);                                \
+    y_2 = __lsx_vmuh_h(y2, y_coeff);                                \
+    u2g = __lsx_vmuh_h(u, ug_coeff);                                \
+    u2b = __lsx_vmuh_h(u, ub_coeff);                                \
+    v2r = __lsx_vmuh_h(v, vr_coeff);                                \
+    v2g = __lsx_vmuh_h(v, vg_coeff);                                \
+    r1  = __lsx_vsadd_h(y_1, v2r);                                  \
+    v2g = __lsx_vsadd_h(v2g, u2g);                                  \
+    g1  = __lsx_vsadd_h(y_1, v2g);                                  \
+    b1  = __lsx_vsadd_h(y_1, u2b);                                  \
+    r2  = __lsx_vsadd_h(y_2, v2r);                                  \
+    g2  = __lsx_vsadd_h(y_2, v2g);                                  \
+    b2  = __lsx_vsadd_h(y_2, u2b);                                  \
+    DUP4_ARG1(__lsx_vclip255_h, r1, g1, b1, r2, r1, g1, b1, r2);    \
+    DUP2_ARG1(__lsx_vclip255_h, g2, b2, g2, b2);                    \
+}
+
+#define RGB_PACK(r, g, b, rgb_l, rgb_h)                                 \
+{                                                                       \
+    __m128i rg;                                                         \
+    rg = __lsx_vpackev_b(g, r);                                         \
+    DUP2_ARG3(__lsx_vshuf_b, b, rg, shuf2, b, rg, shuf3, rgb_l, rgb_h); \
+}
+
+#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h)                         \
+{                                                                    \
+    __m128i ra, bg;                                                  \
+    ra    = __lsx_vpackev_b(r, a);                                   \
+    bg    = __lsx_vpackev_b(b, g);                                   \
+    rgb_l = __lsx_vilvl_h(bg, ra);                                   \
+    rgb_h = __lsx_vilvh_h(bg, ra);                                   \
+}
+
+#define RGB_STORE(rgb_l, rgb_h, image)                               \
+{                                                                    \
+    __lsx_vstelm_d(rgb_l, image, 0,  0);                             \
+    __lsx_vstelm_d(rgb_l, image, 8,  1);                             \
+    __lsx_vstelm_d(rgb_h, image, 16, 0);                             \
+}
+
+#define RGB32_STORE(rgb_l, rgb_h, image)                             \
+{                                                                    \
+    __lsx_vst(rgb_l, image, 0);                                      \
+    __lsx_vst(rgb_h, image, 16);                                     \
+}
+
+#define YUV2RGBFUNC(func_name, dst_type, alpha)                                     \
+           int func_name(SwsContext *c, const uint8_t *src[],                       \
+                         int srcStride[], int srcSliceY, int srcSliceH,             \
+                         uint8_t *dst[], int dstStride[])                           \
+{                                                                                   \
+    int x, y, h_size, vshift, res;                                                  \
+    __m128i m_y1, m_y2, m_u, m_v;                                                   \
+    __m128i m_y1_h, m_y2_h, m_u_h, m_v_h;                                           \
+    __m128i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                           \
+    __m128i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                                 \
+    __m128i shuf2 = {0x0504120302100100, 0x0A18090816070614};                       \
+    __m128i shuf3 = {0x1E0F0E1C0D0C1A0B, 0x0101010101010101};                       \
+    __m128i zero = __lsx_vldi(0);                                                   \
+                                                                                    \
+    YUV2RGB_LOAD_COE                                                                \
+                                                                                    \
+    h_size = c->dstW >> 4;                                                          \
+    res = (c->dstW & 15) >> 1;                                                      \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                                    \
+    for (y = 0; y < srcSliceH; y += 2) {                                            \
+        dst_type av_unused *r, *g, *b;                                              \
+        dst_type *image1    = (dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]);\
+        dst_type *image2    = (dst_type *)(image1 +                   dstStride[0]);\
+        const uint8_t *py_1 = src[0] +               y * srcStride[0];              \
+        const uint8_t *py_2 = py_1   +                   srcStride[0];              \
+        const uint8_t *pu   = src[1] +   (y >> vshift) * srcStride[1];              \
+        const uint8_t *pv   = src[2] +   (y >> vshift) * srcStride[2];              \
+        for(x = 0; x < h_size; x++) {                                               \
+
+#define YUV2RGBFUNC32(func_name, dst_type, alpha)                                   \
+           int func_name(SwsContext *c, const uint8_t *src[],                       \
+                         int srcStride[], int srcSliceY, int srcSliceH,             \
+                         uint8_t *dst[], int dstStride[])                           \
+{                                                                                   \
+    int x, y, h_size, vshift, res;                                                  \
+    __m128i m_y1, m_y2, m_u, m_v;                                                   \
+    __m128i m_y1_h, m_y2_h, m_u_h, m_v_h;                                           \
+    __m128i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                           \
+    __m128i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                                 \
+    __m128i a = __lsx_vldi(0xFF);                                                   \
+    __m128i zero = __lsx_vldi(0);                                                   \
+                                                                                    \
+    YUV2RGB_LOAD_COE                                                                \
+                                                                                    \
+    h_size = c->dstW >> 4;                                                          \
+    res = (c->dstW & 15) >> 1;                                                      \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                                    \
+    for (y = 0; y < srcSliceH; y += 2) {                                            \
+        int yd = y + srcSliceY;                                                     \
+        dst_type av_unused *r, *g, *b;                                              \
+        dst_type *image1    = (dst_type *)(dst[0] + (yd)     * dstStride[0]);       \
+        dst_type *image2    = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]);       \
+        const uint8_t *py_1 = src[0] +               y * srcStride[0];              \
+        const uint8_t *py_2 = py_1   +                   srcStride[0];              \
+        const uint8_t *pu   = src[1] +   (y >> vshift) * srcStride[1];              \
+        const uint8_t *pv   = src[2] +   (y >> vshift) * srcStride[2];              \
+        for(x = 0; x < h_size; x++) {                                               \
+
+#define DEALYUV2RGBREMAIN                                                           \
+            py_1 += 16;                                                             \
+            py_2 += 16;                                                             \
+            pu += 8;                                                                \
+            pv += 8;                                                                \
+            image1 += 48;                                                           \
+            image2 += 48;                                                           \
+        }                                                                           \
+        for (x = 0; x < res; x++) {                                                 \
+            int av_unused U, V, Y;                                                  \
+            U = pu[0];                                                              \
+            V = pv[0];                                                              \
+            r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                       \
+            g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM]                       \
+                       + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);                     \
+            b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
+
+#define DEALYUV2RGBREMAIN32                                                         \
+            py_1 += 16;                                                             \
+            py_2 += 16;                                                             \
+            pu += 8;                                                                \
+            pv += 8;                                                                \
+            image1 += 16;                                                           \
+            image2 += 16;                                                           \
+        }                                                                           \
+        for (x = 0; x < res; x++) {                                                 \
+            int av_unused U, V, Y;                                                  \
+            U = pu[0];                                                              \
+            V = pv[0];                                                              \
+            r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                       \
+            g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM]                       \
+                       + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);                     \
+            b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];                       \
+
+#define PUTRGB24(dst, src)                  \
+    Y      = src[0];                        \
+    dst[0] = r[Y];                          \
+    dst[1] = g[Y];                          \
+    dst[2] = b[Y];                          \
+    Y      = src[1];                        \
+    dst[3] = r[Y];                          \
+    dst[4] = g[Y];                          \
+    dst[5] = b[Y];
+
+#define PUTBGR24(dst, src)                  \
+    Y      = src[0];                        \
+    dst[0] = b[Y];                          \
+    dst[1] = g[Y];                          \
+    dst[2] = r[Y];                          \
+    Y      = src[1];                        \
+    dst[3] = b[Y];                          \
+    dst[4] = g[Y];                          \
+    dst[5] = r[Y];
+
+#define PUTRGB(dst, src)                    \
+    Y      = src[0];                        \
+    dst[0] = r[Y] + g[Y] + b[Y];            \
+    Y      = src[1];                        \
+    dst[1] = r[Y] + g[Y] + b[Y];            \
+
+#define ENDRES                              \
+    pu += 1;                                \
+    pv += 1;                                \
+    py_1 += 2;                              \
+    py_2 += 2;                              \
+    image1 += 6;                            \
+    image2 += 6;                            \
+
+#define ENDRES32                            \
+    pu += 1;                                \
+    pv += 1;                                \
+    py_1 += 2;                              \
+    py_2 += 2;                              \
+    image1 += 2;                            \
+    image2 += 2;                            \
+
+#define END_FUNC()                          \
+        }                                   \
+    }                                       \
+    return srcSliceH;                       \
+}
+
+YUV2RGBFUNC(yuv420_rgb24_lsx, uint8_t, 0)
+    LOAD_YUV_16
+    YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+    RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+    RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
+    RGB_STORE(rgb1_l, rgb1_h, image1);
+    RGB_STORE(rgb2_l, rgb2_h, image2);
+    YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+    RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+    RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
+    RGB_STORE(rgb1_l, rgb1_h, image1 + 24);
+    RGB_STORE(rgb2_l, rgb2_h, image2 + 24);
+    DEALYUV2RGBREMAIN
+    PUTRGB24(image1, py_1);
+    PUTRGB24(image2, py_2);
+    ENDRES
+    END_FUNC()
+
+YUV2RGBFUNC(yuv420_bgr24_lsx, uint8_t, 0)
+    LOAD_YUV_16
+    YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+    RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+    RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
+    RGB_STORE(rgb1_l, rgb1_h, image1);
+    RGB_STORE(rgb2_l, rgb2_h, image2);
+    YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+    RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+    RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
+    RGB_STORE(rgb1_l, rgb1_h, image1 + 24);
+    RGB_STORE(rgb2_l, rgb2_h, image2 + 24);
+    DEALYUV2RGBREMAIN
+    PUTBGR24(image1, py_1);
+    PUTBGR24(image2, py_2);
+    ENDRES
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_rgba32_lsx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+    RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+    RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+    RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+    RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1 + 8);
+    RGB32_STORE(rgb2_l, rgb2_h, image2 + 8);
+    DEALYUV2RGBREMAIN32
+    PUTRGB(image1, py_1);
+    PUTRGB(image2, py_2);
+    ENDRES32
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_bgra32_lsx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+    RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+    RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+    RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+    RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1 + 8);
+    RGB32_STORE(rgb2_l, rgb2_h, image2 + 8);
+    DEALYUV2RGBREMAIN32
+    PUTRGB(image1, py_1);
+    PUTRGB(image2, py_2);
+    ENDRES32
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_argb32_lsx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+    RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+    RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+    RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+    RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1 + 8);
+    RGB32_STORE(rgb2_l, rgb2_h, image2 + 8);
+    DEALYUV2RGBREMAIN32
+    PUTRGB(image1, py_1);
+    PUTRGB(image2, py_2);
+    ENDRES32
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_abgr32_lsx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+    RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+    RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+    RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+    RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1 + 8);
+    RGB32_STORE(rgb2_l, rgb2_h, image2 + 8);
+    DEALYUV2RGBREMAIN32
+    PUTRGB(image1, py_1);
+    PUTRGB(image2, py_2);
+    ENDRES32
+    END_FUNC()
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread
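
The YUV2RGB macro above is the vector form of the arithmetic described in the patch's comment block. A minimal scalar sketch of the same math, with the coefficients and offsets passed as plain 16-bit values (stand-ins for the SwsContext fields the patch replicates across LSX lanes):

    #include <stdint.h>

    static uint8_t clip255(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* One Y/U/V triple converted with the scheme described above:
     * X' = X * 8 - x_offset, then a high-half 16x16 multiply per
     * coefficient (the scalar analogue of __lsx_vmuh_h), adds and a clip. */
    static void yuv2rgb_pixel_sketch(int y, int u, int v,
                                     int16_t y_off, int16_t u_off, int16_t v_off,
                                     int16_t y_coeff, int16_t ub_coeff, int16_t ug_coeff,
                                     int16_t vg_coeff, int16_t vr_coeff,
                                     uint8_t *r, uint8_t *g, uint8_t *b)
    {
        int y_ = (y << 3) - y_off;              /* X' = X * 8 - x_offset */
        int u_ = (u << 3) - u_off;
        int v_ = (v << 3) - v_off;

        int yl  = (y_ * y_coeff)  >> 16;        /* high half, as __lsx_vmuh_h */
        int u2g = (u_ * ug_coeff) >> 16;
        int u2b = (u_ * ub_coeff) >> 16;
        int v2r = (v_ * vr_coeff) >> 16;
        int v2g = (v_ * vg_coeff) >> 16;

        *r = clip255(yl + v2r);
        *g = clip255(yl + v2g + u2g);
        *b = clip255(yl + u2b);
    }

The RGB_PACK/RGB32_PACK macros then only interleave the clipped components into the requested byte order, and the remainder loops at the end of each function handle the last few pixels with the existing table-driven scalar path.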

* Re: [FFmpeg-devel] [PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct.
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct Hao Chen
@ 2023-05-09  2:47   ` yinshiyou-hf
  0 siblings, 0 replies; 18+ messages in thread
From: yinshiyou-hf @ 2023-05-09  2:47 UTC (permalink / raw)
  To: FFmpeg development discussions and patches




> -----Original Message-----
> From: "Hao Chen" <chenhao@loongson.cn>
> Sent: 2023-05-04 16:49:47 (Thursday)
> To: ffmpeg-devel@ffmpeg.org
> Cc: "Shiyou Yin" <yinshiyou-hf@loongson.cn>
> Subject: [FFmpeg-devel] [PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct.
> 

> diff --git a/libavcodec/loongarch/h264dsp_lasx.h b/libavcodec/loongarch/h264dsp_loongarch.h
> similarity index 68%
> rename from libavcodec/loongarch/h264dsp_lasx.h
> rename to libavcodec/loongarch/h264dsp_loongarch.h
> index 4cf813750b..28dca2b537 100644
> --- a/libavcodec/loongarch/h264dsp_lasx.h
> +++ b/libavcodec/loongarch/h264dsp_loongarch.h
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2021 Loongson Technology Corporation Limited
> + * Copyright (c) 2023 Loongson Technology Corporation Limited
>   * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
>   *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
>   *
> @@ -20,11 +20,34 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>   */
>  
> -#ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
> -#define AVCODEC_LOONGARCH_H264DSP_LASX_H
> +#ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
> +#define AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
>  
>  #include "libavcodec/h264dec.h"
> +#include "config.h"
>  
> +void ff_h264_idct_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
> +void ff_h264_idct8_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
> +void ff_h264_idct_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
> +void ff_h264_idct8_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
> +void ff_h264_luma_dc_dequant_idct_8_lsx(int16_t *_output, int16_t *_input, int qmul);
> +void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
> +                              int16_t *block, int32_t dst_stride,
> +                              const uint8_t nzc[15 * 8]);
> +void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
> +                              int16_t *block, int32_t dst_stride,
> +                              const uint8_t nzc[15 * 8]);
> +void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
> +                             int16_t *block, int32_t dst_stride,
> +                             const uint8_t nzc[15 * 8]);
> +void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
> +                                 int16_t *block, int32_t dst_stride,
> +                                 const uint8_t nzc[15 * 8]);
> +void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
> +                                    int16_t *block, int32_t dst_stride,
> +                                    const uint8_t nzc[15 * 8]);
> +
> +#if HAVE_LASX
>  void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
>                                 int alpha, int beta, int8_t *tc0);
>  void ff_h264_v_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
> @@ -65,33 +88,16 @@ void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
>  void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
>  
>  void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
> -void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
> -void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
> -void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
> -                                    int32_t dst_stride);
> -void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
> +void ff_h264_idct8_add_8_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
> +void ff_h264_idct8_dc_add_8_lasx(uint8_t *dst, int16_t *src,
>                                    int32_t dst_stride);
> -void ff_h264_idct_add16_lasx(uint8_t *dst, const int32_t *blk_offset,
> -                             int16_t *block, int32_t dst_stride,
> -                             const uint8_t nzc[15 * 8]);
> -void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
> -                             int16_t *block, int32_t dst_stride,
> -                             const uint8_t nzc[15 * 8]);
> -void ff_h264_idct_add8_lasx(uint8_t **dst, const int32_t *blk_offset,
> -                            int16_t *block, int32_t dst_stride,
> -                            const uint8_t nzc[15 * 8]);
> -void ff_h264_idct_add8_422_lasx(uint8_t **dst, const int32_t *blk_offset,
> -                                int16_t *block, int32_t dst_stride,
> -                                const uint8_t nzc[15 * 8]);
> -void ff_h264_idct_add16_intra_lasx(uint8_t *dst, const int32_t *blk_offset,
> -                                   int16_t *block, int32_t dst_stride,
> -                                   const uint8_t nzc[15 * 8]);
> -void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
> -                                   int32_t de_qval);
> -
> +void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
> +                               int16_t *block, int32_t dst_stride,
> +                               const uint8_t nzc[15 * 8]);

This function is declared but never initialized (it is not assigned in the dsp init code).


This email and its attachments contain confidential information from Loongson Technology , which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this email in error, please notify the sender by phone or email immediately and delete it. 
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread
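
In other words, declaring or renaming ff_h264_idct8_add4_8_lasx is not enough; it also has to be assigned to its H264DSPContext slot in the LoongArch dsp init, roughly as sketched below. The cpu-flag and bit-depth guards follow the existing init pattern and are an assumption here, as is building inside the FFmpeg tree; this is a sketch, not the posted patch.

    #include "libavutil/attributes.h"
    #include "libavutil/cpu.h"
    #include "libavutil/loongarch/cpu.h"
    #include "libavcodec/h264dsp.h"
    #include "h264dsp_loongarch.h"

    /* Sketch of wiring the renamed LASX function into the dsp context so it
     * is actually used; in the real tree this assignment belongs in
     * ff_h264dsp_init_loongarch(). */
    av_cold void sketch_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
                                               const int chroma_format_idc)
    {
        int cpu_flags = av_get_cpu_flags();

        if (have_lasx(cpu_flags) && bit_depth == 8)
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_lasx;
    }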

* Re: [FFmpeg-devel] [PATCH v1 3/6] avcodec/la: Add LSX optimization for h264 chroma and intrapred.
  2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 3/6] avcodec/la: Add LSX optimization for h264 chroma and intrapred Hao Chen
@ 2023-05-11  7:19   ` Shiyou Yin
  0 siblings, 0 replies; 18+ messages in thread
From: Shiyou Yin @ 2023-05-11  7:19 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Lu Wang


> On 4 May 2023, at 16:49, Hao Chen <chenhao@loongson.cn> wrote:

> diff --git a/libavcodec/loongarch/h264chroma_loongarch.h b/libavcodec/loongarch/h264chroma_loongarch.h
> new file mode 100644
> index 0000000000..26a7155389
> --- /dev/null
> +++ b/libavcodec/loongarch/h264chroma_loongarch.h
> @@ -0,0 +1,43 @@
> +/*
> + * Copyright (c) 2023 Loongson Technology Corporation Limited
> + * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
> +#define AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
> +
> +#include <stdint.h>
> +#include <stddef.h>
> +#include "libavcodec/h264.h"
> +
stdint.h and stddef.h are not necessary.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
  2023-05-22  4:40 ` Shiyou Yin
@ 2023-05-25 19:07   ` Michael Niedermayer
  0 siblings, 0 replies; 18+ messages in thread
From: Michael Niedermayer @ 2023-05-25 19:07 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 1220 bytes --]

On Mon, May 22, 2023 at 12:40:08PM +0800, Shiyou Yin wrote:
> 
> 
> > On 20 May 2023, at 15:27, Hao Chen <chenhao@loongson.cn> wrote:
> > 
> > Retrigger the fate test.
> > v1: Add LSX optimization in avcodec and swscale, due to the 2K series CPUs only support lsx.
> > v2: Modified the implementation of some functions and added support for the checkasm --bench feature.
> > v3: Fix whitespace errors in patch.
> > 
> > [PATCH v3 1/7] avcodec/la: add LSX optimization for h264 idct.
> > [PATCH v3 2/7] avcodec/la: Add LSX optimization for loop filter.
> > [PATCH v3 3/7] avcodec/la: Add LSX optimization for h264 chroma and
> > [PATCH v3 4/7] avcodec/la: Add LSX optimization for h264 qpel.
> > [PATCH v3 5/7] swscale/la: Optimize the functions of the swscale
> > [PATCH v3 6/7] swscale/la: Add following builtin optimized functions
> > [PATCH v3 7/7] avutil/la: Add function performance testing
> > 
> > 
> LGTM.

I will apply the latest version of this patchset

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

When you are offended at any man's fault, turn to yourself and study your
own failings. Then you will forget your anger. -- Epictetus

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
@ 2023-05-25  7:24 Hao Chen
  0 siblings, 0 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-25  7:24 UTC (permalink / raw)
  To: ffmpeg-devel

v1: Add LSX optimization in avcodec and swscale, because the 2K series CPUs only support LSX.
v2: Modified the implementation of some functions and added support for the checkasm --bench feature.
v3: Fix whitespace errors in patch.
v4: Remove the memory clobber in libavutil/loongarch/timer.h
v5: Fix whitespace errors again.
[PATCH v5 1/7] avcodec/la: add LSX optimization for h264 idct.
[PATCH v5 2/7] avcodec/la: Add LSX optimization for loop filter.
[PATCH v5 3/7] avcodec/la: Add LSX optimization for h264 chroma and
[PATCH v5 4/7] avcodec/la: Add LSX optimization for h264 qpel.
[PATCH v5 5/7] swscale/la: Optimize the functions of the swscale
[PATCH v5 6/7] swscale/la: Add following builtin optimized functions
[PATCH v5 7/7] avutil/la: Add function performance testing

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
@ 2023-05-24  7:48 Hao Chen
  0 siblings, 0 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-24  7:48 UTC (permalink / raw)
  To: ffmpeg-devel

v1: Add LSX optimization in avcodec and swscale, because the 2K series CPUs only support LSX.
v2: Modified the implementation of some functions and added support for the checkasm --bench feature.
v3: Fix whitespace errors in patch.
v4: Remove the memory clobber in libavutil/loongarch/timer.h

[PATCH v4 1/7] avcodec/la: add LSX optimization for h264 idct.
[PATCH v4 2/7] avcodec/la: Add LSX optimization for loop filter.
[PATCH v4 3/7] avcodec/la: Add LSX optimization for h264 chroma and
[PATCH v4 4/7] avcodec/la: Add LSX optimization for h264 qpel.
[PATCH v4 5/7] swscale/la: Optimize the functions of the swscale
[PATCH v4 6/7] swscale/la: Add following builtin optimized functions
[PATCH v4 7/7] avutil/la: Add function performance testing

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
  2023-05-20  7:27 Hao Chen
@ 2023-05-22  4:40 ` Shiyou Yin
  2023-05-25 19:07   ` Michael Niedermayer
  0 siblings, 1 reply; 18+ messages in thread
From: Shiyou Yin @ 2023-05-22  4:40 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Michael Niedermayer



> On 20 May 2023, at 15:27, Hao Chen <chenhao@loongson.cn> wrote:
> 
> Retrigger the fate test.
> v1: Add LSX optimization in avcodec and swscale, due to the 2K series CPUs only support lsx.
> v2: Modified the implementation of some functions and added support for the checkasm --bench feature.
> v3: Fix whitespace errors in patch.
> 
> [PATCH v3 1/7] avcodec/la: add LSX optimization for h264 idct.
> [PATCH v3 2/7] avcodec/la: Add LSX optimization for loop filter.
> [PATCH v3 3/7] avcodec/la: Add LSX optimization for h264 chroma and
> [PATCH v3 4/7] avcodec/la: Add LSX optimization for h264 qpel.
> [PATCH v3 5/7] swscale/la: Optimize the functions of the swscale
> [PATCH v3 6/7] swscale/la: Add following builtin optimized functions
> [PATCH v3 7/7] avutil/la: Add function performance testing
> 
> 
LGTM.

Michael, please help to review and merge this patchset.
FFmpeg added checkasm for h264chroma recently, and this patchset happens to fix the failure on LA.


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
@ 2023-05-20  7:27 Hao Chen
  2023-05-22  4:40 ` Shiyou Yin
  0 siblings, 1 reply; 18+ messages in thread
From: Hao Chen @ 2023-05-20  7:27 UTC (permalink / raw)
  To: ffmpeg-devel

Retrigger the fate test.
v1: Add LSX optimization in avcodec and swscale, because the 2K series CPUs only support LSX.
v2: Modified the implementation of some functions and added support for the checkasm --bench feature.
v3: Fix whitespace errors in patch.

[PATCH v3 1/7] avcodec/la: add LSX optimization for h264 idct.
[PATCH v3 2/7] avcodec/la: Add LSX optimization for loop filter.
[PATCH v3 3/7] avcodec/la: Add LSX optimization for h264 chroma and
[PATCH v3 4/7] avcodec/la: Add LSX optimization for h264 qpel.
[PATCH v3 5/7] swscale/la: Optimize the functions of the swscale
[PATCH v3 6/7] swscale/la: Add following builtin optimized functions
[PATCH v3 7/7] avutil/la: Add function performance testing

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
@ 2023-05-20  1:56 Hao Chen
  0 siblings, 0 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-20  1:56 UTC (permalink / raw)
  To: ffmpeg-devel

v1: Add LSX optimization in avcodec and swscale, because the 2K series CPUs only support LSX.
v2: Modified the implementation of some functions and added support for the checkasm --bench feature.
v3: Fix whitespace errors in patch.

[PATCH v3 1/7] avcodec/la: add LSX optimization for h264 idct.
[PATCH v3 2/7] avcodec/la: Add LSX optimization for loop filter.
[PATCH v3 3/7] avcodec/la: Add LSX optimization for h264 chroma and
[PATCH v3 4/7] avcodec/la: Add LSX optimization for h264 qpel.
[PATCH v3 5/7] swscale/la: Optimize the functions of the swscale
[PATCH v3 6/7] swscale/la: Add following builtin optimized functions
[PATCH v3 7/7] avutil/la: Add function performance testing

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
  2023-05-17  7:03 Hao Chen
@ 2023-05-17  8:27 ` Shiyou Yin
  0 siblings, 0 replies; 18+ messages in thread
From: Shiyou Yin @ 2023-05-17  8:27 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



> On 17 May 2023, at 15:03, Hao Chen <chenhao@loongson.cn> wrote:
> 
> v1: Add LSX optimization in avcodec and swscale, due to the 2K series CPUs only support lsx.
> v2: Modified the implementation of some functions and added support for the checkasm --bench feature.
> 
> [PATCH v2 1/7] avcodec/la: add LSX optimization for h264 idct.
> [PATCH v2 2/7] avcodec/la: Add LSX optimization for loop filter.
> [PATCH v2 3/7] avcodec/la: Add LSX optimization for h264 chroma and
> [PATCH v2 4/7] avcodec/la: Add LSX optimization for h264 qpel.
> [PATCH v2 5/7] swscale/la: Optimize the functions of the swscale.
> [PATCH v2 6/7] swscale/la: Add following builtin optimized functions.
> [PATCH v2 7/7] avutil/la: Add function performance testing.
> 
> _______________________________________________
> 

LGTM

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
@ 2023-05-17  7:03 Hao Chen
  2023-05-17  8:27 ` Shiyou Yin
  0 siblings, 1 reply; 18+ messages in thread
From: Hao Chen @ 2023-05-17  7:03 UTC (permalink / raw)
  To: ffmpeg-devel

v1: Add LSX optimization in avcodec and swscale, because the 2K series CPUs only support LSX.
v2: Modified the implementation of some functions and added support for the checkasm --bench feature.

[PATCH v2 1/7] avcodec/la: add LSX optimization for h264 idct.
[PATCH v2 2/7] avcodec/la: Add LSX optimization for loop filter.
[PATCH v2 3/7] avcodec/la: Add LSX optimization for h264 chroma and
[PATCH v2 4/7] avcodec/la: Add LSX optimization for h264 qpel.
[PATCH v2 5/7] swscale/la: Optimize the functions of the swscale.
[PATCH v2 6/7] swscale/la: Add following builtin optimized functions.
[PATCH v2 7/7] avutil/la: Add function performance testing.

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [FFmpeg-devel] Add LSX optimization in avcodec and swscale.
@ 2023-05-17  6:54 Hao Chen
  0 siblings, 0 replies; 18+ messages in thread
From: Hao Chen @ 2023-05-17  6:54 UTC (permalink / raw)
  To: ffmpeg-devel

v1: Add LSX optimization in avcodec and swscale, because the 2K series CPUs only support LSX.
v2: Modified the implementation of some functions and added support for the checkasm --bench feature.
[PATCH v2 1/7] avcodec/la: add LSX optimization for h264 idct.
[PATCH v2 2/7] avcodec/la: Add LSX optimization for loop filter.
[PATCH v2 3/7] avcodec/la: Add LSX optimization for h264 chroma and
[PATCH v2 4/7] avcodec/la: Add LSX optimization for h264 qpel.
[PATCH v2 5/7] swscale/la: Optimize the functions of the swscale
[PATCH v2 6/7] swscale/la: Add following builtin optimized functions
[PATCH v2 7/7] avutil/la: Add function performance testing

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2023-05-25 19:07 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-04  8:49 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 1/6] avcodec/la: add LSX optimization for h264 idct Hao Chen
2023-05-09  2:47   ` yinshiyou-hf
2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 2/6] avcodec/la: Add LSX optimization for loop filter Hao Chen
2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 3/6] avcodec/la: Add LSX optimization for h264 chroma and intrapred Hao Chen
2023-05-11  7:19   ` Shiyou Yin
2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 4/6] avcodec/la: Add LSX optimization for h264 qpel Hao Chen
2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 5/6] swscale/la: Optimize the functions of the swscale series with lsx Hao Chen
2023-05-04  8:49 ` [FFmpeg-devel] [PATCH v1 6/6] swscale/la: Add following builtin optimized functions Hao Chen
2023-05-17  6:54 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
2023-05-17  7:03 Hao Chen
2023-05-17  8:27 ` Shiyou Yin
2023-05-20  1:56 Hao Chen
2023-05-20  7:27 Hao Chen
2023-05-22  4:40 ` Shiyou Yin
2023-05-25 19:07   ` Michael Niedermayer
2023-05-24  7:48 Hao Chen
2023-05-25  7:24 Hao Chen

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git