Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] Optimize HEVC decoding for loongarch
From: Hao Chen @ 2022-02-14  8:25 UTC (permalink / raw)
  To: ffmpeg-devel

ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 94fps
after : 191fps
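
The command decodes to raw video and discards the output (-an also drops the
audio stream), so the figures above essentially measure decode throughput
alone. One way to have ffmpeg print the timing itself, assuming nothing
beyond the stock -benchmark option, is:

  ffmpeg -benchmark -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an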

[PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
[PATCH v1 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
[PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
[PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
[PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
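
The new code paths are selected at compile time; a minimal sketch of the
gating pattern, using only the __loongarch_sx/__loongarch_asx predefines and
the two intrinsics headers that appear in patch 1/5:

  #ifdef __loongarch_sx
  #include <lsxintrin.h>    /* 128-bit LSX vector intrinsics (__m128i) */
  #endif
  #ifdef __loongarch_asx
  #include <lasxintrin.h>   /* 256-bit LASX vector intrinsics (__m256i) */
  #endif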


* [FFmpeg-devel] [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
From: Hao Chen @ 2022-02-14  8:25 UTC (permalink / raw)
  To: ffmpeg-devel

The loongson_intrinsics.h file is updated from v1.0.3 to v1.1.0.
Spelling mistakes are fixed and new functions are added.

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
 1 file changed, 884 insertions(+), 813 deletions(-)
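
For orientation, a minimal usage sketch (not part of the patch): the
LSOM_VERSION_* macros, DUP2_ARG2, __lsx_vdp2_h_b and __lsx_vadd_h below all
come from the updated header, while dp2_sum itself is a hypothetical example
and assumes a compiler targeting LoongArch with LSX enabled:

  #include "libavutil/loongarch/loongson_intrinsics.h"

  #if LSOM_VERSION_MAJOR == 1 && LSOM_VERSION_MINOR >= 1
  /* header is v1.1.0 or newer within the 1.x series */
  #endif

  /* Two byte-wise dot products summed; DUP2_ARG2 expands to
   * p0 = __lsx_vdp2_h_b(a0, b0); p1 = __lsx_vdp2_h_b(a1, b1); */
  static inline __m128i dp2_sum(__m128i a0, __m128i b0,
                                __m128i a1, __m128i b1)
  {
      __m128i p0, p1;
      DUP2_ARG2(__lsx_vdp2_h_b, a0, b0, a1, b1, p0, p1);
      return __lsx_vadd_h(p0, p1);
  }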

diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
index 6e0439f829..eb256863c8 100644
--- a/libavutil/loongarch/loongson_intrinsics.h
+++ b/libavutil/loongarch/loongson_intrinsics.h
@@ -33,7 +33,7 @@
  *                Xiwei Gu   <guxiwei-hf@loongson.cn>
  *                Lu Wang    <wanglu@loongson.cn>
  *
- * This file is a header file for loongarch builtin extention.
+ * This file is a header file for loongarch builtin extension.
  *
  */
 
@@ -42,50 +42,50 @@
 
 /**
  * MAJOR version: Macro usage changes.
- * MINOR version: Add new functions, or bug fix.
+ * MINOR version: Add new functions, or fix bugs.
  * MICRO version: Comment changes or implementation changes.
  */
 #define LSOM_VERSION_MAJOR 1
-#define LSOM_VERSION_MINOR 0
-#define LSOM_VERSION_MICRO 3
+#define LSOM_VERSION_MINOR 1
+#define LSOM_VERSION_MICRO 0
 
 #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
-{ \
-    _OUT0 = _INS(_IN0); \
-    _OUT1 = _INS(_IN1); \
-}
+  {                                               \
+    _OUT0 = _INS(_IN0);                           \
+    _OUT1 = _INS(_IN1);                           \
+  }
 
 #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
-{ \
-    _OUT0 = _INS(_IN0, _IN1); \
-    _OUT1 = _INS(_IN2, _IN3); \
-}
+  {                                                           \
+    _OUT0 = _INS(_IN0, _IN1);                                 \
+    _OUT1 = _INS(_IN2, _IN3);                                 \
+  }
 
 #define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
-{ \
-    _OUT0 = _INS(_IN0, _IN1, _IN2); \
-    _OUT1 = _INS(_IN3, _IN4, _IN5); \
-}
+  {                                                                       \
+    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
+    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
+  }
 
 #define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
-    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
-    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
-}
-
-#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
-                  _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
-    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
-    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
-}
-
-#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
-                  _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
-    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4,  _IN5,  _OUT0, _OUT1); \
-    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
-}
+  {                                                                         \
+    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
+    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
+  }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+                  _OUT1, _OUT2, _OUT3)                                         \
+  {                                                                            \
+    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
+    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
+  }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
+  {                                                                           \
+    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
+    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
+  }
 
 #ifdef __loongarch_sx
 #include <lsxintrin.h>
@@ -94,11 +94,11 @@
  * Description : Dot product & addition of byte vector elements
  * Arguments   : Inputs  - in_c, in_h, in_l
  *               Outputs - out
- *               Retrun Type - halfword
+ *               Return Type - halfword
  * Details     : Signed byte elements from in_h are multiplied by
  *               signed byte elements from in_l, and then added adjacent to
  *               each other to get results with the twice size of input.
- *               Then the results plus to signed half word elements from in_c.
+ *               Then the results are added to signed half-word elements from in_c.
  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
  *        in_c : 1,2,3,4, 1,2,3,4
  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
@@ -106,13 +106,13 @@
  *         out : 23,40,41,26, 23,40,41,26
  * =============================================================================
  */
-static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
-{
-    __m128i out;
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
 
-    out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
-    out = __lsx_vmaddwod_h_b(out, in_h, in_l);
-    return out;
+  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -120,35 +120,61 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l
  * Description : Dot product & addition of byte vector elements
  * Arguments   : Inputs  - in_c, in_h, in_l
  *               Outputs - out
- *               Retrun Type - halfword
+ *               Return Type - halfword
  * Details     : Unsigned byte elements from in_h are multiplied by
  *               unsigned byte elements from in_l, and then added adjacent to
  *               each other to get results with the twice size of input.
- *               The results plus to signed half word elements from in_c.
- * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
  *        in_c : 1,2,3,4, 1,2,3,4
  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
  *         out : 23,40,41,26, 23,40,41,26
  * =============================================================================
  */
-static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
-{
-    __m128i out;
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
+                                         __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get results with the twice size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ *        in_c : 1,1,1,1, 1,1,1,1
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ *         out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
+                                           __m128i in_l) {
+  __m128i out;
 
-    out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
-    out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
-    return out;
+  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
 }
 
 /*
  * =============================================================================
- * Description : Dot product & addition of half word vector elements
+ * Description : Dot product & addition of half-word vector elements
  * Arguments   : Inputs  - in_c, in_h, in_l
  *               Outputs - out
- *               Retrun Type - __m128i
- * Details     : Signed half word elements from in_h are multiplied by
- *               signed half word elements from in_l, and then added adjacent to
+ *               Return Type - __m128i
+ * Details     : Signed half-word elements from in_h are multiplied by
+ *               signed half-word elements from in_l, and then added adjacent to
  *               each other to get results with the twice size of input.
 *               Then the results are added to signed word elements from in_c.
 * Example     : out = __lsx_vdp2add_w_h(in_c, in_h, in_l)
@@ -158,13 +184,13 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_
  *         out : 23,40,41,26
  * =============================================================================
  */
-static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
-{
-    __m128i out;
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
 
-    out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
-    out = __lsx_vmaddwod_w_h(out, in_h, in_l);
-    return out;
+  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -172,7 +198,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
  * Description : Dot product of byte vector elements
  * Arguments   : Inputs  - in_h, in_l
  *               Outputs - out
- *               Retrun Type - halfword
+ *               Return Type - halfword
  * Details     : Signed byte elements from in_h are multiplied by
  *               signed byte elements from in_l, and then added adjacent to
  *               each other to get results with the twice size of input.
@@ -182,13 +208,12 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
  *         out : 22,38,38,22, 22,38,38,22
  * =============================================================================
  */
-static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
-{
-    __m128i out;
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
 
-    out = __lsx_vmulwev_h_b(in_h, in_l);
-    out = __lsx_vmaddwod_h_b(out, in_h, in_l);
-    return out;
+  out = __lsx_vmulwev_h_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -196,7 +221,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
  * Description : Dot product of byte vector elements
  * Arguments   : Inputs  - in_h, in_l
  *               Outputs - out
- *               Retrun Type - halfword
+ *               Return Type - halfword
  * Details     : Unsigned byte elements from in_h are multiplied by
  *               unsigned byte elements from in_l, and then added adjacent to
  *               each other to get results with the twice size of input.
@@ -206,13 +231,12 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
  *         out : 22,38,38,22, 22,38,38,22
  * =============================================================================
  */
-static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
-{
-    __m128i out;
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+  __m128i out;
 
-    out = __lsx_vmulwev_h_bu(in_h, in_l);
-    out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
-    return out;
+  out = __lsx_vmulwev_h_bu(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -220,7 +244,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
  * Description : Dot product of byte vector elements
  * Arguments   : Inputs  - in_h, in_l
  *               Outputs - out
- *               Retrun Type - halfword
+ *               Return Type - halfword
  * Details     : Unsigned byte elements from in_h are multiplied by
  *               signed byte elements from in_l, and then added adjacent to
  *               each other to get results with the twice size of input.
@@ -230,13 +254,12 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
  *         out : 22,38,38,22, 22,38,38,6
  * =============================================================================
  */
-static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
-{
-    __m128i out;
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
 
-    out = __lsx_vmulwev_h_bu_b(in_h, in_l);
-    out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
-    return out;
+  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -244,7 +267,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
  * Description : Dot product of byte vector elements
  * Arguments   : Inputs  - in_h, in_l
  *               Outputs - out
- *               Retrun Type - halfword
+ *               Return Type - halfword
  * Details     : Signed byte elements from in_h are multiplied by
  *               signed byte elements from in_l, and then added adjacent to
  *               each other to get results with the twice size of input.
@@ -254,19 +277,19 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
  *         out : 22,38,38,22
  * =============================================================================
  */
-static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
-{
-    __m128i out;
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+  __m128i out;
 
-    out = __lsx_vmulwev_w_h(in_h, in_l);
-    out = __lsx_vmaddwod_w_h(out, in_h, in_l);
-    return out;
+  out = __lsx_vmulwev_w_h(in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
 }
 
 /*
  * =============================================================================
  * Description : Clip all halfword elements of input vector between min & max
- *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
+ *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
+ *               (_in))
  * Arguments   : Inputs  - _in  (input vector)
  *                       - min  (min threshold)
  *                       - max  (max threshold)
@@ -279,13 +302,12 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
  *         out : 1,2,9,9, 1,9,9,9
  * =============================================================================
  */
-static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
-{
-    __m128i out;
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+  __m128i out;
 
-    out = __lsx_vmax_h(min, _in);
-    out = __lsx_vmin_h(max, out);
-    return out;
+  out = __lsx_vmax_h(min, _in);
+  out = __lsx_vmin_h(max, out);
+  return out;
 }
 
 /*
@@ -293,20 +315,19 @@ static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
  * Description : Set each element of vector between 0 and 255
  * Arguments   : Inputs  - _in
  *               Outputs - out
- *               Retrun Type - halfword
+ *               Return Type - halfword
 * Details     : Signed half-word elements from _in are clamped between 0 and 255.
  * Example     : out = __lsx_vclip255_h(_in)
  *         _in : -8,255,280,249, -8,255,280,249
  *         out : 0,255,255,249, 0,255,255,249
  * =============================================================================
  */
-static inline __m128i __lsx_vclip255_h(__m128i _in)
-{
-    __m128i out;
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+  __m128i out;
 
-    out = __lsx_vmaxi_h(_in, 0);
-    out = __lsx_vsat_hu(out, 7);
-    return out;
+  out = __lsx_vmaxi_h(_in, 0);
+  out = __lsx_vsat_hu(out, 7);
+  return out;
 }
 
 /*
@@ -314,20 +335,19 @@ static inline __m128i __lsx_vclip255_h(__m128i _in)
  * Description : Set each element of vector between 0 and 255
  * Arguments   : Inputs  - _in
  *               Outputs - out
- *               Retrun Type - word
+ *               Return Type - word
 * Details     : Signed word elements from _in are clamped between 0 and 255.
  * Example     : out = __lsx_vclip255_w(_in)
  *         _in : -8,255,280,249
  *         out : 0,255,255,249
  * =============================================================================
  */
-static inline __m128i __lsx_vclip255_w(__m128i _in)
-{
-    __m128i out;
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+  __m128i out;
 
-    out = __lsx_vmaxi_w(_in, 0);
-    out = __lsx_vsat_wu(out, 7);
-    return out;
+  out = __lsx_vmaxi_w(_in, 0);
+  out = __lsx_vsat_wu(out, 7);
+  return out;
 }
 
 /*
@@ -343,12 +363,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  *   _in1(out) : 1,2,3,4
  * =============================================================================
  */
-#define LSX_SWAP(_in0, _in1)                                            \
-{                                                                       \
-    _in0 = __lsx_vxor_v(_in0, _in1);                                    \
-    _in1 = __lsx_vxor_v(_in0, _in1);                                    \
-    _in0 = __lsx_vxor_v(_in0, _in1);                                    \
-}                                                                       \
+#define LSX_SWAP(_in0, _in1)         \
+  {                                  \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+    _in1 = __lsx_vxor_v(_in0, _in1); \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+  }
 
 /*
  * =============================================================================
@@ -364,25 +384,27 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  * =============================================================================
  */
 #define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{                                                                              \
+  {                                                                            \
     __m128i _t0, _t1, _t2, _t3;                                                \
                                                                                \
-    _t0   = __lsx_vilvl_w(_in1, _in0);                                         \
-    _t1   = __lsx_vilvh_w(_in1, _in0);                                         \
-    _t2   = __lsx_vilvl_w(_in3, _in2);                                         \
-    _t3   = __lsx_vilvh_w(_in3, _in2);                                         \
+    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
+    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
+    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
+    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
     _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
     _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
     _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
     _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
-}
+  }
 
 /*
  * =============================================================================
  * Description : Transpose 8x8 block with byte elements in vectors
  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
- *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * Details     : The rows of the matrix become columns, and the columns become rows.
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Details     : The rows of the matrix become columns, and the columns
+ *               become rows.
  * Example     : LSX_TRANSPOSE8x8_B
  *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
  *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
@@ -403,34 +425,35 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
  * =============================================================================
  */
-#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                 \
-   __m128i zero = {0};                                                            \
-   __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110};                      \
-   __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                                \
-                                                                                  \
-   _t0 = __lsx_vilvl_b(_in2, _in0);                                               \
-   _t1 = __lsx_vilvl_b(_in3, _in1);                                               \
-   _t2 = __lsx_vilvl_b(_in6, _in4);                                               \
-   _t3 = __lsx_vilvl_b(_in7, _in5);                                               \
-   _t4 = __lsx_vilvl_b(_t1, _t0);                                                 \
-   _t5 = __lsx_vilvh_b(_t1, _t0);                                                 \
-   _t6 = __lsx_vilvl_b(_t3, _t2);                                                 \
-   _t7 = __lsx_vilvh_b(_t3, _t2);                                                 \
-   _out0 = __lsx_vilvl_w(_t6, _t4);                                               \
-   _out2 = __lsx_vilvh_w(_t6, _t4);                                               \
-   _out4 = __lsx_vilvl_w(_t7, _t5);                                               \
-   _out6 = __lsx_vilvh_w(_t7, _t5);                                               \
-   _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                                     \
-   _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                                     \
-   _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                                     \
-   _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                                     \
-}
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i zero = { 0 };                                                   \
+    __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };             \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
+                                                                            \
+    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
+    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
+    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
+    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
+    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
+    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
+    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
+    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
+    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
+    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
+    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
+    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
+    _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                              \
+    _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                              \
+    _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                              \
+    _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                              \
+  }
 
 /*
  * =============================================================================
- * Description : Transpose 8x8 block with half word elements in vectors
+ * Description : Transpose 8x8 block with half-word elements in vectors
  * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
  *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
  * Details     :
@@ -445,37 +468,38 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
  * =============================================================================
  */
-#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                 \
-    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                     \
-                                                                                  \
-    _s0 = __lsx_vilvl_h(_in6, _in4);                                              \
-    _s1 = __lsx_vilvl_h(_in7, _in5);                                              \
-    _t0 = __lsx_vilvl_h(_s1, _s0);                                                \
-    _t1 = __lsx_vilvh_h(_s1, _s0);                                                \
-    _s0 = __lsx_vilvh_h(_in6, _in4);                                              \
-    _s1 = __lsx_vilvh_h(_in7, _in5);                                              \
-    _t2 = __lsx_vilvl_h(_s1, _s0);                                                \
-    _t3 = __lsx_vilvh_h(_s1, _s0);                                                \
-    _s0 = __lsx_vilvl_h(_in2, _in0);                                              \
-    _s1 = __lsx_vilvl_h(_in3, _in1);                                              \
-    _t4 = __lsx_vilvl_h(_s1, _s0);                                                \
-    _t5 = __lsx_vilvh_h(_s1, _s0);                                                \
-    _s0 = __lsx_vilvh_h(_in2, _in0);                                              \
-    _s1 = __lsx_vilvh_h(_in3, _in1);                                              \
-    _t6 = __lsx_vilvl_h(_s1, _s0);                                                \
-    _t7 = __lsx_vilvh_h(_s1, _s0);                                                \
-                                                                                  \
-    _out0 = __lsx_vpickev_d(_t0, _t4);                                            \
-    _out2 = __lsx_vpickev_d(_t1, _t5);                                            \
-    _out4 = __lsx_vpickev_d(_t2, _t6);                                            \
-    _out6 = __lsx_vpickev_d(_t3, _t7);                                            \
-    _out1 = __lsx_vpickod_d(_t0, _t4);                                            \
-    _out3 = __lsx_vpickod_d(_t1, _t5);                                            \
-    _out5 = __lsx_vpickod_d(_t2, _t6);                                            \
-    _out7 = __lsx_vpickod_d(_t3, _t7);                                            \
-}
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
+                                                                            \
+    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
+    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
+    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
+    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
+                                                                            \
+    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
+    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
+    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
+    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
+    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
+    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
+    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
+    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
+  }
 
 /*
  * =============================================================================
@@ -483,7 +507,8 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  * Arguments   : Inputs  - _in0, _in1, _in2, _in3      (input 8x4 byte block)
  *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
  *               Return Type - as per RTYPE
- * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
  * Example     : LSX_TRANSPOSE8x4_B
  *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
  *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
@@ -500,26 +525,26 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
  * =============================================================================
  */
-#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,       \
-                           _out0, _out1, _out2, _out3)                           \
-{                                                                                \
-    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                  \
-                                                                                 \
-    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                       \
-    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                       \
-    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                                   \
-    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                       \
-    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                       \
-                                                                                 \
-    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                                   \
-    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                                   \
-    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                                   \
-                                                                                 \
-    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                                     \
-    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                                     \
-    _out1 = __lsx_vilvh_d(_out2, _out0);                                         \
-    _out3 = __lsx_vilvh_d(_out0, _out2);                                         \
-}
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+                           _out0, _out1, _out2, _out3)                     \
+  {                                                                        \
+    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
+                                                                           \
+    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
+    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
+                                                                           \
+    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
+    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
+                                                                           \
+    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
+    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
+    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
+    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
+  }
 
 /*
  * =============================================================================
@@ -547,29 +572,30 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  *              120,121,122,123,124,125,126,127
  * =============================================================================
  */
-#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8,  \
-                            _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
-                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)       \
-{                                                                                  \
-    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;                \
-    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                                \
-    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5,       \
-              _tmp0, _tmp1, _tmp2, _tmp3);                                         \
-    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,        \
-              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                                  \
-    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);                \
-    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);                \
-    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);                \
-    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);                \
-    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);                    \
-    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);                    \
-    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);                    \
-    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);                    \
-    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);            \
-    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);            \
-    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);            \
-    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);            \
-}
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                            _out6, _out7)                                    \
+  {                                                                          \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
+    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
+    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
+              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
+    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
+    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
+  }
 
 /*
  * =============================================================================
@@ -585,33 +611,33 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  * =============================================================================
  */
 #define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{                                                                             \
+  {                                                                           \
     _out0 = __lsx_vadd_b(_in0, _in3);                                         \
     _out1 = __lsx_vadd_b(_in1, _in2);                                         \
     _out2 = __lsx_vsub_b(_in1, _in2);                                         \
     _out3 = __lsx_vsub_b(_in0, _in3);                                         \
-}
+  }
 #define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{                                                                             \
+  {                                                                           \
     _out0 = __lsx_vadd_h(_in0, _in3);                                         \
     _out1 = __lsx_vadd_h(_in1, _in2);                                         \
     _out2 = __lsx_vsub_h(_in1, _in2);                                         \
     _out3 = __lsx_vsub_h(_in0, _in3);                                         \
-}
+  }
 #define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{                                                                             \
+  {                                                                           \
     _out0 = __lsx_vadd_w(_in0, _in3);                                         \
     _out1 = __lsx_vadd_w(_in1, _in2);                                         \
     _out2 = __lsx_vsub_w(_in1, _in2);                                         \
     _out3 = __lsx_vsub_w(_in0, _in3);                                         \
-}
+  }
 #define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{                                                                             \
+  {                                                                           \
     _out0 = __lsx_vadd_d(_in0, _in3);                                         \
     _out1 = __lsx_vadd_d(_in1, _in2);                                         \
     _out2 = __lsx_vsub_d(_in1, _in2);                                         \
     _out3 = __lsx_vsub_d(_in0, _in3);                                         \
-}
+  }
 
 /*
  * =============================================================================
@@ -630,59 +656,63 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  *              _out7 = _in0 - _in7;
  * =============================================================================
  */
-#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                \
-    _out0 = __lsx_vadd_b(_in0, _in7);                                            \
-    _out1 = __lsx_vadd_b(_in1, _in6);                                            \
-    _out2 = __lsx_vadd_b(_in2, _in5);                                            \
-    _out3 = __lsx_vadd_b(_in3, _in4);                                            \
-    _out4 = __lsx_vsub_b(_in3, _in4);                                            \
-    _out5 = __lsx_vsub_b(_in2, _in5);                                            \
-    _out6 = __lsx_vsub_b(_in1, _in6);                                            \
-    _out7 = __lsx_vsub_b(_in0, _in7);                                            \
-}
-
-#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                \
-    _out0 = __lsx_vadd_h(_in0, _in7);                                            \
-    _out1 = __lsx_vadd_h(_in1, _in6);                                            \
-    _out2 = __lsx_vadd_h(_in2, _in5);                                            \
-    _out3 = __lsx_vadd_h(_in3, _in4);                                            \
-    _out4 = __lsx_vsub_h(_in3, _in4);                                            \
-    _out5 = __lsx_vsub_h(_in2, _in5);                                            \
-    _out6 = __lsx_vsub_h(_in1, _in6);                                            \
-    _out7 = __lsx_vsub_h(_in0, _in7);                                            \
-}
-
-#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                \
-    _out0 = __lsx_vadd_w(_in0, _in7);                                            \
-    _out1 = __lsx_vadd_w(_in1, _in6);                                            \
-    _out2 = __lsx_vadd_w(_in2, _in5);                                            \
-    _out3 = __lsx_vadd_w(_in3, _in4);                                            \
-    _out4 = __lsx_vsub_w(_in3, _in4);                                            \
-    _out5 = __lsx_vsub_w(_in2, _in5);                                            \
-    _out6 = __lsx_vsub_w(_in1, _in6);                                            \
-    _out7 = __lsx_vsub_w(_in0, _in7);                                            \
-}
-
-#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                \
-    _out0 = __lsx_vadd_d(_in0, _in7);                                            \
-    _out1 = __lsx_vadd_d(_in1, _in6);                                            \
-    _out2 = __lsx_vadd_d(_in2, _in5);                                            \
-    _out3 = __lsx_vadd_d(_in3, _in4);                                            \
-    _out4 = __lsx_vsub_d(_in3, _in4);                                            \
-    _out5 = __lsx_vsub_d(_in2, _in5);                                            \
-    _out6 = __lsx_vsub_d(_in1, _in6);                                            \
-    _out7 = __lsx_vsub_d(_in0, _in7);                                            \
-}
-
-#endif //LSX
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_b(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_b(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_b(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_b(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_b(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_b(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_b(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_b(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_h(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_h(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_h(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_h(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_h(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_h(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_h(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_h(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_w(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_w(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_w(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_w(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_w(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_w(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_w(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_w(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_d(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_d(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_d(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_d(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_d(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_d(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_d(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_d(_in0, _in7);                                      \
+  }
+
+#endif  // LSX
 
 #ifdef __loongarch_asx
 #include <lasxintrin.h>
@@ -700,13 +730,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmulwev_h_bu(in_h, in_l);
-    out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -718,18 +747,17 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
  * Details     : Signed byte elements from in_h are multiplied with
  *               signed byte elements from in_l producing a result
  *               twice the size of input i.e. signed halfword.
- *               Then this iniplication results of adjacent odd-even elements
+ *               Then the multiplication results of adjacent odd-even elements
  *               are added to the out vector
  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmulwev_h_b(in_h, in_l);
-    out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmulwev_h_b(in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -749,13 +777,12 @@ static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
  *         out : 22,38,38,22, 22,38,38,22
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmulwev_w_h(in_h, in_l);
-    out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -763,22 +790,21 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
  * Description : Dot product of word vector elements
  * Arguments   : Inputs - in_h, in_l
  *               Output - out
- *               Retrun Type - signed double
+ *               Return Type - signed double
  * Details     : Signed word elements from in_h are multiplied with
  *               signed word elements from in_l producing a result
- *               twice the size of input i.e. signed double word.
+ *               twice the size of input i.e. signed double-word.
 *               Then the multiplied results of adjacent odd-even elements
  *               are added to the out vector.
  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmulwev_d_w(in_h, in_l);
-    out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmulwev_d_w(in_h, in_l);
+  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -795,13 +821,12 @@ static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
-    out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -809,7 +834,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
  * Description : Dot product & addition of byte vector elements
 * Arguments   : Inputs - in_c, in_h, in_l
  *               Output - out
- *               Retrun Type - halfword
+ *               Return Type - halfword
  * Details     : Signed byte elements from in_h are multiplied with
  *               signed byte elements from in_l producing a result
  *               twice the size of input i.e. signed halfword.
@@ -818,13 +843,59 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then the multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then the multiplication results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
-    out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
 }
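
A hedged usage sketch for the mixed-sign variant above (helper and variable
names are hypothetical): unsigned pixel bytes multiplied against signed
coefficient bytes, a pattern typical of pixel filtering.

static inline __m256i filter_acc_h(__m256i acc, __m256i pix, __m256i coef) {
  /* pix: unsigned bytes, coef: signed bytes; adjacent products are
     accumulated into signed halfword lanes on top of acc */
  return __lasx_xvdp2add_h_bu_b(acc, pix, coef);
}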
 
 /*
@@ -845,13 +916,13 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_
  *         out : 23,40,41,26, 23,40,41,26
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
-    out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
 }
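
A small sketch (hypothetical helper) chaining the widening dot product with
its accumulating form, e.g. for a four-tap halfword filter:

static inline __m256i dot4tap_w_h(__m256i s0, __m256i c0,
                                  __m256i s1, __m256i c1) {
  __m256i sum;

  sum = __lasx_xvdp2_w_h(s0, c0);         /* taps 0/1: widening dot product */
  sum = __lasx_xvdp2add_w_h(sum, s1, c1); /* taps 2/3: multiply-accumulate  */
  return sum;
}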
 
 /*
@@ -868,13 +939,13 @@ static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in
  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
-    out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -891,13 +962,13 @@ static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i i
  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
-    out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
-    return out;
+  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
 }
 
 /*
@@ -915,14 +986,14 @@ static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i
  * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
-{
-    __m256i out;
-
-    out = __lasx_xvmulwev_h_bu(in_h, in_l);
-    out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
-    out = __lasx_xvsub_h(in_c, out);
-    return out;
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  out = __lasx_xvsub_h(in_c, out);
+  return out;
 }
 
 /*
@@ -944,14 +1015,14 @@ static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i i
  *         out : -7,-3,0,0, 0,-1,0,-1
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
-    __m256i out;
-
-    out = __lasx_xvmulwev_w_h(in_h, in_l);
-    out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
-    out = __lasx_xvsub_w(in_c, out);
-    return out;
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvsub_w(in_c, out);
+  return out;
 }
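
A brief sketch (hypothetical names) combining the accumulating and
subtracting dot-product helpers in one expression chain:

static inline __m256i dot_diff_w_h(__m256i acc, __m256i p, __m256i cp,
                                   __m256i q, __m256i cq) {
  __m256i t;

  t = __lasx_xvdp2add_w_h(acc, p, cp); /* acc += dot(p, cp) */
  t = __lasx_xvdp2sub_w_h(t, q, cq);   /* t   -= dot(q, cq) */
  return t;
}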
 
 /*
@@ -960,10 +1031,10 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
  * Arguments   : Inputs - in_h, in_l
  *               Output - out
  *               Return Type - signed word
- * Details     : Signed halfword elements from in_h are iniplied with
+ * Details     : Signed halfword elements from in_h are multiplied with
  *               signed halfword elements from in_l producing a result
  *               four times the size of input i.e. signed doubleword.
- *               Then this iniplication results of four adjacent elements
+ *               Then the multiplication results of four adjacent elements
  *               are added together and stored to the out vector.
  * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
  *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
@@ -971,14 +1042,13 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
  *         out : -2,0,1,1
  * =============================================================================
  */
-static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
-
-    out = __lasx_xvmulwev_w_h(in_h, in_l);
-    out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
-    out = __lasx_xvhaddw_d_w(out, out);
-    return out;
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvhaddw_d_w(out, out);
+  return out;
 }
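
A minimal sketch (hypothetical helper) merging two four-element dot products
into one set of doubleword sums:

static inline __m256i dot4_pair_d_h(__m256i a0, __m256i b0,
                                    __m256i a1, __m256i b1) {
  __m256i d0, d1;

  d0 = __lasx_xvdp4_d_h(a0, b0); /* four-product sums, pair 0 */
  d1 = __lasx_xvdp4_d_h(a1, b1); /* four-product sums, pair 1 */
  return __lasx_xvadd_d(d0, d1); /* combine doubleword partial sums */
}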
 
 /*
@@ -993,13 +1063,12 @@ static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
  * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvilvh_b(in_h, in_l);
-    out = __lasx_xvhaddw_h_b(out, out);
-    return out;
+  out = __lasx_xvilvh_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
 }
 
 /*
@@ -1017,13 +1086,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
  *         out : 1,0,0,-1, 1,0,0, 2
  * =============================================================================
  */
- static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvilvh_h(in_h, in_l);
-    out = __lasx_xvhaddw_w_h(out, out);
-    return out;
+  out = __lasx_xvilvh_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
 }
 
 /*
@@ -1038,13 +1106,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvilvl_b(in_h, in_l);
-    out = __lasx_xvhaddw_h_b(out, out);
-    return out;
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
 }
 
 /*
@@ -1062,13 +1129,12 @@ static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
  *         out : 5,-1,4,2, 1,0,2,-1
  * =============================================================================
  */
-static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvilvl_h(in_h, in_l);
-    out = __lasx_xvhaddw_w_h(out, out);
-    return out;
+  out = __lasx_xvilvl_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
 }
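
A short sketch (hypothetical helper) widening a full halfword addition into
word lanes by pairing the low-half and high-half helpers:

static inline void addw_split_w_h(__m256i a, __m256i b,
                                  __m256i *lo, __m256i *hi) {
  *lo = __lasx_xvaddwl_w_h(a, b); /* words from the low  halfword halves */
  *hi = __lasx_xvaddwh_w_h(a, b); /* words from the high halfword halves */
}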
 
 /*
@@ -1083,13 +1149,12 @@ static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvilvl_b(in_h, in_l);
-    out = __lasx_xvhaddw_hu_bu(out, out);
-    return out;
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_hu_bu(out, out);
+  return out;
 }
 
 /*
@@ -1103,13 +1168,12 @@ static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
  * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvsllwil_hu_bu(in_l, 0);
-    out = __lasx_xvadd_h(in_h, out);
-    return out;
+  out = __lasx_xvsllwil_hu_bu(in_l, 0);
+  out = __lasx_xvadd_h(in_h, out);
+  return out;
 }
 
 /*
@@ -1126,13 +1190,12 @@ static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
  *         out : 2, 0,1,2, -1,0,1,1,
  * =============================================================================
  */
-static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
-{
-    __m256i out;
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
 
-    out = __lasx_xvsllwil_w_h(in_l, 0);
-    out = __lasx_xvadd_w(in_h, out);
-    return out;
+  out = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvadd_w(in_h, out);
+  return out;
 }
 
 /*
@@ -1153,15 +1216,15 @@ static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
  *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
  * =============================================================================
  */
-static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
-    __m256i tmp0, tmp1, out;
-
-    tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
-    tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
-    tmp0 = __lasx_xvmul_w(tmp0, tmp1);
-    out  = __lasx_xvadd_w(tmp0, in_c);
-    return out;
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
 }
 
 /*
@@ -1177,15 +1240,15 @@ static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in
  * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
  * =============================================================================
  */
-static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
-    __m256i tmp0, tmp1, out;
-
-    tmp0 = __lasx_xvilvh_h(in_h, in_h);
-    tmp1 = __lasx_xvilvh_h(in_l, in_l);
-    tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
-    out  = __lasx_xvadd_w(tmp0, in_c);
-    return out;
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
 }
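
A sketch (hypothetical helper) of the usual pairing of the low-half and
high-half multiply-accumulate helpers so that all lanes are covered:

static inline void madd_split_w_h(__m256i acc_l, __m256i acc_h,
                                  __m256i a, __m256i b,
                                  __m256i *out_l, __m256i *out_h) {
  *out_l = __lasx_xvmaddwl_w_h(acc_l, a, b); /* low halves  -> word MAC */
  *out_h = __lasx_xvmaddwh_w_h(acc_h, a, b); /* high halves -> word MAC */
}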
 
 /*
@@ -1203,14 +1266,13 @@ static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in
  *         out : 6,1,3,0, 0,0,1,0
  * =============================================================================
  */
-static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
-{
-    __m256i tmp0, tmp1, out;
-
-    tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
-    tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
-    out  = __lasx_xvmul_w(tmp0, tmp1);
-    return out;
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvmul_w(tmp0, tmp1);
+  return out;
 }
 
 /*
@@ -1228,39 +1290,39 @@ static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
  *         out : 0,0,0,0, 0,0,0,1
  * =============================================================================
  */
-static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
-{
-    __m256i tmp0, tmp1, out;
-
-    tmp0 = __lasx_xvilvh_h(in_h, in_h);
-    tmp1 = __lasx_xvilvh_h(in_l, in_l);
-    out  = __lasx_xvmulwev_w_h(tmp0, tmp1);
-    return out;
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  return out;
 }
 
 /*
  * =============================================================================
- * Description : The low half of the vector elements are expanded and
- *               added saturately after being doubled.
+ * Description : The low half of the in_l vector elements are zero-extended
+ *               to double width and added to in_h with saturation.
  * Arguments   : Inputs - in_h, in_l
  *               Output - out
- * Details     : The in_h vector adds the in_l vector saturately after the lower
- *               half of the two-fold zero extension (unsigned byte to unsigned
- *               halfword) and the results are stored to the out vector.
+ * Details     : The lower half of the in_l byte elements are zero-extended
+ *               two-fold (unsigned byte to unsigned halfword) and added to
+ *               the in_h vector with unsigned saturation. The results are
+ *               stored to the out vector.
  * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
  *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
- *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
- *         out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ *               0,0,0,1
+ *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
  * =============================================================================
  */
-static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
-{
-    __m256i tmp1, out;
-    __m256i zero = {0};
-
-    tmp1 = __lasx_xvilvl_b(zero, in_l);
-    out  = __lasx_xvsadd_hu(in_h, tmp1);
-    return out;
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+  __m256i tmp1, out;
+  __m256i zero = { 0 };
+
+  tmp1 = __lasx_xvilvl_b(zero, in_l);
+  out = __lasx_xvsadd_hu(in_h, tmp1);
+  return out;
 }
 
 /*
@@ -1279,13 +1341,12 @@ static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
  *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
  * =============================================================================
  */
-static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
-{
-    __m256i out;
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+  __m256i out;
 
-    out = __lasx_xvmax_h(min, in);
-    out = __lasx_xvmin_h(max, out);
-    return out;
+  out = __lasx_xvmax_h(min, in);
+  out = __lasx_xvmin_h(max, out);
+  return out;
 }
 
 /*
@@ -1298,13 +1359,12 @@ static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
  * Example     : See out = __lasx_xvclip255_w(in)
  * =============================================================================
  */
-static inline __m256i __lasx_xvclip255_h(__m256i in)
-{
-    __m256i out;
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+  __m256i out;
 
-    out = __lasx_xvmaxi_h(in, 0);
-    out = __lasx_xvsat_hu(out, 7);
-    return out;
+  out = __lasx_xvmaxi_h(in, 0);
+  out = __lasx_xvsat_hu(out, 7);
+  return out;
 }
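
A minimal sketch (hypothetical helper): clamping a halfword sum back to the
unsigned 8-bit pixel range with the helper above:

static inline __m256i add_clip255_h(__m256i a, __m256i b) {
  __m256i sum;

  sum = __lasx_xvadd_h(a, b);     /* sum may leave the 0..255 range */
  return __lasx_xvclip255_h(sum); /* clamp to the pixel range       */
}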
 
 /*
@@ -1319,20 +1379,19 @@ static inline __m256i __lasx_xvclip255_h(__m256i in)
  *         out :  0,255,255,249,  0,255,255,249
  * =============================================================================
  */
-static inline __m256i __lasx_xvclip255_w(__m256i in)
-{
-    __m256i out;
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+  __m256i out;
 
-    out = __lasx_xvmaxi_w(in, 0);
-    out = __lasx_xvsat_wu(out, 7);
-    return out;
+  out = __lasx_xvmaxi_w(in, 0);
+  out = __lasx_xvsat_wu(out, 7);
+  return out;
 }
 
 /*
  * =============================================================================
  * Description : Indexed halfword element values are replicated to all
- *               elements in output vector. If 'indx < 8' use xvsplati_l_*,
- *               if 'indx >= 8' use xvsplati_h_*.
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
  * Arguments   : Inputs - in, idx
  *               Output - out
  * Details     : Idx element value from in vector is replicated to all
@@ -1344,20 +1403,19 @@ static inline __m256i __lasx_xvclip255_w(__m256i in)
  *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
  * =============================================================================
  */
-static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
-{
-    __m256i out;
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+  __m256i out;
 
-    out = __lasx_xvpermi_q(in, in, 0x02);
-    out = __lasx_xvreplve_h(out, idx);
-    return out;
+  out = __lasx_xvpermi_q(in, in, 0x02);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
 }
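
A tiny sketch (hypothetical helper) broadcasting one filter coefficient from
the low 128-bit half to every halfword lane:

static inline __m256i broadcast_coef3_h(__m256i filt) {
  /* replicate halfword element 3 of the low half to all lanes */
  return __lasx_xvsplati_l_h(filt, 3);
}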
 
 /*
  * =============================================================================
  * Description : Indexed halfword element values are replicated to all
- *               elements in output vector. If 'indx < 8' use xvsplati_l_*,
- *               if 'indx >= 8' use xvsplati_h_*.
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
  * Arguments   : Inputs - in, idx
  *               Output - out
  * Details     : Idx element value from in vector is replicated to all
@@ -1369,18 +1427,17 @@ static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
  *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
  * =============================================================================
  */
-static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
-{
-    __m256i out;
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+  __m256i out;
 
-    out = __lasx_xvpermi_q(in, in, 0x13);
-    out = __lasx_xvreplve_h(out, idx);
-    return out;
+  out = __lasx_xvpermi_q(in, in, 0x13);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
 }
 
 /*
  * =============================================================================
- * Description : Transpose 4x4 block with double word elements in vectors
+ * Description : Transpose 4x4 block with double-word elements in vectors
  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
  *               Outputs - _out0, _out1, _out2, _out3
  * Example     : LASX_TRANSPOSE4x4_D
@@ -1395,24 +1452,26 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  *       _out3 : 4,4,4,4
  * =============================================================================
  */
-#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{                                                                               \
-    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                         \
-    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                        \
-    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                        \
-    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                        \
-    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                        \
-    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                               \
-    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                               \
-    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                               \
-    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                               \
-}
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                  \
+    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                 \
+    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                 \
+    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                 \
+    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
+    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
+  }
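
A usage sketch (hypothetical wrapper) showing how the transpose macro is
invoked with separate output variables and the results written back:

static inline void transpose4x4_d(__m256i v[4]) {
  __m256i r0, r1, r2, r3;

  LASX_TRANSPOSE4x4_D(v[0], v[1], v[2], v[3], r0, r1, r2, r3);
  v[0] = r0;
  v[1] = r1;
  v[2] = r2;
  v[3] = r3;
}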
 
 /*
  * =============================================================================
  * Description : Transpose 8x8 block with word elements in vectors
  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
- *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
  * Example     : LASX_TRANSPOSE8x8_W
  *        _in0 : 1,2,3,4,5,6,7,8
  *        _in1 : 2,2,3,4,5,6,7,8
@@ -1433,38 +1492,39 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  *       _out7 : 8,8,8,8,8,8,8,8
  * =============================================================================
  */
-#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
-                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{                                                                                   \
-    __m256i _s0_m, _s1_m;                                                           \
-    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                     \
-    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                     \
-                                                                                    \
-    _s0_m   = __lasx_xvilvl_w(_in2, _in0);                                          \
-    _s1_m   = __lasx_xvilvl_w(_in3, _in1);                                          \
-    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                        \
-    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                        \
-    _s0_m   = __lasx_xvilvh_w(_in2, _in0);                                          \
-    _s1_m   = __lasx_xvilvh_w(_in3, _in1);                                          \
-    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                        \
-    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                        \
-    _s0_m   = __lasx_xvilvl_w(_in6, _in4);                                          \
-    _s1_m   = __lasx_xvilvl_w(_in7, _in5);                                          \
-    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                        \
-    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                        \
-    _s0_m   = __lasx_xvilvh_w(_in6, _in4);                                          \
-    _s1_m   = __lasx_xvilvh_w(_in7, _in5);                                          \
-    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                        \
-    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                        \
-    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                               \
-    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                               \
-    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                               \
-    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                               \
-    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                               \
-    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                               \
-    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                               \
-    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                               \
-}
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
+    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
+    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
+    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
+    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
+    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
+    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
+    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
+    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
+  }
 
 /*
  * =============================================================================
@@ -1472,52 +1532,54 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
  *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
  *                         (input 16x8 byte block)
- *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- *                         (output 8x16 byte block)
- * Details     : The rows of the matrix become columns, and the columns become rows.
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
  * Example     : See LASX_TRANSPOSE16x8_H
  * =============================================================================
  */
-#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
-                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15,   \
-                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{                                                                                    \
-    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                      \
-    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                      \
-                                                                                     \
-    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                           \
-    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                           \
-    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                           \
-    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                           \
-    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                          \
-    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                          \
-    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                         \
-    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                         \
-    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                       \
-    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                       \
-    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                       \
-    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                       \
-    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                       \
-    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                       \
-    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                       \
-    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                       \
-    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                         \
-    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                         \
-    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                         \
-    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                         \
-    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                         \
-    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                         \
-    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                         \
-    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                         \
-    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                       \
-    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                       \
-    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                       \
-    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                       \
-    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                       \
-    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                       \
-    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                       \
-    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                       \
-}
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                  \
+    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
+    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
+    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
+    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
+    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
+    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
+    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
+    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
+    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
+    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
+  }
 
 /*
  * =============================================================================
@@ -1525,9 +1587,10 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
  *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
- *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- *                         (output 8x16 byte block)
- * Details     : The rows of the matrix become columns, and the columns become rows.
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 halfword block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
  * Example     : LASX_TRANSPOSE16x8_H
  *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
  *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
@@ -1556,72 +1619,73 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
  * =============================================================================
  */
-#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
-                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15,   \
-                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-   {                                                                                 \
-    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                      \
-    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                      \
-    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                                  \
-                                                                                     \
-    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                           \
-    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                           \
-    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                           \
-    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                           \
-    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                          \
-    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                          \
-    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                         \
-    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                         \
-    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                         \
-    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                         \
-    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                         \
-    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                         \
-    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                         \
-    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                         \
-    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                         \
-    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                         \
-    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                             \
-    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                             \
-    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                             \
-    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                             \
-    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                             \
-    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                             \
-    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                             \
-    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                             \
-    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                                \
-    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                                \
-    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                                \
-    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                                \
-                                                                                     \
-    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                           \
-    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                           \
-    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                           \
-    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                           \
-    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                          \
-    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                          \
-    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                         \
-    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                         \
-    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                         \
-    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                         \
-    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                         \
-    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                         \
-    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                         \
-    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                         \
-    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                         \
-    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                         \
-    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                             \
-    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                             \
-    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                             \
-    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                             \
-    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                             \
-    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                             \
-    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                             \
-    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                             \
-    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                                \
-    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                                \
-    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                                \
-    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                                \
-}
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+                                                                              \
+    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+  }
 
 /*
  * =============================================================================
@@ -1629,61 +1693,65 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
  *               Outputs - _out0, _out1, _out2, _out3
  *               Return Type - signed halfword
- * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
  * Example     : See LASX_TRANSPOSE8x8_H
  * =============================================================================
  */
-#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)     \
-{                                                                                   \
-    __m256i _s0_m, _s1_m;                                                           \
-                                                                                    \
-    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                            \
-    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                            \
-    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                                          \
-    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                                          \
-    _out1 = __lasx_xvilvh_d(_out0, _out0);                                          \
-    _out3 = __lasx_xvilvh_d(_out2, _out2);                                          \
-}
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _s0_m, _s1_m;                                                \
+                                                                         \
+    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                 \
+    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                 \
+    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
+    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
+    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
+    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
+  }
 
 /*
  * =============================================================================
  * Description : Transpose input 8x8 byte block
  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
  *                         (input 8x8 byte block)
- *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- *                         (output 8x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x8 byte block)
  * Example     : See LASX_TRANSPOSE8x8_H
  * =============================================================================
  */
-#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0,  \
-                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)        \
-{                                                                                   \
-    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                     \
-    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                     \
-    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                          \
-    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                          \
-    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                          \
-    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                          \
-    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                    \
-    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                    \
-    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                    \
-    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                    \
-    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                                      \
-    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                                      \
-    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                                      \
-    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                                      \
-    _out1 = __lasx_xvbsrl_v(_out0, 8);                                              \
-    _out3 = __lasx_xvbsrl_v(_out2, 8);                                              \
-    _out5 = __lasx_xvbsrl_v(_out4, 8);                                              \
-    _out7 = __lasx_xvbsrl_v(_out6, 8);                                              \
-}
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
+    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
+    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
+    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
+    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
+    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
+    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
+    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
+    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
+    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
+    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
+    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
+  }
 
 /*
  * =============================================================================
  * Description : Transpose 8x8 block with halfword elements in vectors.
  * Arguments   : Inputs  - _in0, _in1, ~
  *               Outputs - _out0, _out1, ~
- * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
  * Example     : LASX_TRANSPOSE8x8_H
  *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
  *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
@@ -1704,40 +1772,41 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
  * =============================================================================
  */
-#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0,  \
-                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)        \
-{                                                                                   \
-    __m256i _s0_m, _s1_m;                                                           \
-    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                     \
-    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                     \
-                                                                                    \
-    _s0_m   = __lasx_xvilvl_h(_in6, _in4);                                          \
-    _s1_m   = __lasx_xvilvl_h(_in7, _in5);                                          \
-    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                        \
-    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                        \
-    _s0_m   = __lasx_xvilvh_h(_in6, _in4);                                          \
-    _s1_m   = __lasx_xvilvh_h(_in7, _in5);                                          \
-    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                        \
-    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                        \
-                                                                                    \
-    _s0_m   = __lasx_xvilvl_h(_in2, _in0);                                          \
-    _s1_m   = __lasx_xvilvl_h(_in3, _in1);                                          \
-    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                        \
-    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                        \
-    _s0_m   = __lasx_xvilvh_h(_in2, _in0);                                          \
-    _s1_m   = __lasx_xvilvh_h(_in3, _in1);                                          \
-    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                        \
-    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                        \
-                                                                                    \
-    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                                    \
-    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                                    \
-    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                                    \
-    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                                    \
-    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                                    \
-    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                                    \
-    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                                    \
-    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                                    \
-}
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_h(_in7, _in5);                                     \
+    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_h(_in7, _in5);                                     \
+    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
+    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
+    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
+    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
+    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
+    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
+    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
+    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
+    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
+    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
+  }
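As the in/out example above illustrates, the macro transposes an 8x8 block of halfwords independently in each 128-bit lane. Stripped of the interleave/pick choreography, the per-lane effect is the plain scalar transpose below (a reference model for readers, not code from this patch):

    #include <stdint.h>

    /* Scalar model of LASX_TRANSPOSE8x8_H's per-lane effect:
     * out[j][i] = in[i][j] for 16-bit elements. The macro reaches the
     * same result in three rounds of interleave/pick instructions. */
    static void transpose8x8_h_ref(const int16_t in[8][8], int16_t out[8][8])
    {
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                out[j][i] = in[i][j];
    }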
 
 /*
  * =============================================================================
@@ -1752,34 +1821,34 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  *               _out3 = _in0 - _in3;
  * =============================================================================
  */
-#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
-{                                                                               \
-    _out0 = __lasx_xvadd_b(_in0, _in3);                                         \
-    _out1 = __lasx_xvadd_b(_in1, _in2);                                         \
-    _out2 = __lasx_xvsub_b(_in1, _in2);                                         \
-    _out3 = __lasx_xvsub_b(_in0, _in3);                                         \
-}
-#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
-{                                                                               \
-    _out0 = __lasx_xvadd_h(_in0, _in3);                                         \
-    _out1 = __lasx_xvadd_h(_in1, _in2);                                         \
-    _out2 = __lasx_xvsub_h(_in1, _in2);                                         \
-    _out3 = __lasx_xvsub_h(_in0, _in3);                                         \
-}
-#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
-{                                                                               \
-    _out0 = __lasx_xvadd_w(_in0, _in3);                                         \
-    _out1 = __lasx_xvadd_w(_in1, _in2);                                         \
-    _out2 = __lasx_xvsub_w(_in1, _in2);                                         \
-    _out3 = __lasx_xvsub_w(_in0, _in3);                                         \
-}
-#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
-{                                                                               \
-    _out0 = __lasx_xvadd_d(_in0, _in3);                                         \
-    _out1 = __lasx_xvadd_d(_in1, _in2);                                         \
-    _out2 = __lasx_xvsub_d(_in1, _in2);                                         \
-    _out3 = __lasx_xvsub_d(_in0, _in3);                                         \
-}
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_b(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_b(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_b(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_b(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_h(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_h(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_h(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_h(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_w(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_w(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_w(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_w(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_d(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_d(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_d(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_d(_in0, _in3);                                        \
+  }
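All four variants compute the same element-wise add/subtract butterfly over mirrored input pairs; only the element width (_B/_H/_W/_D) differs. In scalar form, per element (a reference model, not patch code):

    /* Scalar model of LASX_BUTTERFLY_4_*: sums of mirrored pairs first,
     * then the matching differences in reverse order, the usual first
     * stage of DCT/IDCT-style transforms. */
    static void butterfly4_ref(int in0, int in1, int in2, int in3,
                               int *out0, int *out1, int *out2, int *out3)
    {
        *out0 = in0 + in3;
        *out1 = in1 + in2;
        *out2 = in1 - in2;
        *out3 = in0 - in3;
    }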
 
 /*
  * =============================================================================
@@ -1798,59 +1867,63 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  *               _out7 = _in0 - _in7;
  * =============================================================================
  */
-#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                 \
-    _out0 = __lasx_xvadd_b(_in0, _in7);                                           \
-    _out1 = __lasx_xvadd_b(_in1, _in6);                                           \
-    _out2 = __lasx_xvadd_b(_in2, _in5);                                           \
-    _out3 = __lasx_xvadd_b(_in3, _in4);                                           \
-    _out4 = __lasx_xvsub_b(_in3, _in4);                                           \
-    _out5 = __lasx_xvsub_b(_in2, _in5);                                           \
-    _out6 = __lasx_xvsub_b(_in1, _in6);                                           \
-    _out7 = __lasx_xvsub_b(_in0, _in7);                                           \
-}
-
-#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                 \
-    _out0 = __lasx_xvadd_h(_in0, _in7);                                           \
-    _out1 = __lasx_xvadd_h(_in1, _in6);                                           \
-    _out2 = __lasx_xvadd_h(_in2, _in5);                                           \
-    _out3 = __lasx_xvadd_h(_in3, _in4);                                           \
-    _out4 = __lasx_xvsub_h(_in3, _in4);                                           \
-    _out5 = __lasx_xvsub_h(_in2, _in5);                                           \
-    _out6 = __lasx_xvsub_h(_in1, _in6);                                           \
-    _out7 = __lasx_xvsub_h(_in0, _in7);                                           \
-}
-
-#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                 \
-    _out0 = __lasx_xvadd_w(_in0, _in7);                                           \
-    _out1 = __lasx_xvadd_w(_in1, _in6);                                           \
-    _out2 = __lasx_xvadd_w(_in2, _in5);                                           \
-    _out3 = __lasx_xvadd_w(_in3, _in4);                                           \
-    _out4 = __lasx_xvsub_w(_in3, _in4);                                           \
-    _out5 = __lasx_xvsub_w(_in2, _in5);                                           \
-    _out6 = __lasx_xvsub_w(_in1, _in6);                                           \
-    _out7 = __lasx_xvsub_w(_in0, _in7);                                           \
-}
-
-#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
-                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{                                                                                 \
-    _out0 = __lasx_xvadd_d(_in0, _in7);                                           \
-    _out1 = __lasx_xvadd_d(_in1, _in6);                                           \
-    _out2 = __lasx_xvadd_d(_in2, _in5);                                           \
-    _out3 = __lasx_xvadd_d(_in3, _in4);                                           \
-    _out4 = __lasx_xvsub_d(_in3, _in4);                                           \
-    _out5 = __lasx_xvsub_d(_in2, _in5);                                           \
-    _out6 = __lasx_xvsub_d(_in1, _in6);                                           \
-    _out7 = __lasx_xvsub_d(_in0, _in7);                                           \
-}
-
-#endif //LASX
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_b(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_b(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_b(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_b(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_b(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_b(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_b(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_b(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_h(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_h(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_h(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_h(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_h(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_h(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_h(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_h(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_w(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_w(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_w(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_w(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_w(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_w(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_w(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_w(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_d(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_d(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_d(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_d(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_d(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_d(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_d(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_d(_in0, _in7);                                     \
+  }
+
+#endif  // LASX
 
 /*
  * =============================================================================
@@ -1863,15 +1936,13 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
  *               VP:1,2,3,4,
  * =============================================================================
  */
-#define VECT_PRINT(RTYPE, element_num, in0, enter)    \
-{                                                     \
-    RTYPE _tmp0 = (RTYPE)in0;                         \
-    int _i = 0;                                       \
-    if (enter)                                        \
-        printf("\nVP:");                              \
-    for(_i = 0; _i < element_num; _i++)               \
-        printf("%d,",_tmp0[_i]);                      \
-}
+#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
+  {                                                                \
+    RTYPE _tmp0 = (RTYPE)in0;                                      \
+    int _i = 0;                                                    \
+    if (enter) printf("\nVP:");                                    \
+    for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
+  }
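VECT_PRINT is the debugging aid for everything above: it casts a vector to an element-addressable GCC vector type and prints each lane. A usage sketch, assuming the header is on the include path and using an illustrative v8i16 typedef (not defined in this excerpt):

    #include <stdio.h>
    #include <stdint.h>
    #include "loongson_intrinsics.h"

    typedef int16_t v8i16 __attribute__((vector_size(16)));

    /* Dump the 8 halfword lanes of an accumulator, preceded by a fresh
     * "VP:" marker, e.g. while bisecting a filter macro. */
    static void dump_acc(__m128i acc)
    {
        VECT_PRINT(v8i16, 8, acc, 1);
    }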
 
 #endif /* LOONGSON_INTRINSICS_H */
 #endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
-- 
2.20.1



* [FFmpeg-devel] [PATCH v1 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
  2022-02-14  8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
@ 2022-02-14  8:25 ` Hao Chen
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf " Hao Chen
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Hao Chen @ 2022-02-14  8:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lu Wang

From: Lu Wang <wanglu@loongson.cn>

ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before:  94fps
after : 110fps
---
 libavcodec/hevcdsp.c                          |    2 +
 libavcodec/hevcdsp.h                          |    1 +
 libavcodec/loongarch/Makefile                 |    2 +
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   90 +
 libavcodec/loongarch/hevcdsp_lsx.c            | 3299 +++++++++++++++++
 libavcodec/loongarch/hevcdsp_lsx.h            |   88 +
 6 files changed, 3482 insertions(+)
 create mode 100644 libavcodec/loongarch/hevcdsp_init_loongarch.c
 create mode 100644 libavcodec/loongarch/hevcdsp_lsx.c
 create mode 100644 libavcodec/loongarch/hevcdsp_lsx.h

diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index fe272ac1ce..b6c3f00819 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -267,4 +267,6 @@ int i = 0;
         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
     if (ARCH_MIPS)
         ff_hevc_dsp_init_mips(hevcdsp, bit_depth);
+    if (ARCH_LOONGARCH)
+        ff_hevc_dsp_init_loongarch(hevcdsp, bit_depth);
 }
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 0e013a8328..d642ecf9c6 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -134,5 +134,6 @@ void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
 void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
+void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth);
 
 #endif /* AVCODEC_HEVCDSP_H */
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 3c15c2edeb..9f416d0c6e 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -8,6 +8,7 @@ OBJS-$(CONFIG_VC1DSP)                 += loongarch/vc1dsp_init_loongarch.o
 OBJS-$(CONFIG_HPELDSP)                += loongarch/hpeldsp_init_loongarch.o
 OBJS-$(CONFIG_IDCTDSP)                += loongarch/idctdsp_init_loongarch.o
 OBJS-$(CONFIG_VIDEODSP)               += loongarch/videodsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER)           += loongarch/hevcdsp_init_loongarch.o
 LASX-OBJS-$(CONFIG_H264CHROMA)        += loongarch/h264chroma_lasx.o
 LASX-OBJS-$(CONFIG_H264QPEL)          += loongarch/h264qpel_lasx.o
 LASX-OBJS-$(CONFIG_H264DSP)           += loongarch/h264dsp_lasx.o \
@@ -24,3 +25,4 @@ LSX-OBJS-$(CONFIG_VP9_DECODER)        += loongarch/vp9_mc_lsx.o \
                                          loongarch/vp9_intra_lsx.o \
                                          loongarch/vp9_lpf_lsx.o \
                                          loongarch/vp9_idct_lsx.o
+LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
new file mode 100644
index 0000000000..fc0e8fb0df
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "hevcdsp_lsx.h"
+
+void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_lsx;
+            c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_lsx;
+            c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_lsx;
+            c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_lsx;
+            c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_lsx;
+            c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_lsx;
+            c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_lsx;
+            c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_lsx;
+            c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_lsx;
+
+            c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_lsx;
+            c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_lsx;
+            c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_lsx;
+            c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_lsx;
+            c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_lsx;
+            c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_lsx;
+            c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_lsx;
+
+            c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_lsx;
+            c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_lsx;
+            c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_lsx;
+            c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_lsx;
+            c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_lsx;
+            c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_lsx;
+            c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_lsx;
+            c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_lsx;
+
+            c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_lsx;
+            c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_lsx;
+            c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_lsx;
+            c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_lsx;
+            c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_lsx;
+            c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_lsx;
+            c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_lsx;
+            c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_lsx;
+
+            c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_lsx;
+            c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_lsx;
+            c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_lsx;
+            c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_lsx;
+            c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_lsx;
+            c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_lsx;
+            c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_lsx;
+            c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_lsx;
+
+            c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_lsx;
+
+            c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_lsx;
+            c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_lsx;
+            c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_lsx;
+
+            c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_lsx;
+            c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_lsx;
+            c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
+            c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
+            c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+        }
+    }
+}
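For orientation, the table indices follow the hevcdsp convention put_hevc_qpel[size_idx][vertical][horizontal]: the [.][0][0] slots filled first are the plain copies, [.][0][1] the horizontal filters, [.][1][0] the vertical ones and [.][1][1] the 2-D (hv) cases, with size_idx selecting the block-width class (1 = 4 pixels up to 9 = 64, as the assignments above show). A simplified sketch of the caller side elsewhere in the decoder (illustrative, not code from this patch):

    #include "libavcodec/hevcdsp.h"

    /* mx/my are the fractional-pel offsets; a nonzero offset in a
     * dimension selects the interpolating variant for that dimension. */
    static void call_qpel(HEVCDSPContext *c, int16_t *dst,
                          uint8_t *src, ptrdiff_t srcstride, int height,
                          intptr_t mx, intptr_t my, int width, int size_idx)
    {
        c->put_hevc_qpel[size_idx][!!my][!!mx](dst, src, srcstride,
                                               height, mx, my, width);
    }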
diff --git a/libavcodec/loongarch/hevcdsp_lsx.c b/libavcodec/loongarch/hevcdsp_lsx.c
new file mode 100644
index 0000000000..a520f02bd1
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lsx.c
@@ -0,0 +1,3299 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
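The two rows of this table drive the byte shuffles in the horizontal filter paths further down. The 8-width row gathers overlapping neighbour pairs so that __lsx_vdp2_h_bu_b can multiply each pair against two filter taps at once; the 4-width row does the same, except its indices 16..20 reach into the other source register of __lsx_vshuf_b, so two 4-pixel rows are filtered per shuffle. A scalar model of what the first row computes (illustrative only):

    #include <stdint.h>

    /* Model of mask row 1: output byte pair i holds (src[i], src[i+1]),
     * ready for a 2-tap partial dot product. */
    static void gather_pairs_ref(const uint8_t *src, uint8_t out[16])
    {
        for (int i = 0; i < 8; i++) {
            out[2 * i]     = src[i];      /* mask byte 2i   = i     */
            out[2 * i + 1] = src[i + 1];  /* mask byte 2i+1 = i + 1 */
        }
    }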
+
+/* hevc_copy: dst = src << 6 */
+static void hevc_copy_4w_lsx(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t loop_cnt = height >> 3;
+    int32_t res = height & 0x07;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0, in1, in2, in3;
+    for (; loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src4 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src5, src6);
+        src7 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src5, src4, src7, src6,
+                  src0, src1, src2, src3);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0, in1, in2, in3);
+
+        __lsx_vstelm_d(in0, dst, 0, 0);
+        __lsx_vstelm_d(in0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(in1, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(in1, dst + dst_stride_3x, 0, 1);
+        dst += dst_stride_4x;
+        __lsx_vstelm_d(in2, dst, 0, 0);
+        __lsx_vstelm_d(in2, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(in3, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(in3, dst + dst_stride_3x, 0, 1);
+        dst += dst_stride_4x;
+    }
+    for (; res--;) {
+        src0 = __lsx_vld(src, 0);
+        in0 = __lsx_vsllwil_hu_bu(src0, 6);
+        __lsx_vstelm_d(in0, dst, 0, 0);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
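The remaining hevc_copy_*_lsx variants implement the same dst = src << 6 widening for wider blocks, differing only in how many pixels move per vector op and how leftover rows are handled. A scalar reference for the whole family (not part of the patch):

    #include <stdint.h>

    /* Reference for hevc_copy_*: widen 8-bit pixels to 16-bit
     * intermediates with 6 bits of headroom for later rounding. */
    static void hevc_copy_ref(const uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t width, int32_t height)
    {
        for (int32_t y = 0; y < height; y++) {
            for (int32_t x = 0; x < width; x++)
                dst[x] = (int16_t)(src[x] << 6);
            src += src_stride;
            dst += dst_stride;
        }
    }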
+
+static void hevc_copy_6w_lsx(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t loop_cnt = (height >> 3);
+    int32_t res = height & 0x07;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (; loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src4 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+        src7 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0, in1, in2, in3);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+                  in4, in5, in6, in7);
+
+        __lsx_vstelm_d(in0, dst, 0, 0);
+        __lsx_vstelm_w(in0, dst, 8, 2);
+        dst += dst_stride;
+        __lsx_vstelm_d(in1, dst, 0, 0);
+        __lsx_vstelm_w(in1, dst, 8, 2);
+        dst += dst_stride;
+        __lsx_vstelm_d(in2, dst, 0, 0);
+        __lsx_vstelm_w(in2, dst, 8, 2);
+        dst += dst_stride;
+        __lsx_vstelm_d(in3, dst, 0, 0);
+        __lsx_vstelm_w(in3, dst, 8, 2);
+        dst += dst_stride;
+        __lsx_vstelm_d(in4, dst, 0, 0);
+        __lsx_vstelm_w(in4, dst, 8, 2);
+        dst += dst_stride;
+        __lsx_vstelm_d(in5, dst, 0, 0);
+        __lsx_vstelm_w(in5, dst, 8, 2);
+        dst += dst_stride;
+        __lsx_vstelm_d(in6, dst, 0, 0);
+        __lsx_vstelm_w(in6, dst, 8, 2);
+        dst += dst_stride;
+        __lsx_vstelm_d(in7, dst, 0, 0);
+        __lsx_vstelm_w(in7, dst, 8, 2);
+        dst += dst_stride;
+    }
+    for (; res--;) {
+        src0 = __lsx_vld(src, 0);
+        in0 = __lsx_vsllwil_hu_bu(src0, 6);
+        src += src_stride;
+        __lsx_vstelm_d(in0, dst, 0, 0);
+        __lsx_vstelm_w(in0, dst, 8, 2);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_copy_8w_lsx(uint8_t *src, int32_t src_stride,
+                             int16_t *dst, int32_t dst_stride,
+                             int32_t height)
+{
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x  = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride_x << 1);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+    int32_t loop_cnt = height >> 3;
+    int32_t res = height & 0x07;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (; loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src4 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src5, src6);
+        src7 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0, in1, in2, in3);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+                  in4, in5, in6, in7);
+        __lsx_vst(in0, dst, 0);
+        __lsx_vstx(in1, dst, dst_stride_x);
+        __lsx_vstx(in2, dst, dst_stride_2x);
+        __lsx_vstx(in3, dst, dst_stride_3x);
+        dst += dst_stride_2x;
+        __lsx_vst(in4, dst, 0);
+        __lsx_vstx(in5, dst, dst_stride_x);
+        __lsx_vstx(in6, dst, dst_stride_2x);
+        __lsx_vstx(in7, dst, dst_stride_3x);
+        dst += dst_stride_2x;
+    }
+    for (; res--;) {
+        src0 = __lsx_vld(src, 0);
+        in0 = __lsx_vsllwil_hu_bu(src0, 6);
+        __lsx_vst(in0, dst, 0);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void hevc_copy_12w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    uint32_t res = height & 0x07;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x  = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride_x << 1);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+    __m128i zero = __lsx_vldi(0);
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0, in1, in0_r, in1_r, in2_r, in3_r;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src4 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+        src7 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
+        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vstx(in1_r, dst, dst_stride_x);
+        __lsx_vstx(in2_r, dst, dst_stride_2x);
+        __lsx_vstx(in3_r, dst, dst_stride_3x);
+        __lsx_vstelm_d(in0, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(in0, dst, 16, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(in1, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(in1, dst, 16, 1);
+        dst += dst_stride;
+
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP2_ARG2(__lsx_vilvh_w, src5, src4, src7, src6, src0, src1);
+        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vstx(in1_r, dst, dst_stride_x);
+        __lsx_vstx(in2_r, dst, dst_stride_2x);
+        __lsx_vstx(in3_r, dst, dst_stride_3x);
+        __lsx_vstelm_d(in0, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(in0, dst, 16, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(in1, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(in1, dst, 16, 1);
+        dst += dst_stride;
+    }
+    for (; res--;) {
+        src0 = __lsx_vld(src, 0);
+        in0  = __lsx_vsllwil_hu_bu(src0, 6);
+        src1 = __lsx_vilvh_b(zero, src0);
+        in1  = __lsx_vslli_h(src1, 6);
+        __lsx_vst(in0, dst, 0);
+        __lsx_vstelm_d(in1, dst, 16, 0);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+static void hevc_copy_16w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    __m128i zero = __lsx_vldi(0);
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x  = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+    int32_t loop_cnt = height >> 3;
+    int32_t res = height & 0x07;
+    int16_t *dst1 = dst + 8;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+    for (; loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src4 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src5, src6);
+        src7 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+                  in0_l, in1_l, in2_l, in3_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+                  in1_l, in2_l, in3_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vstx(in1_r, dst, dst_stride_x);
+        __lsx_vstx(in2_r, dst, dst_stride_2x);
+        __lsx_vstx(in3_r, dst, dst_stride_3x);
+        __lsx_vst(in0_l, dst1, 0);
+        __lsx_vstx(in1_l, dst1, dst_stride_x);
+        __lsx_vstx(in2_l, dst1, dst_stride_2x);
+        __lsx_vstx(in3_l, dst1, dst_stride_3x);
+        dst += dst_stride_2x;
+        dst1 += dst_stride_2x;
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7,
+                  in0_l, in1_l, in2_l, in3_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+                  in1_l, in2_l, in3_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vstx(in1_r, dst, dst_stride_x);
+        __lsx_vstx(in2_r, dst, dst_stride_2x);
+        __lsx_vstx(in3_r, dst, dst_stride_3x);
+        __lsx_vst(in0_l, dst1, 0);
+        __lsx_vstx(in1_l, dst1, dst_stride_x);
+        __lsx_vstx(in2_l, dst1, dst_stride_2x);
+        __lsx_vstx(in3_l, dst1, dst_stride_3x);
+        dst += dst_stride_2x;
+        dst1 += dst_stride_2x;
+    }
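+    /* tail: any remaining rows (height & 0x07) are handled as one 4-row group */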
+    if (res) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+                  in0_l, in1_l, in2_l, in3_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+                  in1_l, in2_l, in3_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vstx(in1_r, dst, dst_stride_x);
+        __lsx_vstx(in2_r, dst, dst_stride_2x);
+        __lsx_vstx(in3_r, dst, dst_stride_3x);
+        dst += 8;
+        __lsx_vst(in0_l, dst, 0);
+        __lsx_vstx(in1_l, dst, dst_stride_x);
+        __lsx_vstx(in2_l, dst, dst_stride_2x);
+        __lsx_vstx(in3_l, dst, dst_stride_3x);
+    }
+}
+
+static void hevc_copy_24w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+    uint8_t *_src = src + 16;
+    int16_t *dst1 = dst;
+    __m128i zero = __lsx_vldi(0);
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src4 = __lsx_vld(_src, 0);
+        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+                  src5, src6);
+        src7 = __lsx_vldx(_src, src_stride_3x);
+        _src += src_stride_4x;
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+                  src3, in0_l, in1_l, in2_l, in3_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+                  in0_l, in1_l, in2_l, in3_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vstx(in1_r, dst, dst_stride_x);
+        __lsx_vstx(in2_r, dst, dst_stride_2x);
+        __lsx_vstx(in3_r, dst, dst_stride_3x);
+        dst1 = dst + 8;
+        __lsx_vst(in0_l, dst1, 0);
+        __lsx_vstx(in1_l, dst1, dst_stride_x);
+        __lsx_vstx(in2_l, dst1, dst_stride_2x);
+        __lsx_vstx(in3_l, dst1, dst_stride_3x);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        dst1 = dst1 + 8;
+        __lsx_vst(in0_r, dst1, 0);
+        __lsx_vstx(in1_r, dst1, dst_stride_x);
+        __lsx_vstx(in2_r, dst1, dst_stride_2x);
+        __lsx_vstx(in3_r, dst1, dst_stride_3x);
+        dst += dst_stride_2x;
+    }
+}
+
+static void hevc_copy_32w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    uint8_t *_src = src + 16;
+    __m128i zero = {0};
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src2, src4);
+        src6 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src1 = __lsx_vld(_src, 0);
+        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+                  src3, src5);
+        src7 = __lsx_vldx(_src, src_stride_3x);
+        _src += src_stride_4x;
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+                  src3, in0_l, in1_l, in2_l, in3_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+                  in0_l, in1_l, in2_l, in3_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vst(in0_l, dst, 16);
+        __lsx_vst(in1_r, dst, 32);
+        __lsx_vst(in1_l, dst, 48);
+        dst += dst_stride;
+        __lsx_vst(in2_r, dst, 0);
+        __lsx_vst(in2_l, dst, 16);
+        __lsx_vst(in3_r, dst, 32);
+        __lsx_vst(in3_l, dst, 48);
+        dst += dst_stride;
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7,
+                  in0_l, in1_l, in2_l, in3_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+                  in1_l, in2_l, in3_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vst(in0_l, dst, 16);
+        __lsx_vst(in1_r, dst, 32);
+        __lsx_vst(in1_l, dst, 48);
+        dst += dst_stride;
+        __lsx_vst(in2_r, dst, 0);
+        __lsx_vst(in2_l, dst, 16);
+        __lsx_vst(in3_r, dst, 32);
+        __lsx_vst(in3_l, dst, 48);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_copy_48w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i zero = {0};
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i src8, src9, src10, src11;
+    __m128i in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
+    __m128i in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+        src2 = __lsx_vld(src, 32);
+        src += src_stride;
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src3, src4);
+        src5 = __lsx_vld(src, 32);
+        src += src_stride;
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src6, src7);
+        src8 = __lsx_vld(src, 32);
+        src += src_stride;
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src9, src10);
+        src11 = __lsx_vld(src, 32);
+        src += src_stride;
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+                  src3, in0_l, in1_l, in2_l, in3_l);
+        DUP2_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, in4_l, in5_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+                  in1_l, in2_l, in3_l);
+        DUP2_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, in4_r, in5_r);
+        DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vst(in0_l, dst, 16);
+        __lsx_vst(in1_r, dst, 32);
+        __lsx_vst(in1_l, dst, 48);
+        __lsx_vst(in2_r, dst, 64);
+        __lsx_vst(in2_l, dst, 80);
+        dst += dst_stride;
+        __lsx_vst(in3_r, dst, 0);
+        __lsx_vst(in3_l, dst, 16);
+        __lsx_vst(in4_r, dst, 32);
+        __lsx_vst(in4_l, dst, 48);
+        __lsx_vst(in5_r, dst, 64);
+        __lsx_vst(in5_l, dst, 80);
+        dst += dst_stride;
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src6, zero, src7, zero, src8, zero, src9,
+                  in0_l, in1_l, in2_l, in3_l);
+        DUP2_ARG2(__lsx_vilvh_b, zero, src10, zero, src11, in4_l, in5_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src6, 6, src7, 6, src8, 6, src9, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+                  in1_l, in2_l, in3_l);
+        DUP2_ARG2(__lsx_vsllwil_hu_bu, src10, 6, src11, 6, in4_r, in5_r);
+        DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vst(in0_l, dst, 16);
+        __lsx_vst(in1_r, dst, 32);
+        __lsx_vst(in1_l, dst, 48);
+        __lsx_vst(in2_r, dst, 64);
+        __lsx_vst(in2_l, dst, 80);
+        dst += dst_stride;
+        __lsx_vst(in3_r, dst, 0);
+        __lsx_vst(in3_l, dst, 16);
+        __lsx_vst(in4_r, dst, 32);
+        __lsx_vst(in4_l, dst, 48);
+        __lsx_vst(in5_r, dst, 64);
+        __lsx_vst(in5_l, dst, 80);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_copy_64w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i zero = {0};
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+                  src0, src1, src2, src3);
+        src += src_stride;
+        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+                  src4, src5, src6, src7);
+        src += src_stride;
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+                  src3, in0_l, in1_l, in2_l, in3_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+                  in0_l, in1_l, in2_l, in3_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vst(in0_l, dst, 16);
+        __lsx_vst(in1_r, dst, 32);
+        __lsx_vst(in1_l, dst, 48);
+        __lsx_vst(in2_r, dst, 64);
+        __lsx_vst(in2_l, dst, 80);
+        __lsx_vst(in3_r, dst, 96);
+        __lsx_vst(in3_l, dst, 112);
+        dst += dst_stride;
+
+        DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero,
+                  src7, in0_l, in1_l, in2_l, in3_l);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+                  in0_r, in1_r, in2_r, in3_r);
+        DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+                  in1_l, in2_l, in3_l);
+        __lsx_vst(in0_r, dst, 0);
+        __lsx_vst(in0_l, dst, 16);
+        __lsx_vst(in1_r, dst, 32);
+        __lsx_vst(in1_l, dst, 48);
+        __lsx_vst(in2_r, dst, 64);
+        __lsx_vst(in2_l, dst, 80);
+        __lsx_vst(in3_r, dst, 96);
+        __lsx_vst(in3_l, dst, 112);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt = height >> 3;
+    uint32_t res = (height & 0x7) >> 1;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3;
+    __m128i vec0, vec1, vec2, vec3;
+    __m128i dst0, dst1, dst2, dst3;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 16);
+
+    src -= 3;
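+    /* each output sample at x is an 8-tap sum over src[x - 3]..src[x + 4] */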
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (; loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src4 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+        src7 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1,
+                  src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3);
+        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+                  dst0, dst0);
+        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask0, src3, src2, mask1, src3,
+                  src2, mask2, src3, src2, mask3, vec0, vec1, vec2, vec3);
+        dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+                  dst1, dst1);
+        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src5, src4, mask0, src5, src4, mask1, src5,
+                  src4, mask2, src5, src4, mask3, vec0, vec1, vec2, vec3);
+        dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+                  dst2, dst2);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src7, src6, mask0, src7, src6, mask1, src7,
+                  src6, mask2, src7, src6, mask3, vec0, vec1, vec2, vec3);
+        dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+                  dst3, dst3);
+        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+        __lsx_vstelm_d(dst0, dst, 0, 0);
+        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(dst1, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(dst1, dst + dst_stride_3x, 0, 1);
+        dst += dst_stride_4x;
+        __lsx_vstelm_d(dst2, dst, 0, 0);
+        __lsx_vstelm_d(dst2, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(dst3, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(dst3, dst + dst_stride_3x, 0, 1);
+        dst += dst_stride_4x;
+    }
+    for (; res--;) {
+        src0 = __lsx_vld(src, 0);
+        src1 = __lsx_vldx(src, src_stride);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1,
+                  src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3);
+        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+                  dst0, dst0);
+        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+        __lsx_vstelm_d(dst0, dst, 0, 0);
+        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+        src += src_stride_2x;
+        dst += dst_stride_2x;
+    }
+}
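To summarize the pattern shared by all the horizontal variants that follow: src -= 3 aligns the load window on the first tap, the four masks split the 8-tap window into overlapping byte pairs, and one vdp2 plus three vdp2add steps accumulate them. Per output sample this is equivalent to the scalar loop below (a model assuming src already points at the first tap; the stored value is the unrounded 16-bit intermediate):

    #include <stdint.h>

    /* Scalar model of one 8-tap horizontal output, decomposed into the
     * four 2-tap partial products that mask0..mask3 and filt0..filt3
     * express in the LSX code. */
    static int16_t qpel_h_ref(const uint8_t *src, const int8_t *filter)
    {
        int sum = 0;
        for (int pair = 0; pair < 4; pair++)
            sum += src[2 * pair]     * filter[2 * pair] +
                   src[2 * pair + 1] * filter[2 * pair + 1];
        return (int16_t)sum;
    }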
+
+static void hevc_hz_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x  = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+    __m128i src0, src1, src2, src3;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3;
+    __m128i vec0, vec1, vec2, vec3;
+    __m128i dst0, dst1, dst2, dst3;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src -= 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+                  dst0, dst0);
+        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+                  src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+        dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+                  dst1, dst1);
+        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+                  src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+        dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+                  dst2, dst2);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+                  src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
+        dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+                  dst3, dst3);
+        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vstx(dst1, dst, dst_stride_x);
+        __lsx_vstx(dst2, dst, dst_stride_2x);
+        __lsx_vstx(dst3, dst, dst_stride_3x);
+        dst += dst_stride_2x;
+    }
+}
+
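+/* Horizontal 8-tap filter for 12-pixel rows, split 8 + 4: mask0..mask3
+ * filter the left eight columns of src0..src3, while a second pointer
+ * (_src = src + 8) with mask4..mask7 packs the right four columns of two
+ * rows per vector (dst4/dst5). The loop count is fixed at 4 (16 rows),
+ * which assumes the 12x16 block size HEVC produces for 12-wide blocks. */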
+static void hevc_hz_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    uint8_t *_src;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+    __m128i filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
+
+    src -= 3;
+    _src = src + 8;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+    mask4 = __lsx_vld(ff_hevc_mask_arr, 16);
+    DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
+    mask7 = __lsx_vaddi_bu(mask4, 6);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src4 = __lsx_vld(_src, 0);
+        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+                  src5, src6);
+        src7 = __lsx_vldx(_src, src_stride_3x);
+        src += src_stride_4x;
+        _src += src_stride_4x;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask4, src7, src6, mask4,
+                  vec4, vec5);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask5, src7, src6, mask5,
+                  vec4, vec5);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+                  dst4, dst5);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask6, src7, src6, mask6,
+                  vec4, vec5);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+                  dst4, dst5);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask7, src7, src6, mask7,
+                  vec4, vec5);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+                  dst4, dst5);
+
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vstelm_d(dst4, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vst(dst1, dst, 0);
+        __lsx_vstelm_d(dst4, dst, 16, 1);
+        dst += dst_stride;
+        __lsx_vst(dst2, dst, 0);
+        __lsx_vstelm_d(dst5, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vst(dst3, dst, 0);
+        __lsx_vstelm_d(dst5, dst, 16, 1);
+        dst += dst_stride;
+    }
+}
+
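+/* Horizontal 8-tap filter, 16-pixel rows: two overlapping 16-byte loads
+ * per row (offsets 0 and 8), two rows per iteration. */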
+static void hevc_hz_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i src0, src1, src2, src3;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3;
+    __m128i vec0, vec1, vec2, vec3;
+    __m128i dst0, dst1, dst2, dst3;
+    __m128i mask0;
+
+    src -= 3;
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+        src += src_stride;
+        DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+        src += src_stride;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+                  vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vst(dst1, dst, 16);
+        dst += dst_stride;
+        __lsx_vst(dst2, dst, 0);
+        __lsx_vst(dst3, dst, 16);
+        dst += dst_stride;
+    }
+}
+
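+/* Horizontal 8-tap filter, 24-pixel rows: loads at offsets 0 and 16; the
+ * straddling shuffles with mask4..mask7 produce outputs 8..15 from the
+ * concatenation of the two source vectors. */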
+static void hevc_hz_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i src0, src1, src2, src3;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src -= 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+              mask2, mask3, mask4);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+    mask7 = __lsx_vaddi_bu(mask0, 14);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+        src += src_stride;
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src2, src3);
+        src += src_stride;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
+                  src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src3, mask0,
+                  vec4, vec5);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1,
+                  src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask5, src3, src3, mask1,
+                  vec4, vec5);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+                  dst4, dst5);
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1,
+                  src1, mask2, src2, src2, mask2, vec0, vec1, vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask6, src3, src3, mask2,
+                  vec4, vec5);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+                  dst4, dst5);
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1,
+                  src1, mask3, src2, src2, mask3, vec0, vec1, vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask7, src3, src3, mask3,
+                  vec4, vec5);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+                  dst4, dst5);
+
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vst(dst1, dst, 16);
+        __lsx_vst(dst2, dst, 32);
+        dst += dst_stride;
+        __lsx_vst(dst3, dst, 0);
+        __lsx_vst(dst4, dst, 16);
+        __lsx_vst(dst5, dst, 32);
+        dst += dst_stride;
+    }
+}
+
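+/* Horizontal 8-tap filter, 32-pixel rows: loads at offsets 0, 16 and 24;
+ * outputs 8..15 straddle src0/src1, the final eight come from src2. */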
+static void hevc_hz_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i src0, src1, src2;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    __m128i vec0, vec1, vec2, vec3;
+    __m128i dst0, dst1, dst2, dst3;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src -= 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8,
+              mask1, mask2, mask3, mask4);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+    mask7 = __lsx_vaddi_bu(mask0, 14);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+        src2 = __lsx_vld(src, 24);
+        src += src_stride;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+                  dst0, dst0);
+        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+
+        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1,
+                  src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3);
+        dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+                  dst1, dst1);
+        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1,
+                  mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+        dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+                  dst2, dst2);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2,
+                  mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+        dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+                  dst3, dst3);
+        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vst(dst1, dst, 16);
+        __lsx_vst(dst2, dst, 32);
+        __lsx_vst(dst3, dst, 48);
+        dst += dst_stride;
+    }
+}
+
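+/* Horizontal 8-tap filter, 48-pixel rows: loads at offsets 0, 16, 32 and
+ * 40; the first 32 outputs mix per-vector and straddling shuffles, while
+ * the last 16 (dst4/dst5) come from src2 and src3 alone. */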
+static void hevc_hz_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i src0, src1, src2, src3;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src -= 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+              mask2, mask3, mask4);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+    mask7 = __lsx_vaddi_bu(mask0, 14);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+        src2 = __lsx_vld(src, 32);
+        src3 = __lsx_vld(src, 40);
+        src += src_stride;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1, src1,
+                  mask0, src2, src1, mask4, vec0, vec1, vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1,
+                  src1, mask1, src2, src1, mask5, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1,
+                  src1, mask2, src2, src1, mask6, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1,
+                  src1, mask3, src2, src1, mask7, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vst(dst1, dst, 16);
+        __lsx_vst(dst2, dst, 32);
+        __lsx_vst(dst3, dst, 48);
+
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+                  vec4, vec5);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+                  dst4, dst5);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+                  dst4, dst5);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+                  dst4, dst5);
+        __lsx_vst(dst4, dst, 64);
+        __lsx_vst(dst5, dst, 80);
+        dst += dst_stride;
+    }
+}
+
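+/* Horizontal 8-tap filter, 64-pixel rows: loads at offsets 0, 16, 32, 48
+ * and 56, producing eight 16-bit output vectors per row. */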
+static void hevc_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i src0, src1, src2, src3, src4;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    __m128i vec0, vec1, vec2, vec3;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src -= 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+              mask2, mask3, mask4);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+    mask7 = __lsx_vaddi_bu(mask0, 14);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+                  src0, src1, src2, src3);
+        src4 = __lsx_vld(src, 56);
+        src += src_stride;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+                  dst0, dst0);
+        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+        __lsx_vst(dst0, dst, 0);
+
+        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1,
+                  src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3);
+        dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+                  dst1, dst1);
+        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+        __lsx_vst(dst1, dst, 16);
+
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+                  src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+        dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+                  dst2, dst2);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+        __lsx_vst(dst2, dst, 32);
+
+        DUP4_ARG3(__lsx_vshuf_b, src2, src1, mask4, src2, src1, mask5, src2,
+                  src1, mask6, src2, src1, mask7, vec0, vec1, vec2, vec3);
+        dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+                  dst3, dst3);
+        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+        __lsx_vst(dst3, dst, 48);
+
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+                  src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+        dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+                  dst4, dst4);
+        dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+        __lsx_vst(dst4, dst, 64);
+
+        DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src2, mask5, src3,
+                  src2, mask6, src3, src2, mask7, vec0, vec1, vec2, vec3);
+        dst5 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec1, filt1, dst5, vec2, filt2,
+                  dst5, dst5);
+        dst5 = __lsx_vdp2add_h_bu_b(dst5, vec3, filt3);
+        __lsx_vst(dst5, dst, 80);
+
+        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+                  src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
+        dst6 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec1, filt1, dst6, vec2, filt2,
+                  dst6, dst6);
+        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec3, filt3);
+        __lsx_vst(dst6, dst, 96);
+
+        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+        dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
+                  dst7, dst7);
+        dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+        __lsx_vst(dst7, dst, 112);
+        dst += dst_stride;
+    }
+}
+
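+/* Vertical 8-tap filter, 4-pixel columns: byte rows are interleaved with
+ * vilvl_b and packed two rows per vector with vilvl_d, so each dot
+ * product yields two output rows; eight rows per iteration plus a
+ * two-row tail loop. */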
+static void hevc_vt_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t res = (height & 0x07) >> 1;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i src9, src10, src11, src12, src13, src14;
+    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+    __m128i src1110_r, src1211_r, src1312_r, src1413_r;
+    __m128i src2110, src4332, src6554, src8776, src10998;
+    __m128i src12111110, src14131312;
+    __m128i dst10, dst32, dst54, dst76;
+    __m128i filt0, filt1, filt2, filt3;
+
+    src -= src_stride_3x;
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src3 = __lsx_vldx(src, src_stride_3x);
+    src += src_stride_4x;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+    src += src_stride_3x;
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              src10_r, src32_r, src54_r, src21_r);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+    DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r,
+              src2110, src4332);
+    src6554 = __lsx_vilvl_d(src65_r, src54_r);
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        src7 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+        src10 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src11 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src12, src13);
+        src14 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                  src76_r, src87_r, src98_r, src109_r);
+        DUP4_ARG2(__lsx_vilvl_b, src11, src10, src12, src11, src13, src12, src14,
+                  src13, src1110_r, src1211_r, src1312_r, src1413_r);
+        DUP4_ARG2(__lsx_vilvl_d, src87_r, src76_r, src109_r, src98_r, src1211_r,
+                  src1110_r, src1413_r, src1312_r, src8776, src10998,
+                  src12111110, src14131312);
+
+        dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
+                  filt2, dst10, dst10);
+        dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
+        dst32 = __lsx_vdp2_h_bu_b(src4332, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst32, src6554, filt1, dst32, src8776,
+                  filt2, dst32, dst32);
+        dst32 = __lsx_vdp2add_h_bu_b(dst32, src10998, filt3);
+        dst54 = __lsx_vdp2_h_bu_b(src6554, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst54, src8776, filt1,
+                  dst54, src10998, filt2, dst54, dst54);
+        dst54 = __lsx_vdp2add_h_bu_b(dst54, src12111110, filt3);
+        dst76 = __lsx_vdp2_h_bu_b(src8776, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst76, src10998, filt1, dst76,
+                  src12111110, filt2, dst76, dst76);
+        dst76 = __lsx_vdp2add_h_bu_b(dst76, src14131312, filt3);
+
+        __lsx_vstelm_d(dst10, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst10, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst32, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst32, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst54, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst54, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst76, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst76, dst, 0, 1);
+        dst += dst_stride;
+
+        src2110 = src10998;
+        src4332 = src12111110;
+        src6554 = src14131312;
+        src6 = src14;
+    }
+    for (; res--;) {
+        src7 = __lsx_vld(src, 0);
+        src8 = __lsx_vldx(src, src_stride);
+        DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+        src += src_stride_2x;
+        src8776 = __lsx_vilvl_d(src87_r, src76_r);
+
+        dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
+                  filt2, dst10, dst10);
+        dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
+
+        __lsx_vstelm_d(dst10, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst10, dst, 0, 1);
+        dst += dst_stride;
+
+        src2110 = src4332;
+        src4332 = src6554;
+        src6554 = src8776;
+        src6 = src8;
+    }
+}
+
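+/* Vertical 8-tap filter, 8-pixel columns, four rows per iteration; the
+ * interleaved row history slides down by four rows at the loop tail. */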
+static void hevc_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x  = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
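+    /* As in the horizontal 8w case, the dst strides are byte offsets for
+     * __lsx_vstx, and dst += dst_stride_2x advances four int16_t rows. */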
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i filt0, filt1, filt2, filt3;
+
+    src -= src_stride_3x;
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src3 = __lsx_vldx(src, src_stride_3x);
+    src += src_stride_4x;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+    src += src_stride_3x;
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              src10_r, src32_r, src54_r, src21_r);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src7 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+        src10 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+                  src9, src76_r, src87_r, src98_r, src109_r);
+
+        dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+                  src54_r, filt2, dst0_r, dst0_r);
+        dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+        dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+                  src65_r, filt2, dst1_r, dst1_r);
+        dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+        dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+                  src76_r, filt2, dst2_r, dst2_r);
+        dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+        dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+                  src87_r, filt2, dst3_r, dst3_r);
+        dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vstx(dst1_r, dst, dst_stride_x);
+        __lsx_vstx(dst2_r, dst, dst_stride_2x);
+        __lsx_vstx(dst3_r, dst, dst_stride_3x);
+        dst += dst_stride_2x;
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
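+/* Vertical 8-tap filter, 12-pixel columns, split 8 + 4: the low byte
+ * interleaves feed the left eight columns, the high interleaves are
+ * packed two rows per vector for the right four (stored with vstelm_d at
+ * byte offset 16). */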
+static void hevc_vt_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
+    __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
+    __m128i src2110, src4332, src6554, src8776, src10998;
+    __m128i dst0_l, dst1_l;
+    __m128i filt0, filt1, filt2, filt3;
+
+    src -= src_stride_3x;
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src3 = __lsx_vldx(src, src_stride_3x);
+    src += src_stride_4x;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+    src += src_stride_3x;
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              src10_r, src32_r, src54_r, src21_r);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              src10_l, src32_l, src54_l, src21_l);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+    DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l,
+              src2110, src4332);
+    src6554 = __lsx_vilvl_d(src65_l, src54_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src7 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+        src10 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+                  src9, src76_r, src87_r, src98_r, src109_r);
+        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
+                  src9, src76_l, src87_l, src98_l, src109_l);
+        DUP2_ARG2(__lsx_vilvl_d, src87_l, src76_l, src109_l, src98_l,
+                  src8776, src10998);
+
+        dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+                  src54_r, filt2, dst0_r, dst0_r);
+        dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+        dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+                  src65_r, filt2, dst1_r, dst1_r);
+        dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+        dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+                  src76_r, filt2, dst2_r, dst2_r);
+        dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+        dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+                  src87_r, filt2, dst3_r, dst3_r);
+        dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+        dst0_l = __lsx_vdp2_h_bu_b(src2110, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src4332, filt1, dst0_l,
+                  src6554, filt2, dst0_l, dst0_l);
+        dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src8776, filt3);
+        dst1_l = __lsx_vdp2_h_bu_b(src4332, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src6554, filt1, dst1_l,
+                  src8776, filt2, dst1_l, dst1_l);
+        dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src10998, filt3);
+
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vstelm_d(dst0_l, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vst(dst1_r, dst, 0);
+        __lsx_vstelm_d(dst0_l, dst, 16, 1);
+        dst += dst_stride;
+        __lsx_vst(dst2_r, dst, 0);
+        __lsx_vstelm_d(dst1_l, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vst(dst3_r, dst, 0);
+        __lsx_vstelm_d(dst1_l, dst, 16, 1);
+        dst += dst_stride;
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src2110 = src6554;
+        src4332 = src8776;
+        src6554 = src10998;
+        src6 = src10;
+    }
+}
+
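+/* Generic vertical 8-tap kernel for widths that are multiples of 16: the
+ * outer loop steps 16 columns at a time, the inner loop filters four rows
+ * from the low and high byte interleaves. */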
+static void hevc_vt_8t_16multx4mult_lsx(uint8_t *src,
+                                        int32_t src_stride,
+                                        int16_t *dst,
+                                        int32_t dst_stride,
+                                        const int8_t *filter,
+                                        int32_t height,
+                                        int32_t width)
+{
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    int32_t loop_cnt, cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
+    __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
+    __m128i dst0_l, dst1_l, dst2_l, dst3_l;
+    __m128i filt0, filt1, filt2, filt3;
+
+    src -= src_stride_3x;
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    for (cnt = width >> 4; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        src0 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src_tmp, src_stride_3x);
+        src_tmp += src_stride_4x;
+        src4 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src5, src6);
+        src_tmp += src_stride_3x;
+        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+                  src10_r, src32_r, src54_r, src21_r);
+        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+                  src10_l, src32_l, src54_l, src21_l);
+        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            src7 = __lsx_vld(src_tmp, 0);
+            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                      src8, src9);
+            src10 = __lsx_vldx(src_tmp, src_stride_3x);
+            src_tmp += src_stride_4x;
+            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
+                      src10, src9, src76_r, src87_r, src98_r, src109_r);
+            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
+                      src10, src9, src76_l, src87_l, src98_l, src109_l);
+
+            dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+                      src54_r, filt2, dst0_r, dst0_r);
+            dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+            dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+                      src65_r, filt2, dst1_r, dst1_r);
+            dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+            dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+                      src76_r, filt2, dst2_r, dst2_r);
+            dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+            dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+                      src87_r, filt2, dst3_r, dst3_r);
+            dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+            dst0_l = __lsx_vdp2_h_bu_b(src10_l, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src32_l, filt1, dst0_l,
+                      src54_l, filt2, dst0_l, dst0_l);
+            dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src76_l, filt3);
+            dst1_l = __lsx_vdp2_h_bu_b(src21_l, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src43_l, filt1, dst1_l,
+                      src65_l, filt2, dst1_l, dst1_l);
+            dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src87_l, filt3);
+            dst2_l = __lsx_vdp2_h_bu_b(src32_l, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_l, src54_l, filt1, dst2_l,
+                      src76_l, filt2, dst2_l, dst2_l);
+            dst2_l = __lsx_vdp2add_h_bu_b(dst2_l, src98_l, filt3);
+            dst3_l = __lsx_vdp2_h_bu_b(src43_l, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_l, src65_l, filt1, dst3_l,
+                      src87_l, filt2, dst3_l, dst3_l);
+            dst3_l = __lsx_vdp2add_h_bu_b(dst3_l, src109_l, filt3);
+
+            __lsx_vst(dst0_r, dst_tmp, 0);
+            __lsx_vst(dst0_l, dst_tmp, 16);
+            dst_tmp += dst_stride;
+            __lsx_vst(dst1_r, dst_tmp, 0);
+            __lsx_vst(dst1_l, dst_tmp, 16);
+            dst_tmp += dst_stride;
+            __lsx_vst(dst2_r, dst_tmp, 0);
+            __lsx_vst(dst2_l, dst_tmp, 16);
+            dst_tmp += dst_stride;
+            __lsx_vst(dst3_r, dst_tmp, 0);
+            __lsx_vst(dst3_l, dst_tmp, 16);
+            dst_tmp += dst_stride;
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+        src += 16;
+        dst += 16;
+    }
+}
+
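+/* The 16/24/32/48/64-wide vertical filters reuse the generic 16-column
+ * kernel; the 24-wide case adds an 8-wide pass for the leftover columns. */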
+static void hevc_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                filter, height, 16);
+}
+
+static void hevc_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                filter, height, 16);
+    hevc_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride,
+                      filter, height);
+}
+
+static void hevc_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                filter, height, 32);
+}
+
+static void hevc_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                filter, height, 48);
+}
+
+static void hevc_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                filter, height, 64);
+}
+
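+/* Separable 8-tap 2D (hv) filter, 4-pixel columns: the horizontal pass
+ * packs two rows per vector into 16-bit data, then the vertical pass
+ * accumulates 32-bit dot products over interleaved halfword pairs and
+ * shifts the result right by 6. */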
+static void hevc_hv_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+    __m128i mask1, mask2, mask3;
+    __m128i filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+    __m128i dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+    __m128i mask0;
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 16);
+
+    src -= src_stride_3x + 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+              filter_x, 6, filt0, filt1, filt2, filt3);
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src3 = __lsx_vldx(src, src_stride_3x);
+    src += src_stride_4x;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+    src += src_stride_3x;
+
+    DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask0, src3, src0, mask1, src3, src0,
+              mask2, src3, src0, mask3, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask0, src4, src1, mask1, src4, src1,
+              mask2, src4, src1, mask3, vec4, vec5, vec6, vec7);
+    DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask0, src5, src2, mask1, src5, src2,
+              mask2, src5, src2, mask3, vec8, vec9, vec10, vec11);
+    DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask0, src6, src3, mask1, src6, src3,
+              mask2, src6, src3, mask3, vec12, vec13, vec14, vec15);
+    dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
+              dst30, dst30);
+    dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
+    dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
+              dst41, dst41);
+    dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
+    dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
+              dst52, dst52);
+    dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
+    dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
+              dst63, dst63);
+    dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
+
+    DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
+    DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
+    dst32_r = __lsx_vilvl_h(dst63, dst52);
+    dst65_r = __lsx_vilvh_h(dst63, dst52);
+    dst66 = __lsx_vreplvei_d(dst63, 1);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        src7 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+        src10 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask0, src9, src7, mask1, src9, src7,
+                  mask2, src9, src7, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask0, src10, src8, mask1, src10, src8,
+                  mask2, src10, src8, mask3, vec4, vec5, vec6, vec7);
+
+        dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
+                  dst97, dst97);
+        dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
+        dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
+                  filt2, dst108, dst108);
+        dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
+        dst109_r = __lsx_vilvh_h(dst108, dst97);
+        dst66 = __lsx_vreplvei_d(dst97, 1);
+        dst98_r = __lsx_vilvl_h(dst66, dst108);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+                  filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
+                  filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
+                  filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
+                  filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
+        __lsx_vstelm_d(dst0_r, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst0_r, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst2_r, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst2_r, dst, 0, 1);
+        dst += dst_stride;
+
+        dst10_r = dst54_r;
+        dst32_r = dst76_r;
+        dst54_r = dst98_r;
+        dst21_r = dst65_r;
+        dst43_r = dst87_r;
+        dst65_r = dst109_r;
+        dst66 = __lsx_vreplvei_d(dst108, 1);
+    }
+}
+
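+/* Generic 8-tap 2D kernel for widths that are multiples of 8: a
+ * seven-row horizontal prologue fills dst0..dst6, then each iteration
+ * filters one new row horizontally and combines eight rows vertically
+ * with 32-bit accumulation, a >>6 shift and a pack back to 16 bits. */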
+static void hevc_hv_8t_8multx1mult_lsx(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height,
+                                       int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+    __m128i mask1, mask2, mask3;
+    __m128i filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    __m128i dst0_r, dst0_l;
+    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+    __m128i mask0 = {0x403030202010100, 0x807070606050504};
+
+    src -= src_stride_3x + 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+              filter_x, 6, filt0, filt1, filt2, filt3);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+        src0 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src_tmp, src_stride_3x);
+        src_tmp += src_stride_4x;
+        src4 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src5, src6);
+        src_tmp += src_stride_3x;
+
+        /* horizontal pass: rows 0-3 */
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+        dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+                  dst0, dst0);
+        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+        dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
+                  dst1, dst1);
+        dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
+        dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
+                  dst2, dst2);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
+        dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
+                  dst3, dst3);
+        dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
+
+        /* horizontal pass: rows 4-6 */
+        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+        dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+                  dst4, dst4);
+        dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+        dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
+                  dst5, dst5);
+        dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
+        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
+                  dst6, dst6);
+        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+        for (loop_cnt = height; loop_cnt--;) {
+            src7 = __lsx_vld(src_tmp, 0);
+            src_tmp += src_stride;
+
+            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+                      filt2, dst7, dst7);
+            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+
+            DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+                      dst6, dst10_r, dst32_r, dst54_r, dst76_r);
+            DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+                      dst6, dst10_l, dst32_l, dst54_l, dst76_l);
+
+            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+                      dst0_r, dst0_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+                      dst76_l, filt_h3, dst0_r, dst0_l);
+            dst0_r = __lsx_vsrai_w(dst0_r, 6);
+            dst0_l = __lsx_vsrai_w(dst0_l, 6);
+
+            dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
+            __lsx_vst(dst0_r, dst_tmp, 0);
+            dst_tmp += dst_stride;
+
+            dst0 = dst1;
+            dst1 = dst2;
+            dst2 = dst3;
+            dst3 = dst4;
+            dst4 = dst5;
+            dst5 = dst6;
+            dst6 = dst7;
+        }
+        src += 8;
+        dst += 8;
+    }
+}
+
+static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+                              int16_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 8);
+}
+
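+/* 8-tap 2D filter for 12-pixel columns, split 8 + 4: the left eight
+ * columns follow the 8-wide scheme above, while the right four reuse the
+ * two-rows-per-vector layout of the 4-wide filter. */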
+static void hevc_hv_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    __m128i filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
+    __m128i filter_vec;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+    __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
+    __m128i dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
+    __m128i dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
+
+    src -= src_stride_3x + 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+              filter_x, 6, filt0, filt1, filt2, filt3);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+              src1, src2);
+    src3 = __lsx_vldx(src_tmp, src_stride_3x);
+    src_tmp += src_stride_4x;
+    src4 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+              src5, src6);
+    src_tmp += src_stride_3x;
+
+    /* row 0 row 1 row 2 row 3 */
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, src0,
+              mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1,
+              mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2,
+              mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+    DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, src3,
+              mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+    dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+              dst0, dst0);
+    dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+    dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
+              dst1, dst1);
+    dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
+    dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
+              dst2, dst2);
+    dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
+    dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
+              dst3, dst3);
+    dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
+
+    /* row 4 row 5 row 6 */
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, src4,
+              mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, src5,
+              mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+    DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, src6,
+              mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+    dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+              dst4, dst4);
+    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+    dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
+              dst5, dst5);
+    dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
+    dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
+              dst6, dst6);
+    dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        src7 = __lsx_vld(src_tmp, 0);
+        src_tmp += src_stride;
+
+        DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+                  src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+        dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
+                  dst7, dst7);
+        dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+        DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                  dst10_r, dst32_r, dst54_r, dst76_r);
+        DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+                  dst10_l, dst32_l, dst54_l, dst76_l);
+        DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+                  dst0_r, dst0_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+                  filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, dst54_l, filt_h2,
+                  dst0_r, dst0_l, dst0_r, dst0_l);
+        DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, dst76_l,
+                  filt_h3, dst0_r, dst0_l);
+        dst0_r = __lsx_vsrai_w(dst0_r, 6);
+        dst0_l = __lsx_vsrai_w(dst0_l, 6);
+
+        dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
+        __lsx_vst(dst0_r, dst_tmp, 0);
+        dst_tmp += dst_stride;
+
+        dst0 = dst1;
+        dst1 = dst2;
+        dst2 = dst3;
+        dst3 = dst4;
+        dst4 = dst5;
+        dst5 = dst6;
+        dst6 = dst7;
+    }
+    src += 8;
+    dst += 8;
+
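+    /* remaining 4 columns on the right, filtered 4 output rows per iteration */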
+    mask4 = __lsx_vld(ff_hevc_mask_arr, 16);
+    DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
+    mask7 = __lsx_vaddi_bu(mask4, 6);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src3 = __lsx_vldx(src, src_stride_3x);
+    src += src_stride_4x;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+    src += src_stride_3x;
+
+    DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask4, src3, src0, mask5, src3, src0,
+              mask6, src3, src0, mask7, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask4, src4, src1, mask5, src4, src1,
+              mask6, src4, src1, mask7, vec4, vec5, vec6, vec7);
+    DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask4, src5, src2, mask5, src5, src2,
+              mask6, src5, src2, mask7, vec8, vec9, vec10, vec11);
+    DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask4, src6, src3, mask5, src6, src3,
+              mask6, src6, src3, mask7, vec12, vec13, vec14, vec15);
+    dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
+              dst30, dst30);
+    dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
+    dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
+              dst41, dst41);
+    dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
+    dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
+              dst52, dst52);
+    dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
+    dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
+              dst63, dst63);
+    dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
+
+    DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
+    DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
+    dst32_r = __lsx_vilvl_h(dst63, dst52);
+    dst65_r = __lsx_vilvh_h(dst63, dst52);
+
+    dst66 = __lsx_vreplvei_d(dst63, 1);
+
+    for (loop_cnt = height >> 2; loop_cnt--;) {
+        src7 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+        src10 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask4, src9, src7, mask5, src9,
+                  src7, mask6, src9, src7, mask7, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask4, src10, src8, mask5, src10,
+                  src8, mask6, src10, src8, mask7, vec4, vec5, vec6, vec7);
+        dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
+                  dst97, dst97);
+        dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
+        dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
+                  filt2, dst108, dst108);
+        dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
+        dst109_r = __lsx_vilvh_h(dst108, dst97);
+        dst66 = __lsx_vreplvei_d(dst97, 1);
+        dst98_r = __lsx_vilvl_h(dst66, dst108);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+                  filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
+                  filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
+                  filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
+                  filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r,
+                  dst0_r, dst2_r);
+        __lsx_vstelm_d(dst0_r, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst0_r, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst2_r, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(dst2_r, dst, 0, 1);
+        dst += dst_stride;
+
+        dst10_r = dst54_r;
+        dst32_r = dst76_r;
+        dst54_r = dst98_r;
+        dst21_r = dst65_r;
+        dst43_r = dst87_r;
+        dst65_r = dst109_r;
+        dst66 = __lsx_vreplvei_d(dst108, 1);
+    }
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+                               int16_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 64);
+}
+
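+/* 4-tap horizontal filter, width 32: each output row is built from three
+ * overlapping 16-byte loads. */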
+static void hevc_hz_4t_32w_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i src0, src1, src2;
+    __m128i filt0, filt1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, mask2, mask3;
+    __m128i dst0, dst1, dst2, dst3;
+    __m128i vec0, vec1, vec2, vec3;
+
+    src -= 1;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 10);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+        src2 = __lsx_vld(src, 24);
+        src += src_stride;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src2, src2, mask0,
+                  vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask1, src2, src2, mask1,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vst(dst1, dst, 16);
+        __lsx_vst(dst2, dst, 32);
+        __lsx_vst(dst3, dst, 48);
+        dst += dst_stride;
+    }
+}
+
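+/* 4-tap vertical filter, width 16, four output rows per iteration. */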
+static void hevc_vt_4t_16w_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5;
+    __m128i src10_r, src32_r, src21_r, src43_r;
+    __m128i src10_l, src32_l, src21_l, src43_l;
+    __m128i dst0_r, dst1_r, dst0_l, dst1_l;
+    __m128i filt0, filt1;
+
+    src -= src_stride;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src += src_stride_3x;
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src3 = __lsx_vld(src, 0);
+        src4 = __lsx_vldx(src, src_stride);
+        src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vst(dst0_l, dst, 16);
+        dst += dst_stride;
+        __lsx_vst(dst1_r, dst, 0);
+        __lsx_vst(dst1_l, dst, 16);
+        dst += dst_stride;
+
+        src5 = __lsx_vld(src, 0);
+        src2 = __lsx_vldx(src, src_stride);
+        src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vst(dst0_l, dst, 16);
+        dst += dst_stride;
+        __lsx_vst(dst1_r, dst, 0);
+        __lsx_vst(dst1_l, dst, 16);
+        dst += dst_stride;
+    }
+}
+
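+/* 4-tap vertical filter, width 24: a 16-column stream from src and an
+ * 8-column stream from src + 16. */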
+static void hevc_vt_4t_24w_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    uint8_t *_src;
+
+    __m128i src0, src1, src2, src3, src4, src5;
+    __m128i src6, src7, src8, src9, src10, src11;
+    __m128i src10_r, src32_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src87_r, src109_r;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i src10_l, src32_l, src21_l, src43_l;
+    __m128i dst0_l, dst1_l;
+    __m128i filt0, filt1;
+
+    src -= src_stride;
+    _src = src + 16;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+    src6 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+    src  += src_stride_3x;
+    _src += src_stride_3x;
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+        src  += src_stride_2x;
+        _src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+                  dst2_r, dst3_r);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
+                  src109_r, filt1, dst2_r, dst3_r);
+
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vst(dst0_l, dst, 16);
+        __lsx_vst(dst2_r, dst, 32);
+        dst += dst_stride;
+        __lsx_vst(dst1_r, dst, 0);
+        __lsx_vst(dst1_l, dst, 16);
+        __lsx_vst(dst3_r, dst, 32);
+        dst += dst_stride;
+
+        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+        src  += src_stride_2x;
+        _src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, src10_l,
+                  filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, filt1,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+                  dst2_r, dst3_r);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r, src87_r,
+                  filt1, dst2_r, dst3_r);
+
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vst(dst0_l, dst, 16);
+        __lsx_vst(dst2_r, dst, 32);
+        dst += dst_stride;
+        __lsx_vst(dst1_r, dst, 0);
+        __lsx_vst(dst1_l, dst, 16);
+        __lsx_vst(dst3_r, dst, 32);
+        dst += dst_stride;
+    }
+}
+
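+/* 4-tap vertical filter, width 32: two independent 16-column streams. */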
+static void hevc_vt_4t_32w_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter,
+                               int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    uint8_t *_src;
+
+    __m128i src0, src1, src2, src3, src4, src5;
+    __m128i src6, src7, src8, src9, src10, src11;
+    __m128i src10_r, src32_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src87_r, src109_r;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i src10_l, src32_l, src76_l, src98_l;
+    __m128i src21_l, src43_l, src87_l, src109_l;
+    __m128i dst0_l, dst1_l, dst2_l, dst3_l;
+    __m128i filt0, filt1;
+
+    src -= src_stride;
+    _src = src + 16;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+    src6 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+    src  += src_stride_3x;
+    _src += src_stride_3x;
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+        src  += src_stride_2x;
+        _src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+        DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
+                  filt0, src87_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst2_l, src98_l,
+                  filt1, dst3_r, src109_r, filt1, dst3_l, src109_l, filt1,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vst(dst0_l, dst, 16);
+        __lsx_vst(dst2_r, dst, 32);
+        __lsx_vst(dst2_l, dst, 48);
+        dst += dst_stride;
+        __lsx_vst(dst1_r, dst, 0);
+        __lsx_vst(dst1_l, dst, 16);
+        __lsx_vst(dst3_r, dst, 32);
+        __lsx_vst(dst3_l, dst, 48);
+        dst += dst_stride;
+
+        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+        src  += src_stride_2x;
+        _src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+        DUP2_ARG2(__lsx_vilvh_b, src11, src10, src8, src11, src76_l, src87_l);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src98_l, filt0, src109_r,
+                  filt0, src109_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst2_l, src76_l,
+                  filt1, dst3_r, src87_r, filt1, dst3_l, src87_l, filt1,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vst(dst0_l, dst, 16);
+        __lsx_vst(dst2_r, dst, 32);
+        __lsx_vst(dst2_l, dst, 48);
+        dst += dst_stride;
+        __lsx_vst(dst1_r, dst, 0);
+        __lsx_vst(dst1_l, dst, 16);
+        __lsx_vst(dst3_r, dst, 32);
+        __lsx_vst(dst3_l, dst, 48);
+        dst += dst_stride;
+    }
+}
+
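+/* 4-tap hv filter for a single 8x2 block. */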
+static void hevc_hv_4t_8x2_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+
+    __m128i src0, src1, src2, src3, src4;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1;
+    __m128i filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    __m128i dst0, dst1, dst2, dst3, dst4;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src3 = __lsx_vldx(src, src_stride_3x);
+    src4 = __lsx_vldx(src, src_stride_4x);
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+              vec6, filt0, dst0, dst1, dst2, dst3);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+              dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+
+    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+    DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+    DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+
+    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+              dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+              dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+    __lsx_vst(dst0_r, dst, 0);
+    __lsx_vst(dst1_r, dst + dst_stride, 0);
+}
+
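+/* 4-tap hv filter, height 4, processing width8mult strips of 8 columns. */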
+static void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride,
+                                   int16_t *dst, int32_t dst_stride,
+                                   const int8_t *filter_x,
+                                   const int8_t *filter_y, int32_t width8mult)
+{
+    int32_t cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x  = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
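+    /* dst is int16_t, so dst_stride_x/_2x/_3x are byte offsets for
+     * __lsx_vstx: one, two and three output rows respectively. */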
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
+    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    for (cnt = width8mult; cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+                  src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+        src += src_stride_4x;
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
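+        /* step back four rows and over to the next 8 columns */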
+        src += (8 - src_stride_4x);
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+                  vec4, vec5);
+
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+                  dst0, dst1);
+        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+                  vec6, vec7);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                  vec6, filt0, dst3, dst4, dst5, dst6);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+                  dst0_r, dst1_r);
+        DUP2_ARG2(__lsx_vpickev_h, dst2_l, dst2_r, dst3_l, dst3_r,
+                  dst2_r, dst3_r);
+
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vstx(dst1_r, dst, dst_stride_x);
+        __lsx_vstx(dst2_r, dst, dst_stride_2x);
+        __lsx_vstx(dst3_r, dst, dst_stride_3x);
+        dst += 8;
+    }
+}
+
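+/* 4-tap hv filter for a single 8x6 block. */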
+static void hevc_hv_4t_8x6_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y)
+{
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    src0 = __lsx_vld(src, 0);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+    src += src_stride_4x;
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+              src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
+              mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
+    DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
+              mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
+    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1,
+              vec16, vec17);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+              filt0, dst0, dst1, dst2, dst3);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+              dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0,
+              vec14, filt0, dst4, dst5, dst6, dst7);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
+              vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
+    dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
+    dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
+
+    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+              dst10_r, dst21_r, dst32_r, dst43_r);
+    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+              dst10_l, dst21_l, dst32_l, dst43_l);
+    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+              dst54_r, dst65_r, dst76_r, dst87_r);
+    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+              dst54_l, dst65_l, dst76_l, dst87_l);
+
+    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+              dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+              dst2_r, dst2_l, dst3_r, dst3_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+              dst4_r, dst4_l, dst5_r, dst5_l);
+    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, dst0_r,
+              dst0_l, dst1_r, dst1_l);
+    DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, dst2_r,
+              dst2_l, dst3_r, dst3_l);
+    DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6, dst4_r,
+              dst4_l, dst5_r, dst5_l);
+
+    DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+              dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+    DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
+
+    __lsx_vst(dst0_r, dst, 0);
+    __lsx_vstx(dst1_r, dst, dst_stride_2x);
+    dst += dst_stride_2x;
+    __lsx_vst(dst2_r, dst, 0);
+    __lsx_vstx(dst3_r, dst, dst_stride_2x);
+    dst += dst_stride_2x;
+    __lsx_vst(dst4_r, dst, 0);
+    __lsx_vstx(dst5_r, dst, dst_stride_2x);
+}
+
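+/* 4-tap hv filter for width8mult strips of 8 columns, 4 output rows per
+ * iteration. */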
+static void hevc_hv_4t_8multx4mult_lsx(uint8_t *src,
+                                       int32_t src_stride,
+                                       int16_t *dst,
+                                       int32_t dst_stride,
+                                       const int8_t *filter_x,
+                                       const int8_t *filter_y,
+                                       int32_t height,
+                                       int32_t width8mult)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x  = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
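+    /* dst_stride_x/_2x/_3x are byte offsets for __lsx_vstx (dst is int16_t);
+     * added to the pointer itself, dst_stride_2x advances four rows. */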
+
+    __m128i src0, src1, src2, src3, src4, src5, src6;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    for (cnt = width8mult; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        src0 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src1, src2);
+        src_tmp += src_stride_3x;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+                  vec4, vec5);
+
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+                  dst0, dst1);
+        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            src3 = __lsx_vld(src_tmp, 0);
+            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                      src4, src5);
+            src6 = __lsx_vldx(src_tmp, src_stride_3x);
+            src_tmp += src_stride_4x;
+
+            DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+                      vec0, vec1);
+            DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+                      vec2, vec3);
+            DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+                      vec4, vec5);
+            DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+                      vec6, vec7);
+
+            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                      vec6, filt0, dst3, dst4, dst5, dst6);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+                      filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
+                      dst4, dst5, dst6);
+
+            DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+            DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+            DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+            DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+            DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+                      dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+                      dst2_r, dst2_l, dst3_r, dst3_l);
+
+            DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+                      dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+
+            __lsx_vst(dst0_r, dst_tmp, 0);
+            __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
+            __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
+            __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
+            dst_tmp += dst_stride_2x;
+
+            dst10_r = dst54_r;
+            dst10_l = dst54_l;
+            dst21_r = dst65_r;
+            dst21_l = dst65_l;
+            dst2 = dst6;
+        }
+        src += 8;
+        dst += 8;
+    }
+}
+
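+/* 4-tap hv filter, width 8: dispatch on height to the specialized kernels. */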
+static void hevc_hv_4t_8w_lsx(uint8_t *src,
+                              int32_t src_stride,
+                              int16_t *dst,
+                              int32_t dst_stride,
+                              const int8_t *filter_x,
+                              const int8_t *filter_y,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, 1);
+    } else if (6 == height) {
+        hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride,
+                           filter_x, filter_y);
+    } else if (0 == (height & 0x03)) {
+        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 1);
+    }
+}
+
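+/* 4-tap hv filter, width 12: 8 columns through the 8-column path, then the
+ * remaining 4 columns 8 output rows per iteration. */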
+static void hevc_hv_4t_12w_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *src_tmp;
+    int16_t *dst_tmp;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_x  = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_2x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i mask0, mask1, mask2, mask3;
+    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, dst0;
+    __m128i dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
+    __m128i dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
+    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+              src1, src2);
+    src_tmp += src_stride_3x;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+    dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+              dst0, dst1);
+    dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        src3 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src4, src5);
+        src6 = __lsx_vldx(src_tmp, src_stride_3x);
+        src_tmp += src_stride_4x;
+
+        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+                  vec6, vec7);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                  vec6, filt0, dst3, dst4, dst5, dst6);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+                  filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
+                  dst4, dst5, dst6);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+                  dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+        __lsx_vst(dst0_r, dst_tmp, 0);
+        __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
+        __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
+        __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
+        dst_tmp += dst_stride_2x;
+
+        dst10_r = dst54_r;
+        dst10_l = dst54_l;
+        dst21_r = dst65_r;
+        dst21_l = dst65_l;
+        dst2 = dst6;
+    }
+
+    src += 8;
+    dst += 8;
+
+    mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
+    mask3 = __lsx_vaddi_bu(mask2, 2);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src += src_stride_3x;
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
+              dst10, dst21);
+    dst10_r = __lsx_vilvl_h(dst21, dst10);
+    dst21_r = __lsx_vilvh_h(dst21, dst10);
+    dst22 = __lsx_vreplvei_d(dst21, 1);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        src3 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
+        src6 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src7 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+        src10 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        DUP2_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src8, src4, mask2, src8, src4, mask3,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src10, src6, mask2, src10, src6, mask3,
+                  vec6, vec7);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                  vec6, filt0, dst73, dst84, dst95, dst106);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
+                  filt1, dst95, vec5, filt1, dst106, vec7, filt1, dst73,
+                  dst84, dst95, dst106);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst73, dst22, dst84, dst73, dst32_r, dst43_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
+        DUP2_ARG2(__lsx_vilvl_h, dst95, dst84, dst106, dst95, dst54_r, dst65_r);
+        dst109_r = __lsx_vilvh_h(dst106, dst95);
+        dst22 = __lsx_vreplvei_d(dst73, 1);
+        dst76_r = __lsx_vilvl_h(dst22, dst106);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+                  filt_h0, dst43_r, filt_h0, tmp0, tmp1, tmp2, tmp3);
+        DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
+                  filt_h0, dst87_r, filt_h0, tmp4, tmp5, tmp6, tmp7);
+        DUP4_ARG3(__lsx_vdp2add_w_h, tmp0, dst32_r, filt_h1, tmp1, dst43_r,
+                  filt_h1, tmp2, dst54_r, filt_h1, tmp3, dst65_r, filt_h1,
+                  tmp0, tmp1, tmp2, tmp3);
+        DUP4_ARG3(__lsx_vdp2add_w_h, tmp4, dst76_r, filt_h1, tmp5, dst87_r,
+                  filt_h1, tmp6, dst98_r, filt_h1, tmp7, dst109_r, filt_h1,
+                  tmp4, tmp5, tmp6, tmp7);
+        DUP4_ARG2(__lsx_vsrai_w, tmp0, 6, tmp1, 6, tmp2, 6, tmp3, 6,
+                  tmp0, tmp1, tmp2, tmp3);
+        DUP4_ARG2(__lsx_vsrai_w, tmp4, 6, tmp5, 6, tmp6, 6, tmp7, 6,
+                  tmp4, tmp5, tmp6, tmp7);
+        DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4,
+                  tmp7, tmp6, tmp0, tmp1, tmp2, tmp3);
+
+        __lsx_vstelm_d(tmp0, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp0, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp1, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp1, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp2, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp2, dst, 0, 1);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp3, dst, 0, 0);
+        dst += dst_stride;
+        __lsx_vstelm_d(tmp3, dst, 0, 1);
+        dst += dst_stride;
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = __lsx_vreplvei_d(dst106, 1);
+    }
+}
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    if (4 == height) {
+        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, 2);
+    } else {
+        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 2);
+    }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 3);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src,
+                               int32_t src_stride,
+                               int16_t *dst,
+                               int32_t dst_stride,
+                               const int8_t *filter_x,
+                               const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 4);
+}
+
+#define MC_COPY(WIDTH)                                                    \
+void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_lsx(int16_t *dst,             \
+                                                uint8_t *src,             \
+                                                ptrdiff_t src_stride,     \
+                                                int height,               \
+                                                intptr_t mx,              \
+                                                intptr_t my,              \
+                                                int width)                \
+{                                                                         \
+    hevc_copy_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, height);  \
+}
+
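+/* Each MC_COPY(W) instantiation below defines one exported function; for
+ * illustration (hand-expanded here, not generated output), MC_COPY(8) yields:
+ *
+ *     void ff_hevc_put_hevc_pel_pixels8_8_lsx(int16_t *dst, uint8_t *src,
+ *                                             ptrdiff_t src_stride, int height,
+ *                                             intptr_t mx, intptr_t my,
+ *                                             int width)
+ *     {
+ *         hevc_copy_8w_lsx(src, src_stride, dst, MAX_PB_SIZE, height);
+ *     }
+ */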
+MC_COPY(4);
+MC_COPY(6);
+MC_COPY(8);
+MC_COPY(12);
+MC_COPY(16);
+MC_COPY(24);
+MC_COPY(32);
+MC_COPY(48);
+MC_COPY(64);
+
+#undef MC_COPY
+
+#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst,          \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)             \
+{                                                                         \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
+                                                                          \
+    hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst,           \
+                                          MAX_PB_SIZE, filter, height);   \
+}
+
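+/* Hand-expanded example of one MC instantiation: MC(qpel, h, 8, 8, hz, mx)
+ * defines
+ *
+ *     void ff_hevc_put_hevc_qpel_h8_8_lsx(int16_t *dst, uint8_t *src,
+ *                                         ptrdiff_t src_stride, int height,
+ *                                         intptr_t mx, intptr_t my, int width)
+ *     {
+ *         const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
+ *
+ *         hevc_hz_8t_8w_lsx(src, src_stride, dst, MAX_PB_SIZE, filter, height);
+ *     }
+ */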
+MC(qpel, h, 4, 8, hz, mx);
+MC(qpel, h, 8, 8, hz, mx);
+MC(qpel, h, 12, 8, hz, mx);
+MC(qpel, h, 16, 8, hz, mx);
+MC(qpel, h, 24, 8, hz, mx);
+MC(qpel, h, 32, 8, hz, mx);
+MC(qpel, h, 48, 8, hz, mx);
+MC(qpel, h, 64, 8, hz, mx);
+
+MC(qpel, v, 4, 8, vt, my);
+MC(qpel, v, 8, 8, vt, my);
+MC(qpel, v, 12, 8, vt, my);
+MC(qpel, v, 16, 8, vt, my);
+MC(qpel, v, 24, 8, vt, my);
+MC(qpel, v, 32, 8, vt, my);
+MC(qpel, v, 48, 8, vt, my);
+MC(qpel, v, 64, 8, vt, my);
+
+MC(epel, h, 32, 4, hz, mx);
+
+MC(epel, v, 16, 4, vt, my);
+MC(epel, v, 24, 4, vt, my);
+MC(epel, v, 32, 4, vt, my);
+
+#undef MC
+
+#define MC_HV(PEL, WIDTH, TAP)                                          \
+void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_lsx(int16_t *dst,           \
+                                                uint8_t *src,           \
+                                                ptrdiff_t src_stride,   \
+                                                int height,             \
+                                                intptr_t mx,            \
+                                                intptr_t my,            \
+                                                int width)              \
+{                                                                       \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];           \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];           \
+                                                                        \
+    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE,  \
+                                    filter_x, filter_y, height);        \
+}
+
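+/* The MC_HV instantiations differ from MC only in looking up two filters:
+ * e.g. MC_HV(epel, 8, 4) selects the 4-tap filters for mx and my from
+ * ff_hevc_epel_filters[] and forwards them to hevc_hv_4t_8w_lsx(). */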
+MC_HV(qpel, 4, 8);
+MC_HV(qpel, 8, 8);
+MC_HV(qpel, 12, 8);
+MC_HV(qpel, 16, 8);
+MC_HV(qpel, 24, 8);
+MC_HV(qpel, 32, 8);
+MC_HV(qpel, 48, 8);
+MC_HV(qpel, 64, 8);
+
+MC_HV(epel, 8, 4);
+MC_HV(epel, 12, 4);
+MC_HV(epel, 16, 4);
+MC_HV(epel, 24, 4);
+MC_HV(epel, 32, 4);
+
+#undef MC_HV
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
new file mode 100644
index 0000000000..3259e03f13
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LSX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define MC(PEL, DIR, WIDTH)                                               \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst,          \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)
+
+MC(pel, pixels, 4);
+MC(pel, pixels, 6);
+MC(pel, pixels, 8);
+MC(pel, pixels, 12);
+MC(pel, pixels, 16);
+MC(pel, pixels, 24);
+MC(pel, pixels, 32);
+MC(pel, pixels, 48);
+MC(pel, pixels, 64);
+
+MC(qpel, h, 4);
+MC(qpel, h, 8);
+MC(qpel, h, 12);
+MC(qpel, h, 16);
+MC(qpel, h, 24);
+MC(qpel, h, 32);
+MC(qpel, h, 48);
+MC(qpel, h, 64);
+
+MC(qpel, v, 4);
+MC(qpel, v, 8);
+MC(qpel, v, 12);
+MC(qpel, v, 16);
+MC(qpel, v, 24);
+MC(qpel, v, 32);
+MC(qpel, v, 48);
+MC(qpel, v, 64);
+
+MC(qpel, hv, 4);
+MC(qpel, hv, 8);
+MC(qpel, hv, 12);
+MC(qpel, hv, 16);
+MC(qpel, hv, 24);
+MC(qpel, hv, 32);
+MC(qpel, hv, 48);
+MC(qpel, hv, 64);
+
+MC(epel, h, 32);
+
+MC(epel, v, 16);
+MC(epel, v, 24);
+MC(epel, v, 32);
+
+MC(epel, hv, 8);
+MC(epel, hv, 12);
+MC(epel, hv, 16);
+MC(epel, hv, 24);
+MC(epel, hv, 32);
+
+#undef MC
+
+#endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
  2022-02-14  8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX Hao Chen
@ 2022-02-14  8:25 ` Hao Chen
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi " Hao Chen
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Hao Chen @ 2022-02-14  8:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lu Wang

From: Lu Wang <wanglu@loongson.cn>

ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 110fps
after : 124fps
---
 libavcodec/loongarch/Makefile                 |    2 +
 libavcodec/loongarch/hevc_idct_lsx.c          |  842 ++++++
 libavcodec/loongarch/hevc_lpf_sao_lsx.c       | 2485 +++++++++++++++++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   19 +
 libavcodec/loongarch/hevcdsp_lsx.h            |   26 +
 5 files changed, 3374 insertions(+)
 create mode 100644 libavcodec/loongarch/hevc_idct_lsx.c
 create mode 100644 libavcodec/loongarch/hevc_lpf_sao_lsx.c

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 9f416d0c6e..cfc8e3aaff 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -26,3 +26,5 @@ LSX-OBJS-$(CONFIG_VP9_DECODER)        += loongarch/vp9_mc_lsx.o \
                                          loongarch/vp9_lpf_lsx.o \
                                          loongarch/vp9_idct_lsx.o
 LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
+                                         loongarch/hevc_idct_lsx.o \
+                                         loongarch/hevc_lpf_sao_lsx.o
diff --git a/libavcodec/loongarch/hevc_idct_lsx.c b/libavcodec/loongarch/hevc_idct_lsx.c
new file mode 100644
index 0000000000..2193b27546
--- /dev/null
+++ b/libavcodec/loongarch/hevc_idct_lsx.c
@@ -0,0 +1,842 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
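+/* HEVC inverse-DCT coefficient tables for the 8-, 16- and 32-point
+ * transforms, pre-ordered for the LSX vdp2 dot-product instructions. */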
+static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
+    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
+};
+
+static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
+    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
+    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
+    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
+    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
+};
+
+static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
+    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+};
+
+static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
+    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
+    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
+    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
+    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
+};
+
+static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
+    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
+};
+
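+/* 4-point inverse DCT column pass. The vldi immediates below replicate the
+ * transform constants per 32-bit lane (the low 10 bits of the immediate hold
+ * the value): 0x0840 -> 64, 0x0853 -> 83, 0x0824 -> 36. */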
+#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,          \
+                         sum0, sum1, sum2, sum3, shift)       \
+{                                                             \
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5;               \
+    __m128i cnst64 = __lsx_vldi(0x0840);                      \
+    __m128i cnst83 = __lsx_vldi(0x0853);                      \
+    __m128i cnst36 = __lsx_vldi(0x0824);                      \
+                                                              \
+    vec0 = __lsx_vdp2_w_h(in_r0, cnst64);                     \
+    vec1 = __lsx_vdp2_w_h(in_l0, cnst83);                     \
+    vec2 = __lsx_vdp2_w_h(in_r1, cnst64);                     \
+    vec3 = __lsx_vdp2_w_h(in_l1, cnst36);                     \
+    vec4 = __lsx_vdp2_w_h(in_l0, cnst36);                     \
+    vec5 = __lsx_vdp2_w_h(in_l1, cnst83);                     \
+                                                              \
+    sum0 = __lsx_vadd_w(vec0, vec2);                          \
+    sum1 = __lsx_vsub_w(vec0, vec2);                          \
+    vec1 = __lsx_vadd_w(vec1, vec3);                          \
+    vec4 = __lsx_vsub_w(vec4, vec5);                          \
+    sum2 = __lsx_vsub_w(sum1, vec4);                          \
+    sum3 = __lsx_vsub_w(sum0, vec1);                          \
+    sum0 = __lsx_vadd_w(sum0, vec1);                          \
+    sum1 = __lsx_vadd_w(sum1, vec4);                          \
+                                                              \
+    sum0 = __lsx_vsrari_w(sum0, shift);                       \
+    sum1 = __lsx_vsrari_w(sum1, shift);                       \
+    sum2 = __lsx_vsrari_w(sum2, shift);                       \
+    sum3 = __lsx_vsrari_w(sum3, shift);                       \
+    sum0 = __lsx_vsat_w(sum0, 15);                            \
+    sum1 = __lsx_vsat_w(sum1, 15);                            \
+    sum2 = __lsx_vsat_w(sum2, 15);                            \
+    sum3 = __lsx_vsat_w(sum3, 15);                            \
+}
+
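+/* Note: `filter` below refers to a pointer in the caller's scope. */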
+#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
+{                                                                        \
+    __m128i src0_r, src1_r, src2_r, src3_r;                              \
+    __m128i src0_l, src1_l, src2_l, src3_l;                              \
+    __m128i filter0, filter1, filter2, filter3;                          \
+    __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r;        \
+    __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l;        \
+    __m128i sum0_r, sum1_r, sum2_r, sum3_r;                              \
+    __m128i sum0_l, sum1_l, sum2_l, sum3_l;                              \
+                                                                         \
+    DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7,     \
+              src0_r, src1_r, src2_r, src3_r);                           \
+    DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7,     \
+              src0_l, src1_l, src2_l, src3_l);                           \
+                                                                         \
+    DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8,          \
+              filter, 12, filter0, filter1, filter2, filter3);           \
+    DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
+              src1_r, filter1, src1_l, filter1,  temp0_r, temp0_l,       \
+              temp1_r, temp1_l);                                         \
+                                                                         \
+    LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+                      sum1_l, sum1_r);                                   \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2,          \
+              src3_r, filter3, src3_l, filter3,  temp2_r, temp2_l,       \
+              temp3_r, temp3_l);                                         \
+    temp2_r = __lsx_vadd_w(temp2_r, temp3_r);                            \
+    temp2_l = __lsx_vadd_w(temp2_l, temp3_l);                            \
+    sum0_r  = __lsx_vadd_w(sum0_r, temp2_r);                             \
+    sum0_l  = __lsx_vadd_w(sum0_l, temp2_l);                             \
+    sum3_r  = __lsx_vsub_w(sum3_r, temp2_r);                             \
+    sum3_l  = __lsx_vsub_w(sum3_l, temp2_l);                             \
+                                                                         \
+    in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift);                     \
+    in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift);                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3,          \
+              src3_r, filter2, src3_l, filter2,  temp4_r, temp4_l,       \
+              temp5_r, temp5_l);                                         \
+    temp4_r = __lsx_vsub_w(temp4_r, temp5_r);                            \
+    temp4_l = __lsx_vsub_w(temp4_l, temp5_l);                            \
+    sum1_r  = __lsx_vadd_w(sum1_r, temp4_r);                             \
+    sum1_l  = __lsx_vadd_w(sum1_l, temp4_l);                             \
+    sum2_r  = __lsx_vsub_w(sum2_r, temp4_r);                             \
+    sum2_l  = __lsx_vsub_w(sum2_l, temp4_l);                             \
+                                                                         \
+    in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift);                     \
+    in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift);                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24,       \
+              filter, 28, filter0, filter1, filter2, filter3);           \
+    DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
+              src1_r, filter1, src1_l, filter1,  temp0_r, temp0_l,       \
+              temp1_r, temp1_l);                                         \
+                                                                         \
+    LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+                      sum1_l, sum1_r);                                   \
+    sum2_r = sum1_r;                                                     \
+    sum2_l = sum1_l;                                                     \
+    sum3_r = sum0_r;                                                     \
+    sum3_l = sum0_l;                                                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2,          \
+              src3_r, filter3, src3_l, filter3,  temp2_r, temp2_l,       \
+              temp3_r, temp3_l);                                         \
+    temp2_r = __lsx_vadd_w(temp2_r, temp3_r);                            \
+    temp2_l = __lsx_vadd_w(temp2_l, temp3_l);                            \
+    sum0_r  = __lsx_vadd_w(sum0_r, temp2_r);                             \
+    sum0_l  = __lsx_vadd_w(sum0_l, temp2_l);                             \
+    sum3_r  = __lsx_vsub_w(sum3_r, temp2_r);                             \
+    sum3_l  = __lsx_vsub_w(sum3_l, temp2_l);                             \
+                                                                         \
+    in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift);                     \
+    in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift);                     \
+                                                                         \
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3,          \
+              src3_r, filter2, src3_l, filter2,  temp4_r, temp4_l,       \
+              temp5_r, temp5_l);                                         \
+    temp4_r = __lsx_vsub_w(temp4_r, temp5_r);                            \
+    temp4_l = __lsx_vsub_w(temp4_l, temp5_l);                            \
+    sum1_r  = __lsx_vsub_w(sum1_r, temp4_r);                             \
+    sum1_l  = __lsx_vsub_w(sum1_l, temp4_l);                             \
+    sum2_r  = __lsx_vadd_w(sum2_r, temp4_r);                             \
+    sum2_l  = __lsx_vadd_w(sum2_l, temp4_l);                             \
+                                                                         \
+    in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift);                     \
+    in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift);                     \
+}
+
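+/* Note: besides its parameters, this macro reads and advances the caller's
+ * `filter` and `buf_ptr` pointers and uses the caller's `j` and `k`
+ * loop variables. */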
+#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r,                   \
+                           src4_r, src5_r, src6_r, src7_r,                   \
+                           src0_l, src1_l, src2_l, src3_l,                   \
+                           src4_l, src5_l, src6_l, src7_l, shift)            \
+{                                                                            \
+    int16_t *ptr0, *ptr1;                                                    \
+    __m128i dst0, dst1;                                                      \
+    __m128i filter0, filter1, filter2, filter3;                              \
+    __m128i temp0_r, temp1_r, temp0_l, temp1_l;                              \
+    __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l;          \
+    __m128i sum3_l, res0_r, res1_r, res0_l, res1_l;                          \
+                                                                             \
+    ptr0 = (buf_ptr + 112);                                                  \
+    ptr1 = (buf_ptr + 128);                                                  \
+    k = -1;                                                                  \
+                                                                             \
+    for (j = 0; j < 4; j++)                                                  \
+    {                                                                        \
+        DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16,         \
+                  filter, 20, filter0, filter1, filter2, filter3);           \
+        DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0,          \
+                  src4_r, filter2, src4_l, filter2,  sum0_r, sum0_l,         \
+                  sum2_r, sum2_l);                                           \
+        DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2,          \
+                  sum3_r, sum3_l);                                           \
+        DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l,        \
+                  src1_l, filter1, sum2_r, src5_r, filter3, sum2_l,          \
+                  src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l);          \
+        DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l,        \
+                  src6_l, filter3, sum3_r, sum3_l);                          \
+                                                                             \
+        sum1_r = sum0_r;                                                     \
+        sum1_l = sum0_l;                                                     \
+                                                                             \
+        DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24,        \
+                  filter, 28, filter0, filter1, filter2, filter3);           \
+        filter += 16;                                                        \
+        DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,          \
+                  temp0_r, temp0_l);                                         \
+        DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l,        \
+                  src6_l, filter2, sum2_r, sum2_l);                          \
+        DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2,          \
+                  temp1_r, temp1_l);                                         \
+                                                                             \
+        sum0_r = __lsx_vadd_w(sum0_r, temp0_r);                              \
+        sum0_l = __lsx_vadd_w(sum0_l, temp0_l);                              \
+        sum1_r = __lsx_vsub_w(sum1_r, temp0_r);                              \
+        sum1_l = __lsx_vsub_w(sum1_l, temp0_l);                              \
+        sum3_r = __lsx_vsub_w(temp1_r, sum3_r);                              \
+        sum3_l = __lsx_vsub_w(temp1_l, sum3_l);                              \
+                                                                             \
+        DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1,          \
+                  temp0_r, temp0_l);                                         \
+        DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l,        \
+                  src7_l, filter3, sum3_r, src4_r, filter3, sum3_l,          \
+                  src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l);          \
+                                                                             \
+        sum0_r = __lsx_vadd_w(sum0_r, temp0_r);                              \
+        sum0_l = __lsx_vadd_w(sum0_l, temp0_l);                              \
+        sum1_r = __lsx_vsub_w(sum1_r, temp0_r);                              \
+        sum1_l = __lsx_vsub_w(sum1_l, temp0_l);                              \
+                                                                             \
+        LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l,    \
+                          res1_l, res1_r);                                   \
+        dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift);                    \
+        dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift);                    \
+        __lsx_vst(dst0, buf_ptr, 0);                                         \
+        __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0);               \
+                                                                             \
+        LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l,    \
+                          res1_l, res1_r);                                   \
+                                                                             \
+        dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift);                    \
+        dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift);                    \
+        __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0);        \
+        __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0);        \
+                                                                             \
+        k *= -1;                                                             \
+        buf_ptr += 16;                                                       \
+    }                                                                        \
+}
+
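+/* In-place butterfly against the 32-bit scratch buffer: the row at load_idx
+ * becomes sum + row, and sum - row is written to the row at store_idx. */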
+#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx)  \
+{                                                                     \
+    tmp0_r = __lsx_vld(input + load_idx * 8, 0);                      \
+    tmp0_l = __lsx_vld(input + load_idx * 8, 16);                     \
+    tmp1_r = sum0_r;                                                  \
+    tmp1_l = sum0_l;                                                  \
+    sum0_r = __lsx_vadd_w(sum0_r, tmp0_r);                            \
+    sum0_l = __lsx_vadd_w(sum0_l, tmp0_l);                            \
+    __lsx_vst(sum0_r, (input + load_idx * 8), 0);                     \
+    __lsx_vst(sum0_l, (input + load_idx * 8), 16);                    \
+    tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r);                            \
+    tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l);                            \
+    __lsx_vst(tmp1_r, (input + store_idx * 8), 0);                    \
+    __lsx_vst(tmp1_l, (input + store_idx * 8), 16);                   \
+}
+
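+/* Inverse DST column pass for 4x4 intra luma blocks. The vldi immediates
+ * replicate the DST constants 29, 55 and 74 (0x81d, 0x837, 0x84a); the
+ * fourth constant, 84, is folded in as 29 + 55. */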
+#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1,     \
+                              res0, res1, res2, res3, shift)  \
+{                                                             \
+    __m128i vec0, vec1, vec2, vec3;                           \
+    __m128i cnst74 = __lsx_vldi(0x84a);                       \
+    __m128i cnst55 = __lsx_vldi(0x837);                       \
+    __m128i cnst29 = __lsx_vldi(0x81d);                       \
+                                                              \
+    vec0 = __lsx_vadd_w(in_r0, in_r1);                        \
+    vec2 = __lsx_vsub_w(in_r0, in_l1);                        \
+    res0 = __lsx_vmul_w(vec0, cnst29);                        \
+    res1 = __lsx_vmul_w(vec2, cnst55);                        \
+    res2 = __lsx_vsub_w(in_r0, in_r1);                        \
+    vec1 = __lsx_vadd_w(in_r1, in_l1);                        \
+    res2 = __lsx_vadd_w(res2, in_l1);                         \
+    vec3 = __lsx_vmul_w(in_l0, cnst74);                       \
+    res3 = __lsx_vmul_w(vec0, cnst55);                        \
+                                                              \
+    res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55));    \
+    res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29));    \
+    res2 = __lsx_vmul_w(res2, cnst74);                        \
+    res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29));    \
+                                                              \
+    res0 = __lsx_vadd_w(res0, vec3);                          \
+    res1 = __lsx_vadd_w(res1, vec3);                          \
+    res3 = __lsx_vsub_w(res3, vec3);                          \
+                                                              \
+    res0 = __lsx_vsrari_w(res0, shift);                       \
+    res1 = __lsx_vsrari_w(res1, shift);                       \
+    res2 = __lsx_vsrari_w(res2, shift);                       \
+    res3 = __lsx_vsrari_w(res3, shift);                       \
+    res0 = __lsx_vsat_w(res0, 15);                            \
+    res1 = __lsx_vsat_w(res1, 15);                            \
+    res2 = __lsx_vsat_w(res2, 15);                            \
+    res3 = __lsx_vsat_w(res3, 15);                            \
+}
+
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit)
+{
+    __m128i in0, in1;
+    __m128i in_r0, in_l0, in_r1, in_l1;
+    __m128i sum0, sum1, sum2, sum3;
+    __m128i zero = __lsx_vldi(0x00);
+
+    in0   = __lsx_vld(coeffs, 0);
+    in1   = __lsx_vld(coeffs, 16);
+    in_r0 = __lsx_vilvl_h(zero, in0);
+    in_l0 = __lsx_vilvh_h(zero, in0);
+    in_r1 = __lsx_vilvl_h(zero, in1);
+    in_l1 = __lsx_vilvh_h(zero, in1);
+
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
+    LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
+    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
+
+    /* Pack and transpose */
+    in0  = __lsx_vpickev_h(sum2, sum0);
+    in1  = __lsx_vpickev_h(sum3, sum1);
+    sum0 = __lsx_vilvl_h(in1, in0);
+    sum1 = __lsx_vilvh_h(in1, in0);
+    in0  = __lsx_vilvl_w(sum1, sum0);
+    in1  = __lsx_vilvh_w(sum1, sum0);
+
+    __lsx_vst(in0, coeffs, 0);
+    __lsx_vst(in1, coeffs, 16);
+}
+
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit)
+{
+    const int16_t *filter = &gt8x8_cnst[0];
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+    DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32,
+              coeffs, 48, in0, in1, in2, in3);
+    DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96,
+              coeffs, 112, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       in0, in1, in2, in3, in4, in5, in6, in7);
+
+    __lsx_vst(in0, coeffs, 0);
+    __lsx_vst(in1, coeffs, 16);
+    __lsx_vst(in2, coeffs, 32);
+    __lsx_vst(in3, coeffs, 48);
+    __lsx_vst(in4, coeffs, 64);
+    __lsx_vst(in5, coeffs, 80);
+    __lsx_vst(in6, coeffs, 96);
+    __lsx_vst(in7, coeffs, 112);
+}
+
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit)
+{
+    int16_t i, j, k;
+    int16_t buf[256];
+    int16_t *buf_ptr = &buf[0];
+    int16_t *src = coeffs;
+    const int16_t *filter = &gt16x16_cnst[0];
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+
+    for (i = 2; i--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+                  in0, in1, in2, in3);
+        DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+                  in4, in5, in6, in7);
+        DUP4_ARG2(__lsx_vld, src, 256, src, 288, src, 320, src, 352,
+                  in8, in9, in10, in11);
+        DUP4_ARG2(__lsx_vld, src, 384, src, 416, src, 448, src, 480,
+                  in12, in13, in14, in15);
+
+        DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+                  src0_r, src1_r, src2_r, src3_r);
+        DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+                  src4_r, src5_r, src6_r, src7_r);
+        DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+                  src0_l, src1_l, src2_l, src3_l);
+        DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+                  src4_l, src5_l, src6_l, src7_l);
+
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 7);
+
+        src += 8;
+        buf_ptr = (&buf[0] + 8);
+        filter = &gt16x16_cnst[0];
+    }
+
+    src = &buf[0];
+    buf_ptr = coeffs;
+    filter = &gt16x16_cnst[0];
+
+    for (i = 2; i--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+                  in0, in8, in1, in9);
+        DUP4_ARG2(__lsx_vld, src, 64, src, 80, src, 96, src, 112,
+                  in2, in10, in3, in11);
+        DUP4_ARG2(__lsx_vld, src, 128, src, 144, src, 160, src, 176,
+                  in4, in12, in5, in13);
+        DUP4_ARG2(__lsx_vld, src, 192, src, 208, src, 224, src, 240,
+                  in6, in14, in7, in15);
+        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+                           in8, in9, in10, in11, in12, in13, in14, in15);
+        DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+                  src0_r, src1_r, src2_r, src3_r);
+        DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+                  src4_r, src5_r, src6_r, src7_r);
+        DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+                  src0_l, src1_l, src2_l, src3_l);
+        DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+                  src4_l, src5_l, src6_l, src7_l);
+        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+                           src4_l, src5_l, src6_l, src7_l, 12);
+
+        src += 128;
+        buf_ptr = coeffs + 8;
+        filter = &gt16x16_cnst[0];
+    }
+
+    DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96,
+              in0, in1, in2, in3);
+    DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224,
+              in4, in5, in6, in7);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    __lsx_vst(vec0, coeffs, 0);
+    __lsx_vst(vec1, coeffs, 32);
+    __lsx_vst(vec2, coeffs, 64);
+    __lsx_vst(vec3, coeffs, 96);
+    __lsx_vst(vec4, coeffs, 128);
+    __lsx_vst(vec5, coeffs, 160);
+    __lsx_vst(vec6, coeffs, 192);
+    __lsx_vst(vec7, coeffs, 224);
+
+    src = coeffs + 8;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3);
+    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+              in4, in5, in6, in7);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    src = coeffs + 128;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+              in8, in9, in10, in11);
+    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+              in12, in13, in14, in15);
+
+    __lsx_vst(vec0, src, 0);
+    __lsx_vst(vec1, src, 32);
+    __lsx_vst(vec2, src, 64);
+    __lsx_vst(vec3, src, 96);
+    __lsx_vst(vec4, src, 128);
+    __lsx_vst(vec5, src, 160);
+    __lsx_vst(vec6, src, 192);
+    __lsx_vst(vec7, src, 224);
+    LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    src = coeffs + 8;
+    __lsx_vst(vec0, src, 0);
+    __lsx_vst(vec1, src, 32);
+    __lsx_vst(vec2, src, 64);
+    __lsx_vst(vec3, src, 96);
+    __lsx_vst(vec4, src, 128);
+    __lsx_vst(vec5, src, 160);
+    __lsx_vst(vec6, src, 192);
+    __lsx_vst(vec7, src, 224);
+
+    src = coeffs + 136;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+              in0, in1, in2, in3);
+    DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+              in4, in5, in6, in7);
+    LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+    __lsx_vst(vec0, src, 0);
+    __lsx_vst(vec1, src, 32);
+    __lsx_vst(vec2, src, 64);
+    __lsx_vst(vec3, src, 96);
+    __lsx_vst(vec4, src, 128);
+    __lsx_vst(vec5, src, 160);
+    __lsx_vst(vec6, src, 192);
+    __lsx_vst(vec7, src, 224);
+}
+
+static void hevc_idct_8x32_column_lsx(int16_t *coeffs, int32_t buf_pitch,
+                                      uint8_t round)
+{
+    uint8_t i;
+    int32_t buf_pitch_2  = buf_pitch << 1;
+    int32_t buf_pitch_4  = buf_pitch << 2;
+    int32_t buf_pitch_8  = buf_pitch << 3;
+    int32_t buf_pitch_16 = buf_pitch << 4;
+
+    const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
+    const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
+    const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
+    const int16_t *filter_ptr3 = &gt8x8_cnst[0];
+    int16_t *src0 = (coeffs + buf_pitch);
+    int16_t *src1 = (coeffs + buf_pitch_2);
+    int16_t *src2 = (coeffs + buf_pitch_4);
+    int16_t *src3 = (coeffs);
+    int32_t tmp_buf[8 * 32 + 15];
+    int32_t *tmp_buf_ptr = tmp_buf + 15;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+    __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+    __m128i filter0, filter1, filter2, filter3;
+    __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
+
+    /* Align pointer to 64 byte boundary */
+    tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
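+    /* tmp_buf + 15 leaves 60 bytes of headroom, so rounding the (at least
+       4-byte aligned) pointer down to a 64-byte boundary never moves it
+       below tmp_buf. */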
+
+    /* process coeff 4, 12, 20, 28 */
+    in0 = __lsx_vld(src2, 0);
+    in1 = __lsx_vld(src2 + buf_pitch_8, 0);
+    in2 = __lsx_vld(src2 + buf_pitch_16, 0);
+    in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0);
+    in4 = __lsx_vld(src3, 0);
+    in5 = __lsx_vld(src3 + buf_pitch_8, 0);
+    in6 = __lsx_vld(src3 + buf_pitch_16, 0);
+    in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0);
+    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5,
+              src0_r, src1_r, src2_r, src3_r);
+    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5,
+              src0_l, src1_l, src2_l, src3_l);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr2, 0);
+    filter1 = __lsx_vldrepl_w(filter_ptr2, 4);
+    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+    __lsx_vst(sum0_r, tmp_buf_ptr, 0);
+    __lsx_vst(sum0_l, tmp_buf_ptr, 16);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr2, 8);
+    filter1 = __lsx_vldrepl_w(filter_ptr2, 12);
+    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+    __lsx_vst(sum0_r, tmp_buf_ptr, 32);
+    __lsx_vst(sum0_l, tmp_buf_ptr, 48);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr2, 16);
+    filter1 = __lsx_vldrepl_w(filter_ptr2, 20);
+    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+    __lsx_vst(sum0_r, tmp_buf_ptr, 64);
+    __lsx_vst(sum0_l, tmp_buf_ptr, 80);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr2, 24);
+    filter1 = __lsx_vldrepl_w(filter_ptr2, 28);
+    sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+    sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+    sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+    sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+    __lsx_vst(sum0_r, tmp_buf_ptr, 96);
+    __lsx_vst(sum0_l, tmp_buf_ptr, 112);
+
+    /* process coeff 0, 8, 16, 24 */
+    filter0 = __lsx_vldrepl_w(filter_ptr3, 0);
+    filter1 = __lsx_vldrepl_w(filter_ptr3, 4);
+
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+              src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+    sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+    sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+    sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+    sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+    HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7);
+    HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4);
+
+    filter0 = __lsx_vldrepl_w(filter_ptr3, 16);
+    filter1 = __lsx_vldrepl_w(filter_ptr3, 20);
+
+    DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+              src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+    sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+    sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+    sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+    sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+    HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6);
+    HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5);
+
+    /* process coeff 2 6 10 14 18 22 26 30 */
+    in0 = __lsx_vld(src1, 0);
+    in1 = __lsx_vld(src1 + buf_pitch_4, 0);
+    in2 = __lsx_vld(src1 + buf_pitch_8, 0);
+    in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0);
+    in4 = __lsx_vld(src1 + buf_pitch_16, 0);
+    in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0);
+    in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0);
+    in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0);
+
+    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src0_r, src1_r, src2_r, src3_r);
+    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src0_l, src1_l, src2_l, src3_l);
+
+    /* loop for all columns of constants */
+    for (i = 0; i < 8; i++) {
+        /* processing single column of constants */
+        filter0 = __lsx_vldrepl_w(filter_ptr1, 0);
+        filter1 = __lsx_vldrepl_w(filter_ptr1, 4);
+        filter2 = __lsx_vldrepl_w(filter_ptr1, 8);
+        filter3 = __lsx_vldrepl_w(filter_ptr1, 12);
+        sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+        sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+
+        tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0);
+        tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+        tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+        tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+        tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+        __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0);
+        __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16);
+        __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0);
+        __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16);
+
+        filter_ptr1 += 8;
+    }
+
+    /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
+    in0 = __lsx_vld(src0, 0);
+    in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+    in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+    in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+    in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+    in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+    in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+    in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+    src0 += 16 * buf_pitch;
+    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src0_r, src1_r, src2_r, src3_r);
+    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src0_l, src1_l, src2_l, src3_l);
+    in0 = __lsx_vld(src0, 0);
+    in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+    in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+    in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+    in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+    in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+    in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+    in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+    DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src4_r, src5_r, src6_r, src7_r);
+    DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+              src4_l, src5_l, src6_l, src7_l);
+
+    /* loop for all columns of filter constants */
+    for (i = 0; i < 16; i++) {
+        /* processing single column of constants */
+        filter0 = __lsx_vldrepl_w(filter_ptr0, 0);
+        filter1 = __lsx_vldrepl_w(filter_ptr0, 4);
+        filter2 = __lsx_vldrepl_w(filter_ptr0, 8);
+        filter3 = __lsx_vldrepl_w(filter_ptr0, 12);
+        sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+        sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+        tmp1_r = sum0_r;
+        tmp1_l = sum0_l;
+
+        filter0 = __lsx_vldrepl_w(filter_ptr0, 16);
+        filter1 = __lsx_vldrepl_w(filter_ptr0, 20);
+        filter2 = __lsx_vldrepl_w(filter_ptr0, 24);
+        filter3 = __lsx_vldrepl_w(filter_ptr0, 28);
+        sum0_r = __lsx_vdp2_w_h(src4_r, filter0);
+        sum0_l = __lsx_vdp2_w_h(src4_l, filter0);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2);
+        sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3);
+        sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3);
+        sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+        sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+        tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0);
+        tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16);
+        tmp1_r = tmp0_r;
+        tmp1_l = tmp0_l;
+        tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+        tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+        sum1_r = __lsx_vreplgr2vr_w(round);
+        tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r);
+        tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r);
+        in0    = __lsx_vpackev_d(tmp0_l, tmp0_r);
+        __lsx_vst(in0, (coeffs + i * buf_pitch), 0);
+        tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+        tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+        tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r);
+        tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r);
+        in0    = __lsx_vpackev_d(tmp1_l, tmp1_r);
+        __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0);
+
+        filter_ptr0 += 16;
+    }
+}
+
+static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
+{
+    uint8_t i;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (i = 0; i < 4; i++) {
+        DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128,
+                  coeffs, 192, in0, in1, in2, in3);
+        DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384,
+                  coeffs, 448, in4, in5, in6, in7);
+        coeffs += 8;
+        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        __lsx_vst(in0, tmp_buf, 0);
+        __lsx_vst(in1, tmp_buf, 16);
+        __lsx_vst(in2, tmp_buf, 32);
+        __lsx_vst(in3, tmp_buf, 48);
+        __lsx_vst(in4, tmp_buf, 64);
+        __lsx_vst(in5, tmp_buf, 80);
+        __lsx_vst(in6, tmp_buf, 96);
+        __lsx_vst(in7, tmp_buf, 112);
+        tmp_buf += 64;
+    }
+}
+
+static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
+{
+    uint8_t i;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+    for (i = 0; i < 4; i++) {
+        DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32,
+                  tmp_buf, 48, in0, in1, in2, in3);
+        DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96,
+                  tmp_buf, 112, in4, in5, in6, in7);
+        tmp_buf += 64;
+        LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+                           in0, in1, in2, in3, in4, in5, in6, in7);
+        __lsx_vst(in0, coeffs, 0);
+        __lsx_vst(in1, coeffs, 64);
+        __lsx_vst(in2, coeffs, 128);
+        __lsx_vst(in3, coeffs, 192);
+        __lsx_vst(in4, coeffs, 256);
+        __lsx_vst(in5, coeffs, 320);
+        __lsx_vst(in6, coeffs, 384);
+        __lsx_vst(in7, coeffs, 448);
+        coeffs += 8;
+    }
+}
+
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit)
+{
+    uint8_t row_cnt, col_cnt;
+    int16_t *src = coeffs;
+    int16_t tmp_buf[8 * 32 + 31];
+    int16_t *tmp_buf_ptr = tmp_buf + 31;
+    uint8_t round;
+    int32_t buf_pitch;
+
+    /* Align pointer to 64 byte boundary */
+    tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+    /* column transform */
+    round = 7;
+    buf_pitch = 32;
+    for (col_cnt = 0; col_cnt < 4; col_cnt++) {
+        /* process 8x32 blocks */
+        hevc_idct_8x32_column_lsx((coeffs + col_cnt * 8), buf_pitch, round);
+    }
+
+    /* row transform */
+    round = 12;
+    buf_pitch = 8;
+    for (row_cnt = 0; row_cnt < 4; row_cnt++) {
+        /* process 32x8 blocks */
+        src = (coeffs + 32 * 8 * row_cnt);
+
+        hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
+        hevc_idct_8x32_column_lsx(tmp_buf_ptr, buf_pitch, round);
+        hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
+    }
+}
diff --git a/libavcodec/loongarch/hevc_lpf_sao_lsx.c b/libavcodec/loongarch/hevc_lpf_sao_lsx.c
new file mode 100644
index 0000000000..fc10e8eda8
--- /dev/null
+++ b/libavcodec/loongarch/hevc_lpf_sao_lsx.c
@@ -0,0 +1,2485 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+    ptrdiff_t stride_2x = (stride << 1);
+    ptrdiff_t stride_4x = (stride << 2);
+    ptrdiff_t stride_3x = stride_2x + stride;
+    uint8_t *p3 = src - stride_4x;
+    uint8_t *p2 = src - stride_3x;
+    uint8_t *p1 = src - stride_2x;
+    uint8_t *p0 = src - stride;
+    uint8_t *q0 = src;
+    uint8_t *q1 = src + stride;
+    uint8_t *q2 = src + stride_2x;
+    uint8_t *q3 = src + stride_3x;
+    uint8_t flag0, flag1;
+    int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
+    int32_t dp04, dq04, dp34, dq34, d04, d34;
+    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+    __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
+    __m128i temp0, temp1;
+    __m128i temp2, tc_pos, tc_neg;
+    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
+    __m128i zero = {0};
+    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
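+    /* HEVC deblocking decision variables: dpX = |p2 - 2*p1 + p0| and
+       dqX = |q2 - 2*q1 + q0|, sampled at columns 0/3 of the first 4-pixel
+       segment and 4/7 of the second; a segment is left unfiltered when
+       its d0 + d3 >= beta. */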
+    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
+    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
+    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
+    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
+    d00 = dp00 + dq00;
+    d30 = dp30 + dq30;
+    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
+    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
+    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
+    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
+    d04 = dp04 + dq04;
+    d34 = dp34 + dq34;
+
+    p_is_pcm0 = p_is_pcm[0];
+    p_is_pcm4 = p_is_pcm[1];
+    q_is_pcm0 = q_is_pcm[0];
+    q_is_pcm4 = q_is_pcm[1];
+
+    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
+    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+    d0030 = (d00 + d30) >= beta;
+    d0434 = (d04 + d34) >= beta;
+    DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
+    cmp3 = __lsx_vpackev_w(cmp1, cmp0);
+    cmp3 = __lsx_vseqi_w(cmp3, 0);
+
+    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+        (!d0030 || !d0434)) {
+        DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
+                  p3_src, p2_src, p1_src, p0_src);
+        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
+        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+        tc0 = tc[0];
+        beta30 = beta >> 3;
+        beta20 = beta >> 2;
+        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
+        tc4 = tc[1];
+        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
+
+        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
+        DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
+                  p0_src, p3_src, p2_src, p1_src, p0_src);
+        DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
+                  q0_src, q1_src, q2_src, q3_src);
+        flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+                abs(p0[0] - q0[0]) < tc250;
+        flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+                abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
+                (d30 << 1) < beta20);
+        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+        DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
+                  zero, q3_src, q0_src, q1_src, q2_src, q3_src);
+
+        flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+                abs(p0[4] - q0[4]) < tc254;
+        flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+                abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
+                (d34 << 1) < beta20);
+        DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
+        cmp2 = __lsx_vpackev_w(cmp1, cmp0);
+        cmp2 = __lsx_vseqi_w(cmp2, 0);
+
+        if (flag0 && flag1) { /* strong only */
+            /* strong filter */
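+            /* Per the HEVC spec, with each output clipped to within
+             * +-2*tc of its input sample:
+             *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+             *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
+             *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+             * and mirrored for the q side. */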
+            tc_pos = __lsx_vslli_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            /* p part */
+            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+                      temp0, temp0);
+            temp1 = __lsx_vadd_h(p3_src, p2_src);
+            temp1 = __lsx_vslli_h(temp1, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, p2_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst0 = __lsx_vadd_h(temp2, p2_src);
+
+            temp1 = __lsx_vadd_h(temp0, p2_src);
+            temp1 = __lsx_vsrari_h(temp1, 2);
+            temp2 = __lsx_vsub_h(temp1, p1_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst1 = __lsx_vadd_h(temp2, p1_src);
+
+            temp1 = __lsx_vslli_h(temp0, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
+                      temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, p0_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst2 = __lsx_vadd_h(temp2, p0_src);
+
+            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
+                      p1_src, p_is_pcm_vec, dst0, dst1);
+            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+            /* q part */
+            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+                      temp0, temp0);
+            temp1 = __lsx_vadd_h(q3_src, q2_src);
+            temp1 = __lsx_vslli_h(temp1, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, q2_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst5 = __lsx_vadd_h(temp2, q2_src);
+
+            temp1 = __lsx_vadd_h(temp0, q2_src);
+            temp1 = __lsx_vsrari_h(temp1, 2);
+            temp2 = __lsx_vsub_h(temp1, q1_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst4 = __lsx_vadd_h(temp2, q1_src);
+
+            temp0 = __lsx_vslli_h(temp0, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
+                      temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, q0_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst3 = __lsx_vadd_h(temp2, q0_src);
+
+            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
+                      q1_src, q_is_pcm_vec, dst3, dst4);
+            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+
+            /* pack results to 8 bit */
+            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+            dst2 = __lsx_vpickev_b(dst5, dst4);
+
+            /* pack src to 8 bit */
+            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
+                      dst3, dst4);
+            dst5 = __lsx_vpickev_b(q2_src, q1_src);
+
+            cmp3 = __lsx_vnor_v(cmp3, cmp3);
+            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
+                      dst0, dst1);
+            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
+
+            __lsx_vstelm_d(dst0, p2, 0, 0);
+            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
+            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
+            /* strong filter ends */
+        } else if (flag0 == flag1) { /* weak only */
+            /* weak filter */
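+            /* delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4, built from
+             * shifts and adds; filtering is suppressed in lanes where
+             * abs(delta0) >= 10*tc (temp1 below is 8*tc + 2*tc). */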
+            tc_neg = __lsx_vneg_h(tc_pos);
+            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+                      diff0, diff1);
+            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+            delta0 = __lsx_vsub_h(diff0, diff1);
+            delta0 = __lsx_vsrari_h(delta0, 4);
+            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+                                 __lsx_vslli_h(tc_pos, 1));
+            abs_delta0 = __lsx_vadda_h(delta0, zero);
+            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+            temp2 = __lsx_vadd_h(delta0, p0_src);
+            temp2 = __lsx_vclip255_h(temp2);
+            temp0 = __lsx_vbitsel_v(temp2, p0_src,
+                                    __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
+            temp2 = __lsx_vsub_h(q0_src, delta0);
+            temp2 = __lsx_vclip255_h(temp2);
+            temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
+                                    q_is_pcm_vec));
+            DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
+                      q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
+
+            tmp = (beta + (beta >> 1)) >> 3;
+            DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
+                      cmp0, cmp1);
+            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+            cmp0 = __lsx_vseqi_d(cmp0, 0);
+            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
+
+            DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
+                      cmp0, cmp1);
+            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+            cmp0 = __lsx_vseqi_d(cmp0, 0);
+            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
+            tc_pos = __lsx_vsrai_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
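+            /* p1/q1 correction: delta_p = (((p2 + p0 + 1) >> 1) - p1
+             * + delta0) >> 1, clipped to +-(tc >> 1); the q side is
+             * symmetric with -delta0. */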
+            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+                      delta1, delta2);
+            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+                      delta1, delta2);
+            delta1 = __lsx_vadd_h(delta1, delta0);
+            delta2 = __lsx_vsub_h(delta2, delta0);
+            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
+                      tc_neg, tc_pos, delta1, delta2);
+            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+                      delta1, delta2);
+            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+                      q1_src, q_is_pcm_vec, delta1, delta2);
+
+            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
+                      p0_src,  abs_delta0, temp2, q0_src, abs_delta0, delta2,
+                      q1_src, abs_delta0, dst1, dst2, dst3, dst4);
+            /* pack results to 8 bit */
+            DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
+            /* pack src to 8 bit */
+            DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
+                      dst2, dst3);
+            cmp3 = __lsx_vnor_v(cmp3, cmp3);
+            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
+                      dst0, dst1);
+
+            p2 += stride;
+            __lsx_vstelm_d(dst0, p2, 0, 0);
+            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+            /* weak filter ends */
+        } else { /* strong + weak */
+            /* strong filter */
+            tc_pos = __lsx_vslli_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            /* p part */
+            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+                      temp0, temp0);
+            temp1 = __lsx_vadd_h(p3_src, p2_src);
+            temp1 = __lsx_vslli_h(temp1, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, p2_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst0 = __lsx_vadd_h(temp2, p2_src);
+
+            temp1 = __lsx_vadd_h(temp0, p2_src);
+            temp1 = __lsx_vsrari_h(temp1, 2);
+            temp2 = __lsx_vsub_h(temp1, p1_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst1 = __lsx_vadd_h(temp2, p1_src);
+
+            temp1 = __lsx_vslli_h(temp0, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, p0_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst2 = __lsx_vadd_h(temp2, p0_src);
+
+            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
+                      p1_src, p_is_pcm_vec, dst0, dst1);
+            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+            /* q part */
+            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+                      temp0, temp0);
+            temp1 = __lsx_vadd_h(q3_src, q2_src);
+            temp1 = __lsx_vslli_h(temp1, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, q2_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst5 = __lsx_vadd_h(temp2, q2_src);
+
+            temp1 = __lsx_vadd_h(temp0, q2_src);
+            temp1 = __lsx_vsrari_h(temp1, 2);
+            temp2 = __lsx_vsub_h(temp1, q1_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst4 = __lsx_vadd_h(temp2, q1_src);
+
+            temp1 = __lsx_vslli_h(temp0, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, q0_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst3 = __lsx_vadd_h(temp2, q0_src);
+
+            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
+                      q1_src, q_is_pcm_vec, dst3, dst4);
+            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+
+            /* pack strong results to 8 bit */
+            DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+            dst2 = __lsx_vpickev_b(dst5, dst4);
+            /* strong filter ends */
+
+            /* weak filter */
+            tc_pos = __lsx_vsrai_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+                      diff0, diff1);
+            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+            delta0 = __lsx_vsub_h(diff0, diff1);
+            delta0 = __lsx_vsrari_h(delta0, 4);
+            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+                                 __lsx_vslli_h(tc_pos, 1));
+            abs_delta0 = __lsx_vadda_h(delta0, zero);
+            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+            temp2 = __lsx_vadd_h(delta0, p0_src);
+            temp2 = __lsx_vclip255_h(temp2);
+            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+
+            temp2 = __lsx_vsub_h(q0_src, delta0);
+            temp2 = __lsx_vclip255_h(temp2);
+            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+            tmp = (beta + (beta >> 1)) >> 3;
+            DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
+                      cmp0, cmp1);
+            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+            p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
+            DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
+                      cmp0, cmp1);
+            cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+            q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
+
+            tc_pos = __lsx_vsrai_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+                      delta1, delta2);
+            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+                      delta1, delta2);
+            delta1 = __lsx_vadd_h(delta1, delta0);
+            delta2 = __lsx_vsub_h(delta2, delta0);
+            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+                      tc_pos, delta1, delta2);
+            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+                      delta1, delta2);
+            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+                      q1_src, q_is_pcm_vec, delta1, delta2);
+            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
+                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
+                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
+            /* weak filter ends */
+
+            /* pack weak results to 8 bit */
+            DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
+                      dst3, dst4);
+            dst5 = __lsx_vpickev_b(q2_src, delta2);
+
+            /* select between weak or strong */
+            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
+                      dst0, dst1);
+            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
+
+            /* pack src to 8 bit */
+            DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
+                      dst3, dst4);
+            dst5 = __lsx_vpickev_b(q2_src, q1_src);
+
+            cmp3 = __lsx_vnor_v(cmp3, cmp3);
+            DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
+                      dst0, dst1);
+            dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
+
+            __lsx_vstelm_d(dst0, p2, 0, 0);
+            __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+            __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+            __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+            __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
+            __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
+        }
+    }
+}
+
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+    ptrdiff_t stride_2x = (stride << 1);
+    ptrdiff_t stride_4x = (stride << 2);
+    ptrdiff_t stride_3x = stride_2x + stride;
+    uint8_t *p3 = src;
+    uint8_t *p2 = src + stride_3x;
+    uint8_t *p1 = src + stride_4x;
+    uint8_t *p0 = src + stride_4x + stride_3x;
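+    /* p3..p0 here are row pointers: rows 0, 3, 4 and 7 carry the decision
+     * samples for the two 4-line halves of this vertical edge. */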
+    uint8_t flag0, flag1;
+    int32_t dp00, dq00, dp30, dq30, d00, d30;
+    int32_t d0030, d0434;
+    int32_t dp04, dq04, dp34, dq34, d04, d34;
+    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+    __m128i cmp3;
+    __m128i temp0, temp1;
+    __m128i temp2;
+    __m128i tc_pos, tc_neg;
+    __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
+    __m128i zero = {0};
+    __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
+    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
+    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
+    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
+    d00 = dp00 + dq00;
+    d30 = dp30 + dq30;
+    p_is_pcm0 = p_is_pcm[0];
+    q_is_pcm0 = q_is_pcm[0];
+
+    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
+    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
+    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
+    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
+    d04 = dp04 + dq04;
+    d34 = dp34 + dq34;
+    p_is_pcm4 = p_is_pcm[1];
+    q_is_pcm4 = q_is_pcm[1];
+
+    DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
+    p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+    p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+    d0030 = (d00 + d30) >= beta;
+    d0434 = (d04 + d34) >= beta;
+
+    DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
+    cmp3 = __lsx_vpackev_d(cmp1, cmp0);
+    cmp3 = __lsx_vseqi_d(cmp3, 0);
+
+    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+        (!d0030 || !d0434)) {
+        src -= 4;
+        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+                  src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
+        src += stride_4x;
+        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+                  src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
+        src -= stride_4x;
+
+        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
+        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+        tc0 = tc[0];
+        beta30 = beta >> 3;
+        beta20 = beta >> 2;
+        tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
+        tc4 = tc[1];
+        tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
+        DUP2_ARG1(__lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
+        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+        LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+                           q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
+                           q0_src, q1_src, q2_src, q3_src);
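+        /* Transpose the 8x8 block so the vertical edge becomes horizontal;
+         * the arithmetic below then matches the _h variant, and results are
+         * transposed back before the element stores. */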
+
+        flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
+                abs(p3[-1] - p3[0]) < tc250;
+        flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
+                abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
+                (d30 << 1) < beta20);
+        cmp0 = __lsx_vreplgr2vr_d(flag0);
+        DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
+                  p0_src, p3_src, p2_src, p1_src, p0_src);
+
+        flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
+                abs(p1[-1] - p1[0]) < tc254;
+        flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
+                abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
+                (d34 << 1) < beta20);
+        DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
+                  q3_src, q0_src, q1_src, q2_src, q3_src);
+
+        cmp1 = __lsx_vreplgr2vr_d(flag1);
+        cmp2 = __lsx_vpackev_d(cmp1, cmp0);
+        cmp2 = __lsx_vseqi_d(cmp2, 0);
+
+        if (flag0 && flag1) { /* strong only */
+            /* strong filter */
+            tc_neg = __lsx_vneg_h(tc_pos);
+            /* p part */
+            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+                      temp0, temp0);
+            temp1 = __lsx_vadd_h(p3_src, p2_src);
+            temp1 = __lsx_vslli_h(temp1, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, p2_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst0 = __lsx_vadd_h(temp2, p2_src);
+
+            temp1 = __lsx_vadd_h(temp0, p2_src);
+            temp1 = __lsx_vsrari_h(temp1, 2);
+            temp2 = __lsx_vsub_h(temp1, p1_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst1 = __lsx_vadd_h(temp2, p1_src);
+
+            temp1 = __lsx_vslli_h(temp0, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, p0_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst2 = __lsx_vadd_h(temp2, p0_src);
+
+            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
+                      p_is_pcm_vec, dst0, dst1);
+            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+            /* q part */
+            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+                      temp0, temp0);
+            temp1 = __lsx_vadd_h(q3_src, q2_src);
+            temp1 = __lsx_vslli_h(temp1, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, q2_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst5 = __lsx_vadd_h(temp2, q2_src);
+
+            temp1 = __lsx_vadd_h(temp0, q2_src);
+            temp1 = __lsx_vsrari_h(temp1, 2);
+            temp2 = __lsx_vsub_h(temp1, q1_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst4 = __lsx_vadd_h(temp2, q1_src);
+
+            temp1 = __lsx_vslli_h(temp0, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, q0_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst3 = __lsx_vadd_h(temp2, q0_src);
+
+            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
+                      q_is_pcm_vec, dst3, dst4);
+            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+            /* strong filter ends */
+        } else if (flag0 == flag1) { /* weak only */
+            /* weak filter */
+            tc_pos = __lsx_vsrai_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+                      diff0, diff1);
+            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+            delta0 = __lsx_vsub_h(diff0, diff1);
+            delta0 = __lsx_vsrari_h(delta0, 4);
+            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+                                 __lsx_vslli_h(tc_pos, 1));
+            abs_delta0 = __lsx_vadda_h(delta0, zero);
+            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+            temp2 = __lsx_vadd_h(delta0, p0_src);
+            temp2 = __lsx_vclip255_h(temp2);
+            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+
+            temp2 = __lsx_vsub_h(q0_src, delta0);
+            temp2 = __lsx_vclip255_h(temp2);
+            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+            tmp = ((beta + (beta >> 1)) >> 3);
+            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
+                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
+            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
+                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
+            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+            tc_pos = __lsx_vsrai_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+                      delta1, delta2);
+            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+                      delta1, delta2);
+            delta1 = __lsx_vadd_h(delta1, delta0);
+            delta2 = __lsx_vsub_h(delta2, delta0);
+            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+                      tc_pos, delta1, delta2);
+            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+                      delta1, delta2);
+            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+                      q1_src, q_is_pcm_vec, delta1, delta2);
+
+            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
+                      p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
+                      q1_src, abs_delta0, dst0, dst1, dst2, dst3);
+            /* weak filter ends */
+
+            cmp3 = __lsx_vnor_v(cmp3, cmp3);
+            DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
+                      cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
+                      dst0, dst1, dst2, dst3);
+            DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
+
+            /* transpose */
+            dst4 = __lsx_vilvl_b(dst1, dst0);
+            dst5 = __lsx_vilvh_b(dst1, dst0);
+            dst0 = __lsx_vilvl_h(dst5, dst4);
+            dst1 = __lsx_vilvh_h(dst5, dst4);
+
+            src += 2;
+            __lsx_vstelm_w(dst0, src, 0, 0);
+            __lsx_vstelm_w(dst0, src + stride, 0, 1);
+            __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
+            __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
+            src += stride_4x;
+            __lsx_vstelm_w(dst1, src, 0, 0);
+            __lsx_vstelm_w(dst1, src + stride, 0, 1);
+            __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
+            __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
+            return;
+        } else { /* strong + weak */
+            /* strong filter */
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            /* p part */
+            DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+                      temp0, temp0);
+
+            temp1 = __lsx_vadd_h(p3_src, p2_src);
+            temp1 = __lsx_vslli_h(temp1, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, p2_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst0 = __lsx_vadd_h(temp2, p2_src);
+
+            temp1 = __lsx_vadd_h(temp0, p2_src);
+            temp1 = __lsx_vsrari_h(temp1, 2);
+            temp2 = __lsx_vsub_h(temp1, p1_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst1 = __lsx_vadd_h(temp2, p1_src);
+
+            temp1 = __lsx_vslli_h(temp0, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, p0_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst2 = __lsx_vadd_h(temp2, p0_src);
+
+            p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+            DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
+                      p_is_pcm_vec, dst0, dst1);
+            dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+            /* q part */
+            DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+                      temp0, temp0);
+            temp1 = __lsx_vadd_h(q3_src, q2_src);
+            temp1 = __lsx_vslli_h(temp1, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, q2_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst5 = __lsx_vadd_h(temp2, q2_src);
+
+            temp1 = __lsx_vadd_h(temp0, q2_src);
+            temp1 = __lsx_vsrari_h(temp1, 2);
+            temp2 = __lsx_vsub_h(temp1, q1_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst4 = __lsx_vadd_h(temp2, q1_src);
+
+            temp1 = __lsx_vslli_h(temp0, 1);
+            DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+            temp1 = __lsx_vsrari_h(temp1, 3);
+            temp2 = __lsx_vsub_h(temp1, q0_src);
+            temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+            dst3 = __lsx_vadd_h(temp2, q0_src);
+
+            q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+            DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
+                      q_is_pcm_vec, dst3, dst4);
+            dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+            /* strong filter ends */
+
+            /* weak filter */
+            tc_pos = __lsx_vsrai_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+                      diff0, diff1);
+            DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+                      __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+            delta0 = __lsx_vsub_h(diff0, diff1);
+            delta0 = __lsx_vsrari_h(delta0, 4);
+
+            temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+                    __lsx_vslli_h(tc_pos, 1));
+            abs_delta0 = __lsx_vadda_h(delta0, zero);
+            abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+            delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+            temp2 = __lsx_vadd_h(delta0, p0_src);
+            temp2 = __lsx_vclip255_h(temp2);
+            temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+            temp2 = __lsx_vsub_h(q0_src, delta0);
+            temp2 = __lsx_vclip255_h(temp2);
+            temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+            tmp = (beta + (beta >> 1)) >> 3;
+            DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
+                      !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
+            p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+            p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+            DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
+                      (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
+            q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+            q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+            tc_pos = __lsx_vsrai_h(tc_pos, 1);
+            tc_neg = __lsx_vneg_h(tc_pos);
+
+            DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+                      delta1, delta2);
+            DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+                      delta1, delta2);
+            delta1 = __lsx_vadd_h(delta1, delta0);
+            delta2 = __lsx_vsub_h(delta2, delta0);
+            DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+            DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+                      tc_pos, delta1, delta2);
+            DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+                      delta1, delta2);
+            DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+            DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+                      q1_src, q_is_pcm_vec, delta1, delta2);
+
+            abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+            DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
+                      q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
+                      q0_src, abs_delta0, delta1, delta2, temp0, temp2);
+            /* weak filter ends */
+
+            /* select between weak or strong */
+            DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
+                      cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
+                      dst0, dst1, dst2, dst3);
+            DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
+                      dst4, dst5);
+        }
+
+        cmp3 = __lsx_vnor_v(cmp3, cmp3);
+        DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
+                  p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
+                  dst4, dst5);
+
+        /* pack results to 8 bit */
+        DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
+                  dst5, dst0, dst1, dst2, dst3);
+
+        /* transpose */
+        DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
+        DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
+        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
+        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
+
+        src += 1;
+        __lsx_vstelm_w(dst0, src, 0, 0);
+        __lsx_vstelm_h(dst2, src, 4, 0);
+        src += stride;
+        __lsx_vstelm_w(dst0, src, 0, 1);
+        __lsx_vstelm_h(dst2, src, 4, 2);
+        src += stride;
+
+        __lsx_vstelm_w(dst0, src, 0, 2);
+        __lsx_vstelm_h(dst2, src, 4, 4);
+        src += stride;
+        __lsx_vstelm_w(dst0, src, 0, 3);
+        __lsx_vstelm_h(dst2, src, 4, 6);
+        src += stride;
+
+        __lsx_vstelm_w(dst1, src, 0, 0);
+        __lsx_vstelm_h(dst3, src, 4, 0);
+        src += stride;
+        __lsx_vstelm_w(dst1, src, 0, 1);
+        __lsx_vstelm_h(dst3, src, 4, 2);
+        src += stride;
+
+        __lsx_vstelm_w(dst1, src, 0, 2);
+        __lsx_vstelm_h(dst3, src, 4, 4);
+        src += stride;
+        __lsx_vstelm_w(dst1, src, 0, 3);
+        __lsx_vstelm_h(dst3, src, 4, 6);
+    }
+}
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        int32_t *tc, uint8_t *p_is_pcm,
+                                        uint8_t *q_is_pcm)
+{
+    uint8_t *p1_ptr = src - (stride << 1);
+    uint8_t *p0_ptr = src - stride;
+    uint8_t *q0_ptr = src;
+    uint8_t *q1_ptr = src + stride;
+    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+    __m128i p1, p0, q0, q1;
+    __m128i tc_pos, tc_neg;
+    __m128i zero = {0};
+    __m128i temp0, temp1, delta;
+
+    if (tc[0] > 0 || tc[1] > 0) {
+        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
+        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+        tc_neg = __lsx_vneg_h(tc_pos);
+        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
+        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
+        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+        DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
+                  p1, p0, q0, q1);
+        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
+                  p1, p0, q0, q1);
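+        /* Chroma weak filter:
+         * delta = clip3(-tc, tc, (((q0 - p0) << 2) + p1 - q1 + 4) >> 3) */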
+        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
+        temp0 = __lsx_vslli_h(temp0, 2);
+        temp0 = __lsx_vadd_h(temp0, temp1);
+        delta = __lsx_vsrari_h(temp0, 3);
+        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
+        temp0 = __lsx_vadd_h(p0, delta);
+        temp0 = __lsx_vclip255_h(temp0);
+        p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+        temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);
+
+        temp1 = __lsx_vsub_h(q0, delta);
+        temp1 = __lsx_vclip255_h(temp1);
+        q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+        temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);
+
+        tc_pos = __lsx_vslei_d(tc_pos, 0);
+        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
+                  temp0, temp1);
+        temp0 = __lsx_vpickev_b(temp1, temp0);
+        __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
+        __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
+    }
+}
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        int32_t *tc, uint8_t *p_is_pcm,
+                                        uint8_t *q_is_pcm)
+{
+    ptrdiff_t stride_2x = (stride << 1);
+    ptrdiff_t stride_4x = (stride << 2);
+    ptrdiff_t stride_3x = stride_2x + stride;
+    __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i p1, p0, q0, q1;
+    __m128i tc_pos, tc_neg;
+    __m128i zero = {0};
+    __m128i temp0, temp1, delta;
+
+    if (tc[0] > 0 || tc[1] > 0) {
+        DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
+        tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+        tc_neg = __lsx_vneg_h(tc_pos);
+
+        DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
+        p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+        p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+        DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
+        q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+        q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+        src -= 2;
+        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+                  src + stride_3x, 0, src0, src1, src2, src3);
+        src += stride_4x;
+        DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+                  src + stride_3x, 0, src4, src5, src6, src7);
+        src -= stride_4x;
+        LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
+                           p1, p0, q0, q1);
+        DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
+                  p1, p0, q0, q1);
+
+        DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
+        temp0 = __lsx_vslli_h(temp0, 2);
+        temp0 = __lsx_vadd_h(temp0, temp1);
+        delta = __lsx_vsrari_h(temp0, 3);
+        delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
+
+        temp0 = __lsx_vadd_h(p0, delta);
+        temp1 = __lsx_vsub_h(q0, delta);
+        DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
+        DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
+                  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
+        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
+                  q_is_pcm_vec, temp0, temp1);
+
+        tc_pos = __lsx_vslei_d(tc_pos, 0);
+        DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
+                  temp0, temp1);
+        temp0 = __lsx_vpackev_b(temp1, temp0);
+
+        src += 1;
+        __lsx_vstelm_h(temp0, src, 0, 0);
+        __lsx_vstelm_h(temp0, src + stride, 0, 1);
+        __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
+        __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
+        src += stride_4x;
+        __lsx_vstelm_h(temp0, src, 0, 4);
+        __lsx_vstelm_h(temp0, src + stride, 0, 5);
+        __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
+        __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
+        src -= stride_4x;
+    }
+}
+
+static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    uint8_t *src,
+                                                    int32_t src_stride,
+                                                    int16_t *sao_offset_val,
+                                                    int32_t height)
+{
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
+    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
+    __m128i const1 = __lsx_vldi(1);
+    __m128i zero = {0};
+
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+    src -= 1;
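+    /* Horizontal (0-degree) edge-offset class: each pixel is compared with
+     * its left and right neighbours; the two sign results (-1/0/+1) plus 2
+     * give an edge index in [0, 4], remapped through edge_idx and used to
+     * pick a byte offset from sao_offset (packed from 16 bits above).  The
+     * XOR with 128 biases unsigned pixels to signed range so that
+     * __lsx_vsadd_b saturates correctly. */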
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
+
+    for (height -= 2; height; height -= 2) {
+        src += src_stride_2x;
+        src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
+        src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
+        src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
+
+        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                  diff_minus11, const1, cmp_minus11, diff_minus10,
+                  diff_minus11);
+
+        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+        offset = __lsx_vaddi_bu(offset, 2);
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
+                  src_minus10, src_minus11);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
+                  sao_offset, sao_offset, offset, offset, offset);
+        src0 = __lsx_vxori_b(src0, 128);
+        dst0 = __lsx_vsadd_b(src0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+
+        __lsx_vstelm_w(dst0, dst, 0, 0);
+        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+        dst += dst_stride_2x;
+    }
+
+    src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
+    src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
+    src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
+
+    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
+              cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
+              cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+    offset = __lsx_vaddi_bu(offset, 2);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
+              offset, offset, offset);
+    src0 = __lsx_vxori_b(src0, 128);
+    dst0 = __lsx_vsadd_b(src0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
+static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    uint8_t *src,
+                                                    int32_t src_stride,
+                                                    int16_t *sao_offset_val,
+                                                    int32_t height)
+{
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
+    __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
+    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i zeros = {0};
+
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+    src -= 1;
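+    /* Same classification as the 4-width case, with two 8-pixel rows
+     * packed into each vector per iteration. */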
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
+
+    for (height -= 2; height; height -= 2) {
+        src += src_stride_2x;
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
+                  src_minus11, shuf1, src0, src1);
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
+                  src_minus11, shuf2, src_plus10, src_plus11);
+        DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
+                  src_plus10, src_minus10, src_plus10);
+        src0 = __lsx_vpickev_d(src1, src0);
+
+        DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                  diff_minus11, const1, cmp_minus11, diff_minus10,
+                  diff_minus11);
+
+        offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+        offset = __lsx_vaddi_bu(offset, 2);
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
+                  src_minus10, src_minus11);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+        src0 = __lsx_vxori_b(src0, 128);
+        dst0 = __lsx_vsadd_b(src0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+
+        __lsx_vstelm_d(dst0, dst, 0, 0);
+        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+        dst += dst_stride_2x;
+    }
+
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
+              shuf1, src0, src1);
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+              shuf2, src_plus10, src_plus11);
+    DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
+              src_plus10, src_minus10, src_plus10);
+    src0 = __lsx_vpickev_d(src1, src0);
+
+    DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
+              cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
+              cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+    offset = __lsx_vaddi_bu(offset, 2);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    src0 = __lsx_vxori_b(src0, 128);
+    dst0 = __lsx_vsadd_b(src0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
+static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
+                                                        int32_t dst_stride,
+                                                        uint8_t *src,
+                                                        int32_t src_stride,
+                                                        int16_t *sao_offset_val,
+                                                        int32_t width,
+                                                        int32_t height)
+{
+    uint8_t *dst_ptr, *src_minus1;
+    int32_t v_cnt;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i sao_offset;
+    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    __m128i diff_plus13;
+    __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
+    __m128i src_minus10, src_minus11, src_minus12, src_minus13;
+    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+    __m128i src_zero0, src_zero1, src_zero2, src_zero3;
+    __m128i src_plus10, src_plus11, src_plus12, src_plus13;
+
+    sao_offset = __lsx_vld(sao_offset_val, 0);
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
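+    /* Process the block in 4-row strips, 16 pixels per column step; the
+     * src_minus1x vectors hold the bytes starting at column -1 so each
+     * lane's left neighbour is available without a second load. */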
+    for (; height; height -= 4) {
+        src_minus1 = src - 1;
+        src_minus10 = __lsx_vld(src_minus1, 0);
+        DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
+                  src_stride_2x, src_minus11, src_minus12);
+        src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
+
+        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+            src_minus1 += 16;
+            dst_ptr = dst + v_cnt;
+            src10 = __lsx_vld(src_minus1, 0);
+            DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
+                      src_stride_2x, src11, src12);
+            src13 = __lsx_vldx(src_minus1, src_stride_3x);
+            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
+                      src_minus11, shuf1, src12, src_minus12, shuf1, src13,
+                      src_minus13, shuf1, src_zero0, src_zero1,
+                      src_zero2, src_zero3);
+            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
+                      src_minus11, shuf2, src12, src_minus12, shuf2, src13,
+                      src_minus13, shuf2, src_plus10, src_plus11,
+                      src_plus12, src_plus13);
+            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+                      diff_plus11);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+                      diff_plus13);
+            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+                      cmp_plus11);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+                      cmp_plus13);
+            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+                      cmp_minus11, diff_plus11, const1, cmp_plus11,
+                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+                      cmp_minus13, diff_plus13, const1, cmp_plus13,
+                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+                      offset_mask3);
+            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
+                      offset_mask1, offset_mask2, offset_mask3);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+                      sao_offset, sao_offset, offset_mask0, offset_mask0,
+                      offset_mask0);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+                      sao_offset, sao_offset, offset_mask1, offset_mask1,
+                      offset_mask1);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+                      sao_offset, sao_offset, offset_mask2, offset_mask2,
+                      offset_mask2);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+                      sao_offset, sao_offset, offset_mask3, offset_mask3,
+                      offset_mask3);
+
+            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
+                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
+                      src_zero2, src_zero3);
+            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+                      offset_mask1, src_zero2, offset_mask2, src_zero3,
+                      offset_mask3, dst0, dst1, dst2, dst3);
+            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+                      128, dst0, dst1, dst2, dst3);
+
+            src_minus10 = src10;
+            src_minus11 = src11;
+            src_minus12 = src12;
+            src_minus13 = src13;
+
+            __lsx_vst(dst0, dst_ptr, 0);
+            __lsx_vst(dst1, dst_ptr + dst_stride, 0);
+            __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
+            __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
+        }
+        src += src_stride_4x;
+        dst += dst_stride_4x;
+    }
+}
+
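+/* Vertical (90-degree) SAO edge filter for 4-pixel-wide columns: each
+ * pixel is compared with its neighbours in the rows directly above and
+ * below; two rows are processed per iteration. */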
+static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
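+    /* Byte table {1, 2, 0, 3, 4}: maps the biased sign sum (0..4) to the
+     * HEVC SAO edge-offset index. */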
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i dst0;
+    __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    __m128i src_minus10, src_minus11, src10, src11;
+    __m128i src_zero0, src_zero1;
+    __m128i offset;
+    __m128i offset_mask0, offset_mask1;
+
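+    /* Pack the 16-bit SAO offset table into bytes for __lsx_vshuf_b lookup. */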
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+    /* load in advance */
+    DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
+              src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src += src_stride_2x;
+        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
+                  src_minus11, src_zero1);
+        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
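+        /* diff_minus10/diff_minus11 now hold sign(center - neighbour) per
+         * byte (-1, 0 or +1); each interleaved pair is summed and biased
+         * by 2 to give the 0..4 lookup index for edge_idx and sao_offset. */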
+        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+                  diff_minus11, offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+                  offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+                  src_zero0, offset, dst0);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+
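+        /* Bias pixels to signed range so __lsx_vsadd_b saturates correctly,
+         * add the per-pixel offsets, then remove the bias. */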
+        dst0 = __lsx_vxori_b(dst0, 128);
+        dst0 = __lsx_vsadd_b(dst0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src10, src11);
+
+        __lsx_vstelm_w(dst0, dst, 0, 0);
+        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+        dst += dst_stride_2x;
+    }
+
+    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+              src11, src_minus11, src10, src10, src_minus10, src_zero0,
+              src_minus11, src_zero1);
+    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+              diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+              offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+              src_zero0, offset, dst0);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+    dst0 = __lsx_vsadd_b(dst0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
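+/* Same vertical edge filter for 8-pixel-wide columns. */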
+static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i src_zero0, src_zero1, dst0;
+    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    __m128i src_minus10, src_minus11, src10, src11;
+    __m128i offset_mask0, offset_mask1;
+
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src += src_stride_2x;
+        DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+                  src11, src_minus11, src10, src10, src_minus10, src_zero0,
+                  src_minus11, src_zero1);
+        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+                  diff_minus11, offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+                  offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+                  src_zero0, offset, dst0);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+
+        dst0 = __lsx_vxori_b(dst0, 128);
+        dst0 = __lsx_vsadd_b(dst0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                  src10, src11);
+
+        __lsx_vstelm_d(dst0, dst, 0, 0);
+        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+        dst += dst_stride_2x;
+    }
+
+    DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+              src11, src_minus11, src10, src10, src_minus10, src_zero0,
+              src_minus11, src_zero1);
+    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+              diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+              offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+              src_zero0, offset, dst0);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+    dst0 = __lsx_vsadd_b(dst0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
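+/* Vertical edge filter for widths that are multiples of 16, processing a
+ * 16-column strip four rows at a time. */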
+static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
+                                                         int32_t dst_stride,
+                                                         uint8_t *src,
+                                                         int32_t src_stride,
+                                                         int16_t *sao_offset_val,
+                                                         int32_t width,
+                                                         int32_t height)
+{
+    uint8_t *src_orig = src;
+    uint8_t *dst_orig = dst;
+    int32_t h_cnt, v_cnt;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    __m128i diff_plus13;
+    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
+    __m128i src12, dst2, src13, dst3;
+    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+    sao_offset = __lsx_vld(sao_offset_val, 0);
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+        src = src_orig + v_cnt;
+        dst = dst_orig + v_cnt;
+
+        DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
+                  src_minus10, src_minus11);
+
+        for (h_cnt = (height >> 2); h_cnt--;) {
+            DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+                      src, src_stride_3x, src, src_stride_4x,
+                      src10, src11, src12, src13);
+            DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
+                      src10, src10, src_minus11, src10, src11, cmp_minus10,
+                      cmp_plus10, cmp_minus11, cmp_plus11);
+            DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
+                      src12, src13, cmp_minus12, cmp_plus12,
+                      cmp_minus13, cmp_plus13);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+                      diff_plus11);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+                      diff_plus13);
+            DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
+                      src10, src10, src_minus11, src10, src11, cmp_minus10,
+                      cmp_plus10, cmp_minus11, cmp_plus11);
+            DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
+                      src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
+                      cmp_plus13);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+                      cmp_plus11);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+                      cmp_plus13);
+            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+                      cmp_minus11, diff_plus11, const1, cmp_plus11,
+                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+                      cmp_minus13, diff_plus13, const1, cmp_plus13,
+                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+                      offset_mask3);
+            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
+                      offset_mask1, offset_mask2, offset_mask3);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+                      sao_offset, sao_offset, offset_mask0,
+                      offset_mask0, offset_mask0);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+                      sao_offset, sao_offset, offset_mask1, offset_mask1,
+                      offset_mask1);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+                      sao_offset, sao_offset, offset_mask2, offset_mask2,
+                      offset_mask2);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+                      sao_offset, sao_offset, offset_mask3, offset_mask3,
+                      offset_mask3);
+
+            src_minus10 = src12;
+            DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
+                      src12, 128, src_minus11, src10, src11, src12);
+            DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
+                      offset_mask1, src11, offset_mask2, src12,
+                      offset_mask3, dst0, dst1, dst2, dst3);
+            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+                      128, dst0, dst1, dst2, dst3);
+            src_minus11 = src13;
+
+            __lsx_vst(dst0, dst, 0);
+            __lsx_vstx(dst1, dst, dst_stride);
+            __lsx_vstx(dst2, dst, dst_stride_2x);
+            __lsx_vstx(dst3, dst, dst_stride_3x);
+            src += src_stride_4x;
+            dst += dst_stride_4x;
+        }
+    }
+}
+
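+/* 45-degree diagonal SAO edge filter for 4-pixel-wide columns: neighbours
+ * are the top-left and bottom-right pixels, extracted from the unaligned
+ * rows with the shuf1/shuf2 byte-shift patterns. */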
+static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint8_t *src_orig;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
+    __m128i src_minus11, src10, src11;
+    __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
+    __m128i offset_mask0, offset_mask1;
+    __m128i zeros = {0};
+
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+              src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += src_stride_2x;
+
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+                  shuf1, src_zero0, src_zero1);
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+                  src_plus0, src_plus1);
+
+        DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
+                  src_minus11, src_minus10, src_minus11);
+        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
+                  src_zero1, src_zero0, src_zero1);
+        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+             diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+                  diff_minus11, offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+                  offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+                  src_zero0, offset, dst0);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+        dst0 = __lsx_vsadd_b(dst0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src10, src11);
+
+        __lsx_vstelm_w(dst0, dst, 0, 0);
+        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+        dst += dst_stride_2x;
+    }
+
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+              src_zero0, src_zero1);
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+              src_plus0, src_plus1);
+
+    DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+              src_zero0, src_zero1);
+    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+              diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+              offset_mask1);
+    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+              src_zero0, offset, dst0);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+    dst0 = __lsx_vsadd_b(dst0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
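+/* 45-degree diagonal edge filter for 8-pixel-wide columns. */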
+static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     uint8_t *src,
+                                                     int32_t src_stride,
+                                                     int16_t *sao_offset_val,
+                                                     int32_t height)
+{
+    uint8_t *src_orig;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    __m128i src_minus10, src10, src_minus11, src11;
+    __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
+    __m128i offset_mask0, offset_mask1;
+    __m128i zeros = {0};
+
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
+              src_minus11);
+    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+              src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += src_stride_2x;
+
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+                  shuf1, src_zero0, src_zero1);
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+                  src_plus10, src_plus11);
+
+        DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
+                  src_minus11, src_minus10, src_minus11);
+        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+                  src_zero0, src_zero1);
+        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+               diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+                  diff_minus11, offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+                  offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+                  src_zero0, offset, dst0);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+        dst0 = __lsx_vsadd_b(dst0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src10, src11);
+
+        __lsx_vstelm_d(dst0, dst, 0, 0);
+        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+        dst += dst_stride_2x;
+    }
+
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+              src_zero0, src_zero1);
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+              src_plus10, src_plus11);
+    DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+              src_zero0, src_zero1);
+
+    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+              cmp_minus11, diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+              diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+              offset_mask1);
+    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+              src_zero0, offset, dst0);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+    dst0 = __lsx_vsadd_b(dst0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
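+/* 45-degree diagonal edge filter for widths that are multiples of 16. */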
+static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
+                                                         int32_t dst_stride,
+                                                         uint8_t *src,
+                                                         int32_t src_stride,
+                                                         int16_t *sao_offset_val,
+                                                         int32_t width,
+                                                         int32_t height)
+{
+    uint8_t *src_orig = src;
+    uint8_t *dst_orig = dst;
+    int32_t v_cnt;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+    __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+    __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+    __m128i diff_plus13, src_minus14, src_plus13;
+    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+    __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
+    __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
+    __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
+    __m128i src_zero3, sao_offset, src_plus12;
+
+    sao_offset = __lsx_vld(sao_offset_val, 0);
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+    for (; height; height -= 4) {
+        src_orig = src - 1;
+        dst_orig = dst;
+        src_minus11 = __lsx_vld(src_orig, 0);
+        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src_minus12, src_minus13);
+        src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
+
+        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+            src_minus10 = __lsx_vld(src_orig - src_stride, 0);
+            src_orig += 16;
+            src10 = __lsx_vld(src_orig, 0);
+            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
+                      src_stride_2x, src11, src12);
+            src13 = __lsx_vldx(src_orig, src_stride_3x);
+            src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);
+
+            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
+                      src_minus12, shuf1, src12, src_minus13, shuf1,
+                      src13, src_minus14, shuf1, src_zero0, src_zero1,
+                      src_zero2, src_zero3);
+            DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
+                      src_minus13, shuf2, src_plus10, src_plus11);
+            src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);
+
+            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+                      src_plus10, src_zero1, src_minus11, src_zero1,
+                      src_plus11, cmp_minus10, cmp_plus10,
+                      cmp_minus11, cmp_plus11);
+            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+                      src_plus12, src_zero3, src_minus13, src_zero3,
+                      src_plus13, cmp_minus12, cmp_plus12,
+                      cmp_minus13, cmp_plus13);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+                      diff_plus11);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+                      diff_plus13);
+            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+                      src_plus10, src_zero1, src_minus11, src_zero1,
+                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+                      cmp_plus11);
+            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+                      src_plus12, src_zero3, src_minus13, src_zero3,
+                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+                      cmp_plus13);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+                      cmp_plus11);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+                      cmp_plus13);
+            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+                      cmp_minus11, diff_plus11, const1, cmp_plus11,
+                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+                      cmp_minus13, diff_plus13, const1, cmp_plus13,
+                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+                      offset_mask3);
+            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
+                      offset_mask1, offset_mask2, offset_mask3);
+
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+                      sao_offset, sao_offset, offset_mask0, offset_mask0,
+                      offset_mask0);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+                      sao_offset, sao_offset, offset_mask1, offset_mask1,
+                      offset_mask1);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+                      sao_offset, sao_offset, offset_mask2, offset_mask2,
+                      offset_mask2);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+                      sao_offset, sao_offset, offset_mask3, offset_mask3,
+                      offset_mask3);
+
+            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
+                      128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
+                      src_zero3);
+            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+                      offset_mask1, src_zero2, offset_mask2, src_zero3,
+                      offset_mask3, dst0, dst1, dst2, dst3);
+            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+                      128, dst0, dst1, dst2, dst3);
+
+            src_minus11 = src10;
+            src_minus12 = src11;
+            src_minus13 = src12;
+            src_minus14 = src13;
+
+            __lsx_vst(dst0, dst_orig, 0);
+            __lsx_vstx(dst1, dst_orig, dst_stride);
+            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
+            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
+            dst_orig += 16;
+        }
+        src += src_stride_4x;
+        dst += dst_stride_4x;
+    }
+}
+
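+/* 135-degree diagonal SAO edge filter for 4-pixel-wide columns: neighbours
+ * are the top-right and bottom-left pixels. */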
+static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{
+    uint8_t *src_orig;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i src_zero0, src_zero1, dst0;
+    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    __m128i src_minus10, src10, src_minus11, src11;
+    __m128i offset_mask0, offset_mask1;
+    __m128i zeros = {0};
+
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+              src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += src_stride_2x;
+
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+                  shuf1, src_zero0, src_zero1);
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+                  shuf2, src_minus10, src_minus11);
+
+        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+                  src_minus10, src_minus11);
+        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+                  src_zero0, src_zero1);
+        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+               diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+                  diff_minus11, offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+                  offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+                  src_zero0, offset, dst0);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+        dst0 = __lsx_vsadd_b(dst0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src10, src11);
+
+        __lsx_vstelm_w(dst0, dst, 0, 0);
+        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+        dst += dst_stride_2x;
+    }
+
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+              src_zero0, src_zero1);
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+              shuf2, src_minus10, src_minus11);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+              src_zero0, src_zero1);
+    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+              cmp_minus11, diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+              diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+              offset_mask1);
+    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+              src_zero0, offset, dst0);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+    dst0 = __lsx_vsadd_b(dst0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_w(dst0, dst, 0, 0);
+    __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
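+/* 135-degree diagonal edge filter for 8-pixel-wide columns. */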
+static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
+                                                      int32_t dst_stride,
+                                                      uint8_t *src,
+                                                      int32_t src_stride,
+                                                      int16_t *sao_offset_val,
+                                                      int32_t height)
+{
+    uint8_t *src_orig;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+    __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+    __m128i src_minus10, src10, src_minus11, src11;
+    __m128i src_zero0, src_zero1, dst0;
+    __m128i offset_mask0, offset_mask1;
+    __m128i zeros = {0};
+
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+    src_orig = src - 1;
+
+    /* load in advance */
+    DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+              src10, src11);
+
+    for (height -= 2; height; height -= 2) {
+        src_orig += src_stride_2x;
+
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+                  shuf1, src_zero0, src_zero1);
+        DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+                  shuf2, src_minus10, src_minus11);
+
+        DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+                  src_minus10, src_minus11);
+        DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+                  src_zero0, src_zero1);
+        DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+                  cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, diff_minus10, diff_minus11);
+        DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+                  src_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+                  cmp_minus11, cmp_minus10, cmp_minus11);
+        DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+              diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+        DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+                  diff_minus11, offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+                  offset_mask0, offset_mask1);
+        DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+                  src_zero0, offset, dst0);
+        DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+                  sao_offset, offset, offset, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+        dst0 = __lsx_vsadd_b(dst0, offset);
+        dst0 = __lsx_vxori_b(dst0, 128);
+
+        src_minus10 = src10;
+        src_minus11 = src11;
+
+        /* load in advance */
+        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src10, src11);
+
+        __lsx_vstelm_d(dst0, dst, 0, 0);
+        __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+        dst += dst_stride_2x;
+    }
+
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+              src_zero0, src_zero1);
+    DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+              shuf2, src_minus10, src_minus11);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+              src_minus10, src_minus11);
+    DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+              src_zero0, src_zero1);
+    DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              diff_minus10, diff_minus11);
+    DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+              cmp_minus10, cmp_minus11);
+    DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+              const1, cmp_minus11, diff_minus10, diff_minus11);
+
+    DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+              diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+              offset_mask1);
+    DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+              src_zero0, offset, dst0);
+    DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+              sao_offset, offset, offset, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+    dst0 = __lsx_vsadd_b(dst0, offset);
+    dst0 = __lsx_vxori_b(dst0, 128);
+
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
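+/* 135-degree diagonal edge filter for widths that are multiples of 16. */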
+static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
+                                                          int32_t dst_stride,
+                                                          uint8_t *src,
+                                                          int32_t src_stride,
+                                                          int16_t *sao_offset_val,
+                                                          int32_t width,
+                                                          int32_t height)
+{
+    uint8_t *src_orig, *dst_orig;
+    int32_t v_cnt;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+    __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+    __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+    __m128i edge_idx = {0x403000201, 0x0};
+    __m128i const1 = __lsx_vldi(1);
+    __m128i dst0, dst1, dst2, dst3;
+    __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
+    __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
+    __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
+    __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
+    __m128i src_plus10, src_plus11, src_plus12, src_plus13;
+    __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
+    __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+    sao_offset = __lsx_vld(sao_offset_val, 0);
+    sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+    for (; height; height -= 4) {
+        src_orig = src - 1;
+        dst_orig = dst;
+
+        src_minus11 = __lsx_vld(src_orig, 0);
+        DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src_plus10, src_plus11);
+        src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
+
+        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+            src_minus10 = __lsx_vld(src_orig - src_stride, 2);
+            src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
+            src_orig += 16;
+            src10 = __lsx_vld(src_orig, 0);
+            DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                      src11, src12);
+            src13 = __lsx_vldx(src_orig, src_stride_3x);
+
+            DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
+                      src_plus10, shuf1, src12, src_plus11, shuf1, src13,
+                      src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
+                      src_zero3);
+            src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
+            DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
+                      src_plus11, shuf2, src_minus12, src_minus13);
+
+            DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+                      src_plus10, src_zero1, src_minus11, src_zero1,
+                      src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+                      cmp_plus11);
+            DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+                      src_plus12, src_zero3, src_minus13, src_zero3,
+                      src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+                      cmp_plus13);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+                      cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+                      diff_plus11);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+                      cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+                      diff_plus13);
+            DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+                      src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+                      cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+            DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+                      src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+                      cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+                      cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+                      cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+                      cmp_plus11);
+            DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+                      cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+                      cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+                      cmp_plus13);
+            DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+                      diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+                      cmp_minus11, diff_plus11, const1, cmp_plus11,
+                      diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+            DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+                      diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+                      cmp_minus13, diff_plus13, const1, cmp_plus13,
+                      diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+            DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+                      diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+                      diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+                      offset_mask3);
+            DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+                      offset_mask2, 2, offset_mask3, 2, offset_mask0,
+                      offset_mask1, offset_mask2, offset_mask3);
+
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+                      sao_offset, sao_offset, offset_mask0, offset_mask0,
+                      offset_mask0);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+                      sao_offset, sao_offset, offset_mask1, offset_mask1,
+                      offset_mask1);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+                      sao_offset, sao_offset, offset_mask2, offset_mask2,
+                      offset_mask2);
+            DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+                      sao_offset, sao_offset, offset_mask3, offset_mask3,
+                      offset_mask3);
+
+            DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
+                      src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
+                      src_zero2, src_zero3);
+            DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+                      offset_mask1, src_zero2, offset_mask2, src_zero3,
+                      offset_mask3, dst0, dst1, dst2, dst3);
+            DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+                      128, dst0, dst1, dst2, dst3);
+
+            src_minus11 = src10;
+            src_plus10 = src11;
+            src_plus11 = src12;
+            src_plus12 = src13;
+
+            __lsx_vst(dst0, dst_orig, 0);
+            __lsx_vstx(dst1, dst_orig, dst_stride);
+            __lsx_vstx(dst2, dst_orig, dst_stride_2x);
+            __lsx_vstx(dst3, dst_orig, dst_stride_3x);
+            dst_orig += 16;
+        }
+
+        src += src_stride_4x;
+        dst += dst_stride_4x;
+    }
+}
+
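+/* Entry point for 8-bit SAO edge filtering.  sao_eo_class (eo) selects the
+ * direction: 0 horizontal, 1 vertical, 2 45-degree, 3 135-degree.  Each
+ * case splits the width into 16-column strips, then an 8-wide and a 4-wide
+ * tail.  stride_src matches the stride the generic C code uses for the
+ * temporary SAO buffer. */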
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height)
+{
+    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
+
+    switch (eo) {
+    case 0:
+        if (width >> 4) {
+            hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
+                                                        src, stride_src,
+                                                        sao_offset_val,
+                                                        width - (width & 0x0F),
+                                                        height);
+            dst += width & 0xFFFFFFF0;
+            src += width & 0xFFFFFFF0;
+            width &= 0x0F;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width &= 0x07;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
+                                                    src, stride_src,
+                                                    sao_offset_val, height);
+        }
+        break;
+
+    case 1:
+        if (width >> 4) {
+            hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width & 0x0F),
+                                                         height);
+            dst += width & 0xFFFFFFF0;
+            src += width & 0xFFFFFFF0;
+            width &= 0x0F;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width &= 0x07;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 2:
+        if (width >> 4) {
+            hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
+                                                         src, stride_src,
+                                                         sao_offset_val,
+                                                         width - (width & 0x0F),
+                                                         height);
+            dst += width & 0xFFFFFFF0;
+            src += width & 0xFFFFFFF0;
+            width &= 0x0F;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width &= 0x07;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
+                                                     src, stride_src,
+                                                     sao_offset_val, height);
+        }
+        break;
+
+    case 3:
+        if (width >> 4) {
+            hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
+                                                          src, stride_src,
+                                                          sao_offset_val,
+                                                          width - (width & 0x0F),
+                                                          height);
+            dst += width & 0xFFFFFFF0;
+            src += width & 0xFFFFFFF0;
+            width &= 0x0F;
+        }
+
+        if (width >> 3) {
+            hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+            dst += 8;
+            src += 8;
+            width &= 0x07;
+        }
+
+        if (width) {
+            hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
+                                                      src, stride_src,
+                                                      sao_offset_val, height);
+        }
+        break;
+    }
+}
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index fc0e8fb0df..f39674be64 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -85,6 +85,25 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
             c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
             c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+
+            c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
+
+            c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
+            c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_lsx;
+
+            c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_lsx;
+            c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_lsx;
+
+            c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_lsx;
+            c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+            c->hevc_h_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_h_8_lsx;
+            c->hevc_v_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+            c->idct[0] = ff_hevc_idct_4x4_lsx;
+            c->idct[1] = ff_hevc_idct_8x8_lsx;
+            c->idct[2] = ff_hevc_idct_16x16_lsx;
+            c->idct[3] = ff_hevc_idct_32x32_lsx;
         }
     }
 }
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 3259e03f13..0e73fd1f8e 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -85,4 +85,30 @@ MC(epel, hv, 32);
 
 #undef MC
 
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                      int32_t beta, int32_t *tc,
+                                      uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        int32_t *tc, uint8_t *p_is_pcm,
+                                        uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+                                        int32_t *tc, uint8_t *p_is_pcm,
+                                        uint8_t *q_is_pcm);
+
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+                                   ptrdiff_t stride_dst,
+                                   int16_t *sao_offset_val,
+                                   int eo, int width, int height);
+
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
+
 #endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
  2022-02-14  8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
                   ` (2 preceding siblings ...)
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf " Hao Chen
@ 2022-02-14  8:25 ` Hao Chen
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w " Hao Chen
  2022-02-15  3:30 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
  5 siblings, 0 replies; 7+ messages in thread
From: Hao Chen @ 2022-02-14  8:25 UTC (permalink / raw)
  To: ffmpeg-devel

ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 124fps
after : 182fps
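
For orientation, the scalar operation every bi-prediction copy kernel in this
patch vectorizes is sketched below in plain C. This is a minimal sketch only:
clip_uint8() and bi_copy_row() are illustrative names, not part of the patch,
and the LSX code performs the addition with saturating 16-bit arithmetic
rather than widening to int.

    #include <stdint.h>

    /* Clamp a signed value into the unsigned 8-bit range. */
    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* Combine one row of 8-bit pels (src0) with one row of 16-bit
     * intermediates (src1): dst = clip(((src0 << 6) + src1 + 64) >> 7). */
    static void bi_copy_row(const uint8_t *src0, const int16_t *src1,
                            uint8_t *dst, int width)
    {
        for (int x = 0; x < width; x++)
            dst[x] = clip_uint8(((src0[x] << 6) + src1[x] + 64) >> 7);
    }
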
---
 libavcodec/loongarch/Makefile                 |    3 +-
 libavcodec/loongarch/hevc_mc_bi_lsx.c         | 2289 +++++++++++++++++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   51 +
 libavcodec/loongarch/hevcdsp_lsx.h            |   57 +
 4 files changed, 2399 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/loongarch/hevc_mc_bi_lsx.c

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index cfc8e3aaff..620fba7192 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -27,4 +27,5 @@ LSX-OBJS-$(CONFIG_VP9_DECODER)        += loongarch/vp9_mc_lsx.o \
                                          loongarch/vp9_idct_lsx.o
 LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_idct_lsx.o \
-                                         loongarch/hevc_lpf_sao_lsx.o
+                                         loongarch/hevc_lpf_sao_lsx.o \
+                                         loongarch/hevc_mc_bi_lsx.o
diff --git a/libavcodec/loongarch/hevc_mc_bi_lsx.c b/libavcodec/loongarch/hevc_mc_bi_lsx.c
new file mode 100644
index 0000000000..9092fdccb2
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_bi_lsx.c
@@ -0,0 +1,2289 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
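+/* Saturating 16-bit add of each intermediate/pel pair, then a rounding
+ * arithmetic shift right by 7 narrowed to unsigned bytes with saturation:
+ * the combine-and-clip step shared by all kernels below. */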
+static av_always_inline __m128i
+hevc_bi_rnd_clip(__m128i in0, __m128i vec0, __m128i in1, __m128i vec1)
+{
+    __m128i out;
+
+    vec0 = __lsx_vsadd_h(in0, vec0);
+    vec1 = __lsx_vsadd_h(in1, vec1);
+    out  = __lsx_vssrarni_bu_h(vec1, vec0, 7);
+    return out;
+}
+
+/* hevc_bi_copy: dst = av_clip_uint8(((src0 << 6) + src1 + 64) >> 7) */
+static
+void hevc_bi_copy_4w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                         int16_t *src1_ptr, int32_t src2_stride,
+                         uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    int32_t loop_cnt = height >> 3;
+    int32_t res = (height & 0x07) >> 1;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src2_stride_2x = (src2_stride << 1);
+    int32_t src2_stride_4x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride;
+    __m128i src0, src1;
+    __m128i zero = __lsx_vldi(0);
+    __m128i in0, in1, in2, in3;
+    __m128i tmp0, tmp1, tmp2, tmp3;
+    __m128i reg0, reg1, reg2, reg3;
+    __m128i dst0, dst1, dst2, dst3;
+
+    for (; loop_cnt--;) {
+        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+        reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
+        reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
+        src0_ptr += src_stride_4x;
+        DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+        src0 = __lsx_vilvl_d(tmp1, tmp0);
+        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+        reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
+        reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
+        DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+        src1 = __lsx_vilvl_d(tmp1, tmp0);
+        src0_ptr += src_stride_4x;
+
+        tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
+        tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+        tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+        tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+        src1_ptr += src2_stride_4x;
+        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in0, in1);
+        tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
+        tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+        tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+        tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+        src1_ptr += src2_stride_4x;
+        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in2, in3);
+        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst0, dst2);
+        DUP2_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, dst1, dst3);
+        DUP2_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst1, dst3);
+        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+        __lsx_vstelm_w(dst0, dst, 0, 0);
+        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_w(dst0, dst + dst_stride_2x, 0, 2);
+        __lsx_vstelm_w(dst0, dst + dst_stride_3x, 0, 3);
+        dst += dst_stride_4x;
+        __lsx_vstelm_w(dst1, dst, 0, 0);
+        __lsx_vstelm_w(dst1, dst + dst_stride, 0, 1);
+        __lsx_vstelm_w(dst1, dst + dst_stride_2x, 0, 2);
+        __lsx_vstelm_w(dst1, dst + dst_stride_3x, 0, 3);
+        dst += dst_stride_4x;
+    }
+    for (; res--;) {
+        reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+        reg2 = __lsx_vldrepl_d(src1_ptr, 0);
+        reg3 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+        src0 = __lsx_vilvl_w(reg1, reg0);
+        in0  = __lsx_vilvl_d(reg3, reg2);
+        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+        dst0 = __lsx_vsadd_h(dst0, in0);
+        dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7);
+        __lsx_vstelm_w(dst0, dst, 0, 0);
+        __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
+        src0_ptr += src_stride_2x;
+        src1_ptr += src2_stride_2x;
+        dst += dst_stride_2x;
+    }
+}
+
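+/* 6-wide rows: widened like 8-wide rows, then stored as one 32-bit word
+ * plus one 16-bit halfword per row. */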
+static
+void hevc_bi_copy_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                         int16_t *src1_ptr, int32_t src2_stride,
+                         uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t res = (height & 0x07) >> 1;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src2_stride_x = (src2_stride << 1);
+    int32_t src2_stride_2x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+    __m128i out0, out1, out2, out3;
+    __m128i zero = __lsx_vldi(0);
+    __m128i src0, src1, src2, src3;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    __m128i reg0, reg1, reg2, reg3;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
+        src0_ptr += src_stride_4x;
+        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
+        src0_ptr += src_stride_4x;
+        in0 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+                  src2_stride_2x, in1, in2);
+        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += src2_stride_2x;
+        in4 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+                  src2_stride_2x, in5, in6);
+        in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += src2_stride_2x;
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  dst0, dst2, dst4, dst6);
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+                  dst1, dst3, dst5, dst7);
+        DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, dst3,
+                  dst5, dst7);
+        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+        out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+        __lsx_vstelm_w(out0, dst, 0, 0);
+        __lsx_vstelm_w(out0, dst + dst_stride, 0, 2);
+        __lsx_vstelm_h(out0, dst, 4, 2);
+        __lsx_vstelm_h(out0, dst + dst_stride, 4, 6);
+        __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 2);
+        __lsx_vstelm_h(out1, dst + dst_stride_2x, 4, 2);
+        __lsx_vstelm_h(out1, dst + dst_stride_3x, 4, 6);
+        dst += dst_stride_4x;
+        __lsx_vstelm_w(out2, dst, 0, 0);
+        __lsx_vstelm_w(out2, dst + dst_stride, 0, 2);
+        __lsx_vstelm_h(out2, dst, 4, 2);
+        __lsx_vstelm_h(out2, dst + dst_stride, 4, 6);
+        __lsx_vstelm_w(out3, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_w(out3, dst + dst_stride_3x, 0, 2);
+        __lsx_vstelm_h(out3, dst + dst_stride_2x, 4, 2);
+        __lsx_vstelm_h(out3, dst + dst_stride_3x, 4, 6);
+        dst += dst_stride_4x;
+    }
+    for (; res--;) {
+        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+        src0 = __lsx_vilvl_d(reg1, reg0);
+        src0_ptr += src_stride_2x;
+        in0 = __lsx_vld(src1_ptr, 0);
+        in1 = __lsx_vldx(src1_ptr, src2_stride_x);
+        src1_ptr += src2_stride_x;
+        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+        dst1 = __lsx_vilvh_b(zero, src0);
+        dst1 = __lsx_vslli_h(dst1, 6);
+        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        __lsx_vstelm_w(out0, dst, 0, 0);
+        __lsx_vstelm_h(out0, dst, 4, 2);
+        dst += dst_stride;
+        __lsx_vstelm_w(out0, dst, 0, 2);
+        __lsx_vstelm_h(out0, dst, 4, 6);
+        dst += dst_stride;
+    }
+}
+
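+/* 8-wide rows: widen, combine with the intermediates, clip, and store one
+ * 64-bit doubleword per row. */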
+static
+void hevc_bi_copy_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                         int16_t *src1_ptr, int32_t src2_stride,
+                         uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    int32_t loop_cnt = height >> 3;
+    int32_t res = (height & 7) >> 1;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src2_stride_x = (src2_stride << 1);
+    int32_t src2_stride_2x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+    __m128i out0, out1, out2, out3;
+    __m128i src0, src1, src2, src3;
+    __m128i zero = __lsx_vldi(0);
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    __m128i reg0, reg1, reg2, reg3;
+
+    for (; loop_cnt--;) {
+        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
+        src0_ptr += src_stride_4x;
+        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+        reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+        reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+        DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
+        src0_ptr += src_stride_4x;
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  dst0, dst2, dst4, dst6);
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+                  src3, dst1, dst3, dst5, dst7);
+        DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1,
+                  dst3, dst5, dst7);
+        in0 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+                  src2_stride_2x, in1, in2);
+        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += src2_stride_2x;
+        in4 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+                  src2_stride_2x, in5, in6);
+        in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += src2_stride_2x;
+        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+        out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+        __lsx_vstelm_d(out0, dst, 0, 0);
+        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+        dst += dst_stride_4x;
+        __lsx_vstelm_d(out2, dst, 0, 0);
+        __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(out3, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(out3, dst + dst_stride_3x, 0, 1);
+        dst += dst_stride_4x;
+    }
+    for (; res--;) {
+        reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+        reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+        src0 = __lsx_vilvl_d(reg1, reg0);
+        in0  = __lsx_vld(src1_ptr, 0);
+        in1  = __lsx_vldx(src1_ptr, src2_stride_x);
+        dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+        dst1 = __lsx_vilvh_b(zero, src0);
+        dst1 = __lsx_vslli_h(dst1, 6);
+        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        __lsx_vstelm_d(out0, dst, 0, 0);
+        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+        src0_ptr += src_stride_2x;
+        src1_ptr += src2_stride_x;
+        dst += dst_stride_2x;
+    }
+}
+
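+/* 12-wide rows: one 64-bit store plus one 32-bit store at byte offset 8
+ * per row; the loop count is fixed at 4 on the assumption that 12-wide
+ * blocks are 16 rows tall. */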
+static
+void hevc_bi_copy_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                          int16_t *src1_ptr, int32_t src2_stride,
+                          uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src2_stride_x = (src2_stride << 1);
+    int32_t src2_stride_2x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+    int16_t *_src1 = src1_ptr + 8;
+    __m128i out0, out1, out2;
+    __m128i src0, src1, src2, src3;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        src0 = __lsx_vld(src0_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+        src0_ptr += src_stride_4x;
+        in0 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+                  src2_stride_2x, in1, in2);
+        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += src2_stride_2x;
+        in4 = __lsx_vld(_src1, 0);
+        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+                  in5, in6);
+        in7 = __lsx_vldx(_src1, src2_stride_3x);
+        _src1 += src2_stride_2x;
+
+        DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  dst0, dst1, dst2, dst3);
+        DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
+        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst4, dst5);
+        out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+        out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+        __lsx_vstelm_d(out0, dst, 0, 0);
+        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+        __lsx_vstelm_w(out2, dst, 8, 0);
+        __lsx_vstelm_w(out2, dst + dst_stride, 8, 1);
+        __lsx_vstelm_w(out2, dst + dst_stride_2x, 8, 2);
+        __lsx_vstelm_w(out2, dst + dst_stride_3x, 8, 3);
+        dst += dst_stride_4x;
+    }
+}
+
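+/* 16-wide rows: a full vector per row, widening the low and high byte
+ * halves separately before the combine-and-clip. */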
+static
+void hevc_bi_copy_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                          int16_t *src1_ptr, int32_t src2_stride,
+                          uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src2_stride_x = (src2_stride << 1);
+    int32_t src2_stride_2x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+    int16_t *_src1 = src1_ptr + 8;
+    __m128i out0, out1, out2, out3;
+    __m128i src0, src1, src2, src3;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+    __m128i zero = __lsx_vldi(0);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src0 = __lsx_vld(src0_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+        src0_ptr += src_stride_4x;
+        in0 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+                  src2_stride_2x, in1, in2);
+        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += src2_stride_2x;
+        in4 = __lsx_vld(_src1, 0);
+        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+                  in5, in6);
+        in7 = __lsx_vldx(_src1, src2_stride_3x);
+        _src1 += src2_stride_2x;
+        DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+                  dst0_l, dst1_l, dst2_l, dst3_l);
+        DUP4_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
+                  dst0_l, dst1_l, dst2_l, dst3_l);
+
+        out0 = hevc_bi_rnd_clip(in0, dst0_r, in4, dst0_l);
+        out1 = hevc_bi_rnd_clip(in1, dst1_r, in5, dst1_l);
+        out2 = hevc_bi_rnd_clip(in2, dst2_r, in6, dst2_l);
+        out3 = hevc_bi_rnd_clip(in3, dst3_r, in7, dst3_l);
+        __lsx_vst(out0, dst, 0);
+        __lsx_vstx(out1, dst, dst_stride);
+        __lsx_vstx(out2, dst, dst_stride_2x);
+        __lsx_vstx(out3, dst, dst_stride_3x);
+        dst += dst_stride_4x;
+    }
+}
+
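+/* Widths above 16 are composed from the 8-, 16- and 32-wide kernels. */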
+static
+void hevc_bi_copy_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                          int16_t *src1_ptr, int32_t src2_stride,
+                          uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                         dst, dst_stride, height);
+    hevc_bi_copy_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                         dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                          int16_t *src1_ptr, int32_t src2_stride,
+                          uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                         dst, dst_stride, height);
+    hevc_bi_copy_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                         dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                          int16_t *src1_ptr, int32_t src2_stride,
+                          uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                         dst, dst_stride, height);
+    hevc_bi_copy_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                         dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                          int16_t *src1_ptr, int32_t src2_stride,
+                          uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+    hevc_bi_copy_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                         dst, dst_stride, height);
+    hevc_bi_copy_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
+                         dst + 32, dst_stride, height);
+}
+
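+/* Horizontal 8-tap (luma) filter, 16 pixels per row: byte shuffles against
+ * ff_hevc_mask_arr gather the taps and vdp2/vdp2add accumulate the dot
+ * products before the combine-and-clip. */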
+static void hevc_hz_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr,  int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    __m128i src0, src1, src2, src3;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3;
+    __m128i vec0, vec1, vec2, vec3;
+    __m128i dst0, dst1, dst2, dst3;
+    __m128i in0, in1, in2, in3;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src0_ptr -= 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src0, src1);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src2, src3);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
+        src1_ptr += src2_stride;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+                  vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+                  dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+                  dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+
+        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vstx(dst1, dst, dst_stride);
+        dst += dst_stride_2x;
+    }
+}
+
+static void hevc_hz_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i src0, src1, tmp0, tmp1;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+    __m128i vec0, vec1, vec2, vec3;
+    __m128i dst0, dst1, dst2;
+    __m128i in0, in1, in2;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src0_ptr -= 3;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+              mask2, mask3, mask4);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+    mask7 = __lsx_vaddi_bu(mask0, 14);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+        in2 = __lsx_vld(src1_ptr, 32);
+        src1_ptr += src2_stride;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
+                  src1, mask0, src0, src0, mask1, vec0, vec1, vec2, vec3);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1);
+        dst2 = __lsx_vdp2_h_bu_b(vec2, filt0);
+        dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt1);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask5, src1, src1, mask1, src0,
+                  src0, mask2, src1, src0, mask6, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec0, filt1, dst2, vec1, filt1,
+                  dst0, vec2, filt2, dst1, vec3, filt2, dst1, dst2, dst0, dst1);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask2, src0, src0, mask3, src1, src0,
+                  mask7, src1, src1, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec0, filt2, dst0, vec1, filt3,
+                  dst1, vec2, filt3, dst2, vec3, filt3, dst2, dst0, dst1, dst2);
+
+        tmp0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        dst2 = __lsx_vsadd_h(dst2, in2);
+        tmp1 = __lsx_vssrarni_bu_h(dst2, dst2, 7);
+
+        __lsx_vst(tmp0, dst, 0);
+        __lsx_vstelm_d(tmp1, dst, 16, 0);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_hz_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                       dst, dst_stride, filter, height);
+    hevc_hz_8t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                       dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hz_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                       dst, dst_stride, filter, height);
+    hevc_hz_8t_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                       dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hz_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_hz_8t_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                       dst, dst_stride, filter, height);
+    hevc_hz_8t_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
+                       dst + 32, dst_stride, filter, height);
+}
+
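+/* Vertical 8-tap (luma) filter on 8-wide columns: a sliding window of
+ * interleaved row pairs is kept across iterations so only four new rows
+ * are loaded per pass. */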
+static av_always_inline
+void hevc_vt_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+                       int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+                       const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src2_stride_x = (src2_stride << 1);
+    int32_t src2_stride_2x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+    __m128i src0, src1, src2, src3, src4, src5;
+    __m128i src6, src7, src8, src9, src10;
+    __m128i in0, in1, in2, in3;
+    __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i filt0, filt1, filt2, filt3;
+
+    src0_ptr -= src_stride_3x;
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    src0 = __lsx_vld(src0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src1, src2);
+    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+    src0_ptr += src_stride_4x;
+    src4 = __lsx_vld(src0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src5, src6);
+    src0_ptr += src_stride_3x;
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              src10_r, src32_r, src54_r, src21_r);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src7 = __lsx_vld(src0_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+                  src8, src9);
+        src10 = __lsx_vldx(src0_ptr, src_stride_3x);
+        src0_ptr += src_stride_4x;
+        in0 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
+                  in1, in2);
+        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += src2_stride_2x;
+        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                  src76_r, src87_r, src98_r, src109_r);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+                  filt0, src43_r, filt0, dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
+                  filt1, dst2_r, src54_r, filt1, dst3_r, src65_r, filt1,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, src65_r,
+                  filt2, dst2_r, src76_r, filt2, dst3_r, src87_r, filt2,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, src87_r,
+                  filt3, dst2_r, src98_r, filt3, dst3_r, src109_r, filt3,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+
+        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
+        dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
+        __lsx_vstelm_d(dst0_r, dst, 0, 0);
+        __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
+        dst += dst_stride_4x;
+
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+
+        src6 = src10;
+    }
+}
+
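+/* Vertical 8-tap (luma) filter tiled over 16-wide columns, two rows per
+ * iteration, filtering the low and high byte halves separately. */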
+static av_always_inline
+void hevc_vt_8t_16multx2mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                                 int16_t *src1_ptr, int32_t src2_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height,
+                                 int32_t width)
+{
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i in0, in1, in2, in3;
+    __m128i src10_r, src32_r, src54_r, src76_r;
+    __m128i src21_r, src43_r, src65_r, src87_r;
+    __m128i dst0_r, dst1_r;
+    __m128i src10_l, src32_l, src54_l, src76_l;
+    __m128i src21_l, src43_l, src65_l, src87_l;
+    __m128i dst0_l, dst1_l;
+    __m128i filt0, filt1, filt2, filt3;
+
+    src0_ptr -= src_stride_3x;
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        src1_ptr_tmp = src1_ptr;
+        dst_tmp = dst;
+
+        src0 = __lsx_vld(src0_ptr_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+                  src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+        src0_ptr_tmp += src_stride_4x;
+        src4 = __lsx_vld(src0_ptr_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+                  src_stride_2x, src5, src6);
+        src0_ptr_tmp += src_stride_3x;
+
+        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+                  src10_r, src32_r, src54_r, src21_r);
+        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+                  src10_l, src32_l, src54_l, src21_l);
+        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 1); loop_cnt--;) {
+            src7 = __lsx_vld(src0_ptr_tmp, 0);
+            src8 = __lsx_vldx(src0_ptr_tmp, src_stride);
+            src0_ptr_tmp += src_stride_2x;
+            DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in0, in2);
+            src1_ptr_tmp += src2_stride;
+            DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in1, in3);
+            src1_ptr_tmp += src2_stride;
+
+            DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+            DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
+                      filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
+                      src43_r, filt1, dst0_l, src32_l, filt1, dst1_l, src43_l,
+                      filt1, dst0_r, dst1_r, dst0_l, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r,
+                      src65_r, filt2, dst0_l, src54_l, filt2, dst1_l, src65_l,
+                      filt2, dst0_r, dst1_r, dst0_l, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r,
+                      src87_r, filt3, dst0_l, src76_l, filt3, dst1_l, src87_l,
+                      filt3, dst0_r, dst1_r, dst0_l, dst1_l);
+            dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+            dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+
+            __lsx_vst(dst0_r, dst_tmp, 0);
+            __lsx_vstx(dst1_r, dst_tmp, dst_stride);
+            dst_tmp += dst_stride_2x;
+
+            src10_r = src32_r;
+            src32_r = src54_r;
+            src54_r = src76_r;
+            src21_r = src43_r;
+            src43_r = src65_r;
+            src65_r = src87_r;
+            src10_l = src32_l;
+            src32_l = src54_l;
+            src54_l = src76_l;
+            src21_l = src43_l;
+            src43_l = src65_l;
+            src65_l = src87_l;
+            src6 = src8;
+        }
+
+        src0_ptr += 16;
+        src1_ptr += 16;
+        dst += 16;
+    }
+}
+
+static void hevc_vt_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                dst, dst_stride, filter, height, 16);
+}
+
+static void hevc_vt_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                dst, dst_stride, filter, height, 16);
+    hevc_vt_8t_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                      dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_vt_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                dst, dst_stride, filter, height, 32);
+}
+
+static void hevc_vt_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                dst, dst_stride, filter, height, 48);
+}
+
+static void hevc_vt_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                dst, dst_stride, filter, height, 64);
+}
+
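+/* 2-D 8-tap (luma) filter tiled over 8-wide columns, one row per
+ * iteration: the horizontal pass yields 16-bit samples and the vertical
+ * pass accumulates them in 32 bits before the final round, add and clip. */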
+static av_always_inline
+void hevc_hv_8t_8multx1mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                                int16_t *src1_ptr, int32_t src2_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter_x, const int8_t *filter_y,
+                                int32_t height, int32_t width)
+{
+    uint32_t loop_cnt;
+    uint32_t cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i out;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0, tmp;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, mask2, mask3;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    __m128i dst0_r, dst0_l;
+    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+
+    src0_ptr -= src_stride_3x + 3;
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, filter_x,
+              6, filt0, filt1, filt2, filt3);
+    filt_h3 = __lsx_vld(filter_y, 0);
+    filt_h3 = __lsx_vsllwil_h_b(filt_h3, 0);
+
+    DUP4_ARG2(__lsx_vreplvei_w, filt_h3, 0, filt_h3, 1, filt_h3, 2, filt_h3, 3,
+              filt_h0, filt_h1, filt_h2, filt_h3);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        src0 = __lsx_vld(src0_ptr_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+                  src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+        src0_ptr_tmp += src_stride_4x;
+        src4 = __lsx_vld(src0_ptr_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+                  src_stride_2x, src5, src6);
+        src0_ptr_tmp += src_stride_3x;
+
+        /* row 0 row 1 row 2 row 3 */
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+                  vec12, filt0, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+
+        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+        for (loop_cnt = height; loop_cnt--;) {
+            src7 = __lsx_vld(src0_ptr_tmp, 0);
+            src0_ptr_tmp += src_stride;
+
+            in0 = __lsx_vld(src1_ptr_tmp, 0);
+            src1_ptr_tmp += src2_stride;
+
+            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+                      filt2, dst7, dst7);
+            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+            DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+                      dst6, dst10_r, dst32_r, dst54_r, dst76_r);
+            DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+                      dst6, dst10_l, dst32_l, dst54_l, dst76_l);
+
+            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+                      dst0_r, dst0_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+                      dst76_l, filt_h3, dst0_r, dst0_l);
+            dst0_r = __lsx_vsrli_w(dst0_r, 6);
+            dst0_l = __lsx_vsrli_w(dst0_l, 6);
+
+            tmp = __lsx_vpickev_h(dst0_l, dst0_r);
+            tmp = __lsx_vsadd_h(tmp, in0);
+            tmp = __lsx_vmaxi_h(tmp, 0);
+            out = __lsx_vssrlrni_bu_h(tmp, tmp, 7);
+            __lsx_vstelm_d(out, dst_tmp, 0, 0);
+            dst_tmp += dst_stride;
+
+            dst0 = dst1;
+            dst1 = dst2;
+            dst2 = dst3;
+            dst3 = dst4;
+            dst4 = dst5;
+            dst5 = dst6;
+            dst6 = dst7;
+        }
+
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                              int16_t *src1_ptr, int32_t src2_stride,
+                              uint8_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                               dst, dst_stride, filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                               dst, dst_stride, filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                               dst, dst_stride, filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                               dst, dst_stride, filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                               dst, dst_stride, filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                               dst, dst_stride, filter_x, filter_y, height, 64);
+}
+
+static void hevc_hz_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt;
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
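+    /* src1_ptr addresses int16_t samples, so the vldx byte offsets below
+     * carry a factor of two: _x, _2x and _3x reach rows 1, 2 and 3.  Added
+     * to the pointer itself, src2_stride_2x advances four rows (element
+     * arithmetic). */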
+    int32_t src2_stride_x = src2_stride << 1;
+    int32_t src2_stride_2x = src2_stride << 2;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i filt0, filt1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, mask2, mask3;
+    __m128i vec0, vec1, vec2, vec3;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    src0_ptr -= 1;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 10);
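+    /* mask0/mask1 gather the tap-pair source bytes for output pixels 0..7;
+     * mask2/mask3 (offset by 8) read across the two source registers for
+     * pixels 8..15. */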
+
+    dst_tmp = dst + 16;
+    src1_ptr_tmp = src1_ptr + 16;
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src3);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src5);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src6, src7);
+        src0_ptr += src_stride;
+
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in4, in5);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in6, in7);
+        src1_ptr += src2_stride;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src2,
+                  src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src2,
+                  src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+
+        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src4, mask2, src6,
+                  src6, mask0, src7, src6, mask2, vec0, vec1, vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst4, dst5, dst6, dst7);
+        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src4, mask3, src6,
+                  src6, mask1, src7, src6, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec0, filt1, dst5, vec1, filt1,
+                  dst6, vec2, filt1, dst7, vec3, filt1, dst4, dst5, dst6, dst7);
+
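+        /* hevc_bi_rnd_clip(): saturating-add the bi-prediction input,
+         * clamp negatives and round-pack the pair to unsigned bytes. */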
+        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+        dst2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+        dst3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vstx(dst1, dst, dst_stride);
+        __lsx_vstx(dst2, dst, dst_stride_2x);
+        __lsx_vstx(dst3, dst, dst_stride_3x);
+        dst += dst_stride_4x;
+
+        in0 = __lsx_vld(src1_ptr_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
+                  src2_stride_2x, in1, in2);
+        in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
+        src1_ptr_tmp += src2_stride_2x;
+
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src3, src3, mask0, src5,
+                  src5, mask0, src7, src7, mask0, vec0, vec1, vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask1, src3, src3, mask1, src5,
+                  src5, mask1, src7, src7, mask1, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+        __lsx_vstelm_d(dst0, dst_tmp, 0, 0);
+        __lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1);
+        __lsx_vstelm_d(dst1, dst_tmp + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(dst1, dst_tmp + dst_stride_3x, 0, 1);
+        dst_tmp += dst_stride_4x;
+    }
+}
+
+static void hevc_hz_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    __m128i src0, src1, src2;
+    __m128i in0, in1, in2, in3;
+    __m128i filt0, filt1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, mask2, mask3;
+    __m128i dst0, dst1, dst2, dst3;
+    __m128i vec0, vec1, vec2, vec3;
+
+    src0_ptr -= 1;
+
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 10);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+        src2 = __lsx_vld(src0_ptr, 24);
+        src0_ptr += src_stride;
+        DUP4_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, src1_ptr, 32,
+                  src1_ptr, 48, in0, in1, in2, in3);
+        src1_ptr += src2_stride;
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src1,
+                  src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src1,
+                  src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+                  dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+        dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+        dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+        __lsx_vst(dst0, dst, 0);
+        __lsx_vst(dst1, dst, 16);
+        dst += dst_stride;
+    }
+}
+
+static void hevc_vt_4t_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src2_stride_x = (src2_stride << 1);
+    int32_t src2_stride_2x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+    int16_t *_src1 = src1_ptr + 8;
+    __m128i src0, src1, src2, src3, src4, src5, src6;
+    __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+    __m128i src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+    __m128i src2110, src4332, src6554;
+    __m128i dst0_l, dst1_l, filt0, filt1;
+
+    src0_ptr -= src_stride;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    src0 = __lsx_vld(src0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src1, src2);
+    src0_ptr += src_stride_3x;
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
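+    /* The high interleave holds columns 8..15; packing its low doublewords
+     * keeps columns 8..11 of two rows in one vector, covering the 12-wide
+     * block's tail. */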
+    src2110 = __lsx_vilvl_d(src21_l, src10_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src3 = __lsx_vld(src0_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+                  src4, src5);
+        src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+        src0_ptr += src_stride_4x;
+        in0 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+                  src2_stride_2x, in1, in2);
+        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += src2_stride_2x;
+        in4 = __lsx_vld(_src1, 0);
+        DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+                  in5, in6);
+        in7 = __lsx_vldx(_src1, src2_stride_3x);
+        _src1 += src2_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
+
+        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+        src4332 = __lsx_vilvl_d(src43_l, src32_l);
+        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src54_r, src65_r);
+        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src54_l, src65_l);
+        src6554 = __lsx_vilvl_d(src65_l, src54_l);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src2110,
+                  filt0, src32_r, filt0, dst0_r, dst1_r, dst0_l, dst2_r);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, src43_r, filt0, src4332, filt0,
+                  dst3_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
+                  src43_r, filt1, dst0_l, src4332, filt1, dst2_r, src54_r,
+                  filt1, dst0_r, dst1_r, dst0_l, dst2_r);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst1_l,
+                  src6554, filt1, dst3_r, dst1_l);
+        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
+        dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
+        dst0_l = hevc_bi_rnd_clip(in4, dst0_l, in5, dst1_l);
+        __lsx_vstelm_d(dst0_r, dst, 0, 0);
+        __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
+        __lsx_vstelm_w(dst0_l, dst, 8, 0);
+        __lsx_vstelm_w(dst0_l, dst + dst_stride, 8, 1);
+        __lsx_vstelm_w(dst0_l, dst + dst_stride_2x, 8, 2);
+        __lsx_vstelm_w(dst0_l, dst + dst_stride_3x, 8, 3);
+        dst += dst_stride_4x;
+
+        src2 = src6;
+        src10_r = src54_r;
+        src21_r = src65_r;
+        src2110 = src6554;
+    }
+}
+
+static void hevc_vt_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i src0, src1, src2, src3, src4, src5;
+    __m128i in0, in1, in2, in3;
+    __m128i src10_r, src32_r, src21_r, src43_r;
+    __m128i src10_l, src32_l, src21_l, src43_l;
+    __m128i dst0_r, dst1_r, dst0_l, dst1_l;
+    __m128i filt0, filt1;
+
+    src0_ptr -= src_stride;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    src0 = __lsx_vld(src0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src1, src2);
+    src0_ptr += src_stride_3x;
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src3 = __lsx_vld(src0_ptr, 0);
+        src4 = __lsx_vldx(src0_ptr, src_stride);
+        src0_ptr += src_stride_2x;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
+                  filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
+                  filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, filt1,
+                  dst0_r, dst1_r, dst0_l, dst1_l);
+
+        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vstx(dst1_r, dst, dst_stride);
+        dst += dst_stride_2x;
+
+        src5 = __lsx_vld(src0_ptr, 0);
+        src2 = __lsx_vldx(src0_ptr, src_stride);
+        src0_ptr += src_stride_2x;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vstx(dst1_r, dst, dst_stride);
+        dst += dst_stride_2x;
+    }
+}
+
+static void hevc_vt_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t dst_stride_2x = dst_stride << 1;
+    __m128i src0, src1, src2, src3, src4, src5;
+    __m128i src6, src7, src8, src9, src10, src11;
+    __m128i in0, in1, in2, in3, in4, in5;
+    __m128i src10_r, src32_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src87_r, src109_r;
+    __m128i src10_l, src32_l, src21_l, src43_l;
+    __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+    __m128i dst0_l, dst1_l;
+    __m128i filt0, filt1;
+
+    src0_ptr -= src_stride;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+    /* 16width */
+    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src6);
+    src0_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src1, src7);
+    src0_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
+    src0_ptr += src_stride;
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+    /* 8width */
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        /* 16width */
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src3, src9);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src10);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+        in4 = __lsx_vld(src1_ptr, 32);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+        in5 = __lsx_vld(src1_ptr, 32);
+        src1_ptr += src2_stride;
+        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+        /* 8width */
+        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+        /* 16width */
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+                  filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+                  src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+        /* 8width */
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+                  dst2_r, dst3_r);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
+                  src109_r, filt1, dst2_r, dst3_r);
+        /* 16width */
+        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+        dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vstx(dst1_r, dst, dst_stride);
+        __lsx_vstelm_d(dst2_r, dst, 16, 0);
+        __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
+        dst += dst_stride_2x;
+
+        /* 16width */
+        DUP4_ARG2(__lsx_vld, src0_ptr, 0, src1_ptr, 0, src1_ptr, 16, src1_ptr,
+                  32, src5, in0, in2, in4);
+        src1_ptr += src2_stride;
+        DUP4_ARG2(__lsx_vld, src0_ptr, 16, src1_ptr, 0, src1_ptr, 16, src1_ptr,
+                  32, src11, in1, in3, in5);
+        src1_ptr += src2_stride;
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
+        src0_ptr += src_stride;
+        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+        /* 8width */
+        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+        /* 16width */
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+                  filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+                  src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+                  filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+
+        /* 8width */
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+                  dst2_r, dst3_r);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r,
+                  src87_r, filt1, dst2_r, dst3_r);
+
+        dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+        dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+        dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
+        __lsx_vst(dst0_r, dst, 0);
+        __lsx_vstx(dst1_r, dst, dst_stride);
+        __lsx_vstelm_d(dst2_r, dst, 16, 0);
+        __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
+        dst += dst_stride_2x;
+    }
+}
+
+static void hevc_vt_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter, int32_t height)
+{
+    hevc_vt_4t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                       dst, dst_stride, filter, height);
+    hevc_vt_4t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+                       dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hv_4t_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                              int16_t *src1_ptr, int32_t src2_stride,
+                              uint8_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
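+    /* Width-6 block: eight rows are filtered in one pass; the first four
+     * output columns are stored as words below, the last two as halfwords. */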
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src2_stride_2x = (src2_stride << 1);
+    int32_t src2_stride_4x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride;
+    __m128i out0, out1;
+    __m128i src0, src1, src2, src3, src4, src5, src6;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, mask1;
+    __m128i filt0, filt1, filt_h0, filt_h1;
+    __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5;
+    __m128i dsth6, dsth7, dsth8, dsth9, dsth10;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst4_r, dst5_r, dst6_r, dst7_r;
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+    __m128i reg0, reg1, reg2, reg3;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src0_ptr -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
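+    /* Sign-extend the four int8 vertical taps to int16 and broadcast them
+     * as two tap pairs for the word dot products. */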
+    filt_h1 = __lsx_vld(filter_y, 0);
+    filt_h1 = __lsx_vsllwil_h_b(filt_h1, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filt_h1, 0, filt_h1, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    src0 = __lsx_vld(src0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src1, src2);
+    src0_ptr += src_stride_3x;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
+    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
+              dsth0, dsth1);
+    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
+
+    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, tmp1, tmp3);
+
+    src3 = __lsx_vld(src0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src4, src5);
+    src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+    src0_ptr += src_stride_4x;
+    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+              filt0, dsth3, dsth4, dsth5, dsth6);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4, vec3, filt1, dsth5,
+              vec5, filt1, dsth6, vec7, filt1, dsth3, dsth4, dsth5, dsth6);
+
+    src3 = __lsx_vld(src0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src4, src5);
+    src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+
+    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+              filt0, dsth7, dsth8, dsth9, dsth10);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth7, vec1, filt1, dsth8, vec3, filt1, dsth9,
+              vec5, filt1, dsth10, vec7, filt1, dsth7, dsth8, dsth9, dsth10);
+
+    DUP2_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, tmp4, tmp6);
+    DUP2_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, tmp5, tmp7);
+    DUP2_ARG2(__lsx_vilvl_h, dsth5, dsth4, dsth6, dsth5, dsth0, dsth2);
+    DUP2_ARG2(__lsx_vilvh_h, dsth5, dsth4, dsth6, dsth5, dsth1, dsth3);
+    DUP4_ARG2(__lsx_vdp2_w_h, tmp0, filt_h0, tmp2, filt_h0, tmp4, filt_h0,
+              tmp6, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, tmp4, filt_h1, dst1_r, tmp6,
+              filt_h1, dst2_r, dsth0, filt_h1, dst3_r, dsth2, filt_h1,
+              dst0_r, dst1_r, dst2_r, dst3_r);
+    DUP2_ARG2(__lsx_vpickev_d, tmp3, tmp1, tmp7, tmp5, tmp0, tmp8);
+    dst0_l = __lsx_vdp2_w_h(tmp0, filt_h0);
+    dst0_l = __lsx_vdp2add_w_h(dst0_l, tmp8, filt_h1);
+
+    DUP2_ARG2(__lsx_vilvl_h, dsth7, dsth6, dsth8, dsth7, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vilvh_h, dsth7, dsth6, dsth8, dsth7, tmp1, tmp3);
+    DUP2_ARG2(__lsx_vilvl_h, dsth9, dsth8, dsth10, dsth9, tmp4, tmp6);
+    DUP2_ARG2(__lsx_vilvh_h, dsth9, dsth8, dsth10, dsth9, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vdp2_w_h, dsth0, filt_h0, dsth2, filt_h0, tmp0, filt_h0,
+              tmp2, filt_h0, dst4_r, dst5_r, dst6_r, dst7_r);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, tmp0, filt_h1, dst5_r, tmp2,
+              filt_h1, dst6_r, tmp4, filt_h1, dst7_r, tmp6, filt_h1,
+              dst4_r, dst5_r, dst6_r, dst7_r);
+    DUP2_ARG2(__lsx_vpickev_d, dsth3, dsth1, tmp3, tmp1, tmp0, tmp1);
+    tmp2 = __lsx_vpickev_d(tmp7, tmp5);
+
+    DUP2_ARG2(__lsx_vdp2_w_h, tmp8, filt_h0, tmp0, filt_h0, dst1_l, dst2_l);
+    dst3_l = __lsx_vdp2_w_h(tmp1, filt_h0);
+    DUP2_ARG3(__lsx_vdp2add_w_h, dst1_l, tmp0, filt_h1, dst2_l, tmp1, filt_h1,
+              dst1_l, dst2_l);
+    dst3_l = __lsx_vdp2add_w_h(dst3_l, tmp2, filt_h1);
+
+    DUP4_ARG2(__lsx_vsrai_d, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+              dst0_r, dst1_r, dst2_r, dst3_r);
+    DUP4_ARG2(__lsx_vsrai_d, dst4_r, 6, dst5_r, 6, dst6_r, 6, dst7_r, 6,
+              dst4_r, dst5_r, dst6_r, dst7_r);
+    DUP4_ARG2(__lsx_vsrai_d, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
+              dst0_l, dst1_l, dst2_l, dst3_l);
+    DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vpickev_h, dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vpickev_h, dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
+
+    reg0 = __lsx_vldrepl_d(src1_ptr, 0);
+    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+    dsth0 = __lsx_vilvl_d(reg1, reg0);
+    reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+    dsth1 = __lsx_vilvl_d(reg1, reg0);
+    src1_ptr += src2_stride_4x;
+    reg0 = __lsx_vldrepl_d(src1_ptr, 0);
+    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+    dsth2 = __lsx_vilvl_d(reg1, reg0);
+    reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+    reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+    dsth3 = __lsx_vilvl_d(reg1, reg0);
+
+    DUP4_ARG2(__lsx_vsadd_h, dsth0, tmp0, dsth1, tmp1, dsth2, tmp2, dsth3,
+              tmp3, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
+    __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
+    dst += dst_stride_4x;
+    __lsx_vstelm_w(out1, dst, 0, 0);
+    __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
+    __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
+    __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
+    dst -= dst_stride_4x;
+
+    src1_ptr -= src2_stride_4x;
+
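+    /* Second pass for columns 4..5: their bi-prediction samples sit at
+     * byte offset 8 within each int16_t row. */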
+    reg0 = __lsx_vldrepl_w(src1_ptr, 8);
+    reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
+    reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
+    reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
+    DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+    dsth4 = __lsx_vilvl_d(tmp1, tmp0);
+    src1_ptr += src2_stride_4x;
+
+    reg0 = __lsx_vldrepl_w(src1_ptr, 8);
+    reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
+    reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
+    reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
+    DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+    dsth5 = __lsx_vilvl_d(tmp1, tmp0);
+    DUP2_ARG2(__lsx_vsadd_h, dsth4, tmp4, dsth5, tmp5, tmp4, tmp5);
+    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
+    out0 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
+
+    __lsx_vstelm_h(out0, dst, 4, 0);
+    __lsx_vstelm_h(out0, dst + dst_stride, 4, 1);
+    __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 2);
+    __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 3);
+    dst += dst_stride_4x;
+    __lsx_vstelm_h(out0, dst, 4, 4);
+    __lsx_vstelm_h(out0, dst + dst_stride, 4, 5);
+    __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 6);
+    __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 7);
+}
+
+static av_always_inline
+void hevc_hv_4t_8x2_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+                        const int8_t *filter_x, const int8_t *filter_y)
+{
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+
+    __m128i out;
+    __m128i src0, src1, src2, src3, src4;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    __m128i dst0, dst1, dst2, dst3, dst4;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+    __m128i tmp0, tmp1;
+    __m128i in0, in1;
+
+    src0_ptr -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    src0 = __lsx_vld(src0_ptr, 0);
+    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
+              src1, src2, src3, src4);
+
+    DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr + src2_stride, 0, in0, in1);
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+              filt0, dst0, dst1, dst2, dst3);
+    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+
+    DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+    DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+    DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+    DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+              dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+              dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp0, tmp1);
+    out = __lsx_vssrlrni_bu_h(tmp1, tmp0, 7);
+    __lsx_vstelm_d(out, dst, 0, 0);
+    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
+}
+
+static av_always_inline
+void hevc_hv_4t_8multx4_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                            int16_t *src1_ptr, int32_t src2_stride,
+                            uint8_t *dst, int32_t dst_stride,
+                            const int8_t *filter_x, const int8_t *filter_y,
+                            int32_t width8mult)
+{
+    uint32_t cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t src2_stride_x = (src2_stride << 1);
+    int32_t src2_stride_2x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+    __m128i out0, out1;
+    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+    __m128i in0, in1, in2, in3;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+    src0_ptr -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    for (cnt = width8mult; cnt--;) {
+        src0 = __lsx_vld(src0_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+        src0_ptr += src_stride_4x;
+        src4 = __lsx_vld(src0_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+                  src5, src6);
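+        /* Rewind to the first source row and step right to the next
+         * 8-pixel stripe. */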
+        src0_ptr += (8 - src_stride_4x);
+
+        in0 = __lsx_vld(src1_ptr, 0);
+        DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+                  src2_stride_2x, in1, in2);
+        in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+        src1_ptr += 8;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+                  vec4, vec5);
+
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+                  dst0, dst1);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+                  vec6, vec7);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                  vec6, filt0, dst3, dst4, dst5, dst6);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+        DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+
+        DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+                  dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+        DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+                  tmp0, tmp1, tmp2, tmp3);
+        DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+                  tmp0, tmp1, tmp2, tmp3);
+        DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+        __lsx_vstelm_d(out0, dst, 0, 0);
+        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+        dst += 8;
+    }
+}
+
+static av_always_inline
+void hevc_hv_4t_8x6_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+                        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+                        const int8_t *filter_x, const int8_t *filter_y)
+{
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src2_stride_x = (src2_stride << 1);
+    int32_t src2_stride_2x = (src2_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+    __m128i out0, out1, out2;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i in0, in1, in2, in3, in4, in5;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+
+    src0_ptr -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    src0 = __lsx_vld(src0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src1, src2);
+    src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+    src0_ptr += src_stride_4x;
+    src4 = __lsx_vld(src0_ptr, 0);
+    DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+              src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
+              src5, src6, src7, src8);
+
+    in0 = __lsx_vld(src1_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
+              in1, in2);
+    in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+    src1_ptr += src2_stride_2x;
+    in4 = __lsx_vld(src1_ptr, 0);
+    in5 = __lsx_vldx(src1_ptr, src2_stride_x);
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+    DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec10, vec11);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec12, vec13);
+    DUP2_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, vec14, vec15);
+    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+              filt0, dst0, dst1, dst2, dst3);
+    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec10, filt0, vec12, filt0, vec14, filt0,
+              vec16, filt0, dst5, dst6, dst7, dst8);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec11, filt1, dst6, vec13, filt1,
+              dst7, vec15, filt1, dst8, vec17, filt1, dst5, dst6, dst7, dst8);
+
+    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+              dst10_r, dst21_r, dst32_r, dst43_r);
+    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+              dst10_l, dst21_l, dst32_l, dst43_l);
+    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+              dst54_r, dst65_r, dst76_r, dst87_r);
+    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+              dst54_l, dst65_l, dst76_l, dst87_l);
+
+    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+              dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+              dst2_r, dst2_l, dst3_r, dst3_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+              dst4_r, dst4_l, dst5_r, dst5_l);
+
+    DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+              dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+              dst2_r, dst2_l, dst3_r, dst3_l);
+    DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6,
+              dst4_r, dst4_l, dst5_r, dst5_l);
+    DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+              dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
+    DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vsadd_h, in4, tmp4, in5, tmp5, tmp4, tmp5);
+    DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
+    DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+    out2 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+    dst += dst_stride_4x;
+    __lsx_vstelm_d(out2, dst, 0, 0);
+    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+}
+
+static av_always_inline
+void hevc_hv_4t_8multx4mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                                int16_t *src1_ptr, int32_t src2_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter_x, const int8_t *filter_y,
+                                int32_t height, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src0_ptr_tmp;
+    int16_t *src1_ptr_tmp;
+    uint8_t *dst_tmp;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src2_stride_x = (src2_stride << 1);
+    const int32_t src2_stride_2x = (src2_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    const int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+    __m128i out0, out1;
+    __m128i src0, src1, src2, src3, src4, src5, src6;
+    __m128i in0, in1, in2, in3;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1, filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i tmp0, tmp1, tmp2, tmp3;
+    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+    __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
+
+    src0_ptr -= (src_stride + 1);
+
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
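+    /* Filter the block in 8-pixel-wide column stripes. */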
+    for (cnt = width >> 3; cnt--;) {
+        src0_ptr_tmp = src0_ptr;
+        dst_tmp = dst;
+        src1_ptr_tmp = src1_ptr;
+
+        src0 = __lsx_vld(src0_ptr_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+                  src_stride_2x, src1, src2);
+        src0_ptr_tmp += src_stride_3x;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+                  vec4, vec5);
+
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+                  dst0, dst1);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+        for (loop_cnt = height >> 2; loop_cnt--;) {
+            src3 = __lsx_vld(src0_ptr_tmp, 0);
+            DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+                      src_stride_2x, src4, src5);
+            src6 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+            src0_ptr_tmp += src_stride_4x;
+            in0 = __lsx_vld(src1_ptr_tmp, 0);
+            DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
+                      src2_stride_2x, in1, in2);
+            in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
+            src1_ptr_tmp += src2_stride_2x;
+
+            DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+                      src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+            DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+                      src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                      vec6, filt0, dst3, dst4, dst5, dst6);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+                      filt1, dst5, vec5, filt1, dst6, vec7, filt1,
+                      dst3, dst4, dst5, dst6);
+
+            DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+            DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+            DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+            DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+            DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+                      dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+                      dst2_r, dst2_l, dst3_r, dst3_l);
+            DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+                      dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+            DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+                      tmp0, tmp1, tmp2, tmp3);
+            DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, tmp0,
+                      tmp1, tmp2, tmp3);
+            DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+            __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+            __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+            __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+            __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+            dst_tmp += dst_stride_4x;
+
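+            /* Carry the last two filtered rows over as the start of the
+             * next four-row group. */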
+            dst10_r = dst54_r;
+            dst10_l = dst54_l;
+            dst21_r = dst65_r;
+            dst21_l = dst65_l;
+            dst2 = dst6;
+        }
+
+        src0_ptr += 8;
+        dst += 8;
+        src1_ptr += 8;
+    }
+}
+
+static void hevc_hv_4t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                              int16_t *src1_ptr, int32_t src2_stride,
+                              uint8_t *dst, int32_t dst_stride,
+                              const int8_t *filter_x, const int8_t *filter_y,
+                              int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_4t_8x2_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                           dst, dst_stride, filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                               dst, dst_stride, filter_x, filter_y, 1);
+    } else if (6 == height) {
+        hevc_hv_4t_8x6_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                           dst, dst_stride, filter_x, filter_y);
+    } else {
+        hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                dst, dst_stride, filter_x, filter_y, height, 8);
+    }
+}
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    if (4 == height) {
+        hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                               dst, dst_stride, filter_x, filter_y, 2);
+    } else {
+        hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                                dst, dst_stride, filter_x, filter_y, height, 16);
+    }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                            dst, dst_stride, filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+                               int16_t *src1_ptr, int32_t src2_stride,
+                               uint8_t *dst, int32_t dst_stride,
+                               const int8_t *filter_x, const int8_t *filter_y,
+                               int32_t height)
+{
+    hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+                            dst, dst_stride, filter_x, filter_y, height, 32);
+}
+
+#define BI_MC_COPY(WIDTH)                                                 \
+void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_lsx(uint8_t *dst,          \
+                                                   ptrdiff_t dst_stride,  \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int16_t *src_16bit,    \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)             \
+{                                                                         \
+    hevc_bi_copy_##WIDTH##w_lsx(src, src_stride, src_16bit, MAX_PB_SIZE,  \
+                                dst, dst_stride, height);                 \
+}
+
+BI_MC_COPY(4);
+BI_MC_COPY(6);
+BI_MC_COPY(8);
+BI_MC_COPY(12);
+BI_MC_COPY(16);
+BI_MC_COPY(24);
+BI_MC_COPY(32);
+BI_MC_COPY(48);
+BI_MC_COPY(64);
+
+#undef BI_MC_COPY
+
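+/* BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) defines a one-direction
+ * bi-prediction wrapper: FILT_DIR (mx or my) selects the subpel filter and
+ * DIR1 (hz/vt) names the underlying TAP-tap LSX routine. */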
+#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,          \
+                                                      ptrdiff_t dst_stride,  \
+                                                      uint8_t *src,          \
+                                                      ptrdiff_t src_stride,  \
+                                                      int16_t *src_16bit,    \
+                                                      int height,            \
+                                                      intptr_t mx,           \
+                                                      intptr_t my,           \
+                                                      int width)             \
+{                                                                            \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
+                                                                             \
+    hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit,        \
+                                          MAX_PB_SIZE, dst, dst_stride,      \
+                                          filter, height);                   \
+}
+
+BI_MC(qpel, h, 16, 8, hz, mx);
+BI_MC(qpel, h, 24, 8, hz, mx);
+BI_MC(qpel, h, 32, 8, hz, mx);
+BI_MC(qpel, h, 48, 8, hz, mx);
+BI_MC(qpel, h, 64, 8, hz, mx);
+
+BI_MC(qpel, v, 8, 8, vt, my);
+BI_MC(qpel, v, 16, 8, vt, my);
+BI_MC(qpel, v, 24, 8, vt, my);
+BI_MC(qpel, v, 32, 8, vt, my);
+BI_MC(qpel, v, 48, 8, vt, my);
+BI_MC(qpel, v, 64, 8, vt, my);
+
+BI_MC(epel, h, 24, 4, hz, mx);
+BI_MC(epel, h, 32, 4, hz, mx);
+
+BI_MC(epel, v, 12, 4, vt, my);
+BI_MC(epel, v, 16, 4, vt, my);
+BI_MC(epel, v, 24, 4, vt, my);
+BI_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_MC
+
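+/* BI_MC_HV(PEL, WIDTH, TAP) defines the 2-D wrappers; mx and my each index
+ * the filter table for the horizontal and vertical taps. */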
+#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
+void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,          \
+                                                   ptrdiff_t dst_stride,  \
+                                                   uint8_t *src,          \
+                                                   ptrdiff_t src_stride,  \
+                                                   int16_t *src_16bit,    \
+                                                   int height,            \
+                                                   intptr_t mx,           \
+                                                   intptr_t my,           \
+                                                   int width)             \
+{                                                                         \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
+                                                                          \
+    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit,           \
+                                    MAX_PB_SIZE, dst, dst_stride,         \
+                                    filter_x, filter_y, height);          \
+}
+
+BI_MC_HV(qpel, 8, 8);
+BI_MC_HV(qpel, 16, 8);
+BI_MC_HV(qpel, 24, 8);
+BI_MC_HV(qpel, 32, 8);
+BI_MC_HV(qpel, 48, 8);
+BI_MC_HV(qpel, 64, 8);
+
+BI_MC_HV(epel, 8, 4);
+BI_MC_HV(epel, 6, 4);
+BI_MC_HV(epel, 16, 4);
+BI_MC_HV(epel, 24, 4);
+BI_MC_HV(epel, 32, 4);
+
+#undef BI_MC_HV
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index f39674be64..e05fe780c2 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -86,6 +86,57 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
             c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
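+            /* bi tables are indexed [idx][!!my][!!mx]; idx maps the block
+             * width: 1:4, 2:6, 3:8, 4:12, 5:16, 6:24, 7:32, 8:48, 9:64 */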
 
+            c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_lsx;
+            c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_lsx;
+            c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_lsx;
+            c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_lsx;
+            c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_lsx;
+            c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_lsx;
+            c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_lsx;
+            c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_lsx;
+
+            c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_lsx;
+            c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_lsx;
+            c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_lsx;
+            c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_lsx;
+            c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_lsx;
+            c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_lsx;
+            c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_lsx;
+
+            c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_lsx;
+            c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_lsx;
+            c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_lsx;
+            c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_lsx;
+            c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_lsx;
+            c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_lsx;
+
+            c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_lsx;
+            c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_lsx;
+            c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_lsx;
+            c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_lsx;
+            c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_lsx;
+            c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_lsx;
+
+            c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_lsx;
+            c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_lsx;
+            c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_lsx;
+            c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_lsx;
+            c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_lsx;
+
+            c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_lsx;
+            c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lsx;
+
+            c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_lsx;
+            c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_lsx;
+            c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_lsx;
+            c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_lsx;
+
+            c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_lsx;
+            c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_lsx;
+            c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_lsx;
+            c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
+            c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
+
             c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
 
             c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 0e73fd1f8e..6a834290ab 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -85,6 +85,63 @@ MC(epel, hv, 32);
 
 #undef MC
 
+#define BI_MC(PEL, DIR, WIDTH)                                               \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,          \
+                                                      ptrdiff_t dst_stride,  \
+                                                      uint8_t *src,          \
+                                                      ptrdiff_t src_stride,  \
+                                                      int16_t *src_16bit,    \
+                                                      int height,            \
+                                                      intptr_t mx,           \
+                                                      intptr_t my,           \
+                                                      int width)
+
+BI_MC(pel, pixels, 4);
+BI_MC(pel, pixels, 6);
+BI_MC(pel, pixels, 8);
+BI_MC(pel, pixels, 12);
+BI_MC(pel, pixels, 16);
+BI_MC(pel, pixels, 24);
+BI_MC(pel, pixels, 32);
+BI_MC(pel, pixels, 48);
+BI_MC(pel, pixels, 64);
+
+BI_MC(qpel, h, 16);
+BI_MC(qpel, h, 24);
+BI_MC(qpel, h, 32);
+BI_MC(qpel, h, 48);
+BI_MC(qpel, h, 64);
+
+BI_MC(qpel, v, 8);
+BI_MC(qpel, v, 16);
+BI_MC(qpel, v, 24);
+BI_MC(qpel, v, 32);
+BI_MC(qpel, v, 48);
+BI_MC(qpel, v, 64);
+
+BI_MC(qpel, hv, 8);
+BI_MC(qpel, hv, 16);
+BI_MC(qpel, hv, 24);
+BI_MC(qpel, hv, 32);
+BI_MC(qpel, hv, 48);
+BI_MC(qpel, hv, 64);
+
+BI_MC(epel, h, 24);
+BI_MC(epel, h, 32);
+
+BI_MC(epel, v, 12);
+BI_MC(epel, v, 16);
+BI_MC(epel, v, 24);
+BI_MC(epel, v, 32);
+
+BI_MC(epel, hv, 6);
+BI_MC(epel, hv, 8);
+BI_MC(epel, hv, 16);
+BI_MC(epel, hv, 24);
+BI_MC(epel, hv, 32);
+
+#undef BI_MC
+
 void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
                                       int32_t beta, int32_t *tc,
                                       uint8_t *p_is_pcm, uint8_t *q_is_pcm);
-- 
2.20.1

* [FFmpeg-devel] [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
  2022-02-14  8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
                   ` (3 preceding siblings ...)
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi " Hao Chen
@ 2022-02-14  8:25 ` Hao Chen
  2022-02-15  3:30 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
  5 siblings, 0 replies; 7+ messages in thread
From: Hao Chen @ 2022-02-14  8:25 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Lu Wang

From: Lu Wang <wanglu@loongson.cn>

ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 182fps
after : 191fps
---
 libavcodec/loongarch/Makefile                 |    4 +-
 libavcodec/loongarch/hevc_mc_uni_lsx.c        | 1423 +++++++++++++++++
 libavcodec/loongarch/hevc_mc_uniw_lsx.c       |  298 ++++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   30 +
 libavcodec/loongarch/hevcdsp_lsx.h            |   59 +
 5 files changed, 1813 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/loongarch/hevc_mc_uni_lsx.c
 create mode 100644 libavcodec/loongarch/hevc_mc_uniw_lsx.c

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 620fba7192..c1b5de5c44 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,4 +28,6 @@ LSX-OBJS-$(CONFIG_VP9_DECODER)        += loongarch/vp9_mc_lsx.o \
 LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_idct_lsx.o \
                                          loongarch/hevc_lpf_sao_lsx.o \
-                                         loongarch/hevc_mc_bi_lsx.o
+                                         loongarch/hevc_mc_bi_lsx.o \
+                                         loongarch/hevc_mc_uni_lsx.o \
+                                         loongarch/hevc_mc_uniw_lsx.o
diff --git a/libavcodec/loongarch/hevc_mc_uni_lsx.c b/libavcodec/loongarch/hevc_mc_uni_lsx.c
new file mode 100644
index 0000000000..a15c86268f
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_uni_lsx.c
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    /* 4 width cases */
+    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static av_always_inline
+void common_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+                          uint8_t *dst, int32_t dst_stride,
+                          const int8_t *filter, int32_t height)
+{
+    int32_t loop_cnt;
+    __m128i mask0, mask1, mask2, mask3, out1, out2;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i res0, res1, res2, res3;
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    src -= 3;
+
+    /* rearranging filter */
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
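+    /* mask1..mask3 are mask0 advanced by 2, 4 and 6 bytes, so the four
+     * shuffles together gather all eight taps' worth of neighbours */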
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (loop_cnt = height; loop_cnt--;) {
+        DUP4_ARG2(__lsx_vld, src, 0, src, 8, src, 16, src, 24,
+                  src0, src1, src2, src3);
+        DUP4_ARG2(__lsx_vld, src, 32, src, 40, src, 48, src, 56,
+                  src4, src5, src6, src7);
+        src += src_stride;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+                  vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, res0, res1, res2, res3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
+                  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+                  vec6, vec7);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
+                  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+                  vec6, vec7);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
+                  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
+
+        DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
+                  out1, out2);
+        __lsx_vst(out1, dst, 0);
+        __lsx_vst(out2, dst, 16);
+
+        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
+                  vec2, vec3);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+                  vec3, filt0, res0, res1, res2, res3);
+        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
+                  vec2, vec3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
+                  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
+        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
+                  vec6, vec7);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
+                  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
+        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
+                  vec6, vec7);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
+                  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
+
+        DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
+                  out1, out2);
+        __lsx_vst(out1, dst, 32);
+        __lsx_vst(out2, dst, 48);
+        dst += dst_stride;
+    }
+}
+
+static av_always_inline
+void common_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+                         uint8_t *dst, int32_t dst_stride,
+                         const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_4x = (src_stride << 2);
+    int32_t dst_stride_4x = (dst_stride << 2);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+    __m128i tmp0, tmp1;
+    __m128i out0_r, out1_r, out2_r, out3_r;
+
+    src -= src_stride_3x;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+              filt0, filt1, filt2, filt3);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src3 = __lsx_vldx(src, src_stride_3x);
+    src += src_stride_4x;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+    src += src_stride_3x;
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              src10_r, src32_r, src54_r, src21_r);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
+    for (loop_cnt = (height >> 2); loop_cnt--;) {
+        src7 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+        src10 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+
+        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+                  src9, src76_r, src87_r, src98_r, src109_r);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+                  filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
+                  src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
+                  filt1, out0_r, out1_r, out2_r, out3_r);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
+                  src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
+                  filt2, out0_r, out1_r, out2_r, out3_r);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
+                  src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
+                  filt3, out0_r, out1_r, out2_r, out3_r);
+
+        DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
+                  tmp0, tmp1);
+        __lsx_vstelm_d(tmp0, dst, 0, 0);
+        __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
+        dst += dst_stride_4x;
+
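+        /* slide the vertical filter window down four rows: recycle the
+         * interleaved row pairs and keep src6 as the newest row */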
+        src10_r = src54_r;
+        src32_r = src76_r;
+        src54_r = src98_r;
+        src21_r = src65_r;
+        src43_r = src87_r;
+        src65_r = src109_r;
+        src6 = src10;
+    }
+}
+
+static av_always_inline
+void common_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                          int32_t dst_stride, const int8_t *filter,
+                          int32_t height, int32_t width)
+{
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    uint32_t loop_cnt, cnt;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+    __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+    __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+    __m128i tmp0, tmp1, tmp2, tmp3;
+    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+    src -= src_stride_3x;
+    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, filt0,
+              filt1, filt2, filt3);
+
+    for (cnt = (width >> 4); cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        src0 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src_tmp, src_stride_3x);
+        src_tmp += src_stride_4x;
+        src4 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src5, src6);
+        src_tmp += src_stride_3x;
+        DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+                  src10_r, src32_r, src54_r, src21_r);
+        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+        DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+                  src10_l, src32_l, src54_l, src21_l);
+        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            src7 = __lsx_vld(src_tmp, 0);
+            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                      src8, src9);
+            src10 = __lsx_vldx(src_tmp, src_stride_3x);
+            src_tmp += src_stride_4x;
+            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+                      src9, src76_r, src87_r, src98_r, src109_r);
+            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
+                      src9, src76_l, src87_l, src98_l, src109_l);
+            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+                      filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
+                      src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
+                      filt1, out0_r, out1_r, out2_r, out3_r);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
+                      src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
+                      filt2, out0_r, out1_r, out2_r, out3_r);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
+                      src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
+                      filt3, out0_r, out1_r, out2_r, out3_r);
+            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
+                      filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
+                      src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
+                      filt1, out0_l, out1_l, out2_l, out3_l);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
+                      src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
+                      filt2, out0_l, out1_l, out2_l, out3_l);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
+                      src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
+                      filt3, out0_l, out1_l, out2_l, out3_l);
+            DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
+                      6, out2_l, out2_r, 6, out3_l, out3_r, 6,
+                      tmp0, tmp1, tmp2, tmp3);
+            __lsx_vst(tmp0, dst_tmp, 0);
+            __lsx_vstx(tmp1, dst_tmp, dst_stride);
+            __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
+            __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
+            dst_tmp += dst_stride_4x;
+
+            src10_r = src54_r;
+            src32_r = src76_r;
+            src54_r = src98_r;
+            src21_r = src65_r;
+            src43_r = src87_r;
+            src65_r = src109_r;
+            src10_l = src54_l;
+            src32_l = src76_l;
+            src54_l = src98_l;
+            src21_l = src65_l;
+            src43_l = src87_l;
+            src65_l = src109_l;
+            src6 = src10;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+}
+
+static void common_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 16);
+    common_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, filter,
+                        height);
+}
+
+static void common_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
+}
+
+static void common_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 48);
+}
+
+static void common_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height)
+{
+    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 64);
+}
+
+static av_always_inline
+void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+
+    __m128i out;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+    __m128i mask1, mask2, mask3;
+    __m128i filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+    __m128i dst21_r, dst43_r, dst65_r, dst87_r;
+    __m128i dst21_l, dst43_l, dst65_l, dst87_l;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src -= (src_stride_3x + 3);
+    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+              filter_x, 6, filt0, filt1, filt2, filt3);
+
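+    /* sign-extend the eight vertical taps to 16 bits and broadcast them
+     * pairwise for the widening dot products */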
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        src0 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src_tmp, src_stride_3x);
+        src_tmp += src_stride_4x;
+        src4 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src5, src6);
+        src_tmp += src_stride_3x;
+
+        /* row 0 row 1 row 2 row 3 */
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+                  vec12, filt0, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+
+        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+        DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+                  dst1, dst10_r, dst32_r, dst54_r, dst21_r);
+        DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+                  dst1, dst10_l, dst32_l, dst54_l, dst21_l);
+        DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
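+        /* vertical 8-tap over the 16-bit horizontal results, two output
+         * rows per iteration */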
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            src7 = __lsx_vld(src_tmp, 0);
+            src8 = __lsx_vldx(src_tmp, src_stride);
+            src_tmp += src_stride_2x;
+
+            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+                      filt2, dst7, dst7);
+            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+            dst76_r = __lsx_vilvl_h(dst7, dst6);
+            dst76_l = __lsx_vilvh_h(dst7, dst6);
+            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+                      dst0_r, dst0_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+                      dst76_l, filt_h3, dst0_r, dst0_l);
+            DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
+
+            DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
+                      src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
+            dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
+                      filt2, dst8, dst8);
+            dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
+
+            dst87_r = __lsx_vilvl_h(dst8, dst7);
+            dst87_l = __lsx_vilvh_h(dst8, dst7);
+            DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
+                      dst1_r, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
+                      dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
+                      dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
+            DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
+                      dst87_l, filt_h3, dst1_r, dst1_l);
+            DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l,
+                      6, dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
+                      dst0_l, dst0_r, dst1_l, dst1_r);
+            DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+                      dst0, dst1);
+            out = __lsx_vpickev_b(dst1, dst0);
+            __lsx_vstelm_d(out, dst_tmp, 0, 0);
+            __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1);
+            dst_tmp += dst_stride_2x;
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+        src += 8;
+        dst += 8;
+    }
+}
+
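+/* all hv widths reuse the 8-column core above, iterating width / 8 times */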
+static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                              int32_t dst_stride, const int8_t *filter_x,
+                              const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+                       filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                               int32_t dst_stride, const int8_t *filter_x,
+                               const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+                       filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                               int32_t dst_stride, const int8_t *filter_x,
+                               const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+                       filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                               int32_t dst_stride, const int8_t *filter_x,
+                               const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+                       filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                               int32_t dst_stride, const int8_t *filter_x,
+                               const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+                       filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                               int32_t dst_stride, const int8_t *filter_x,
+                               const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+                       filter_x, filter_y, height, 64);
+}
+
+static av_always_inline
+void common_vt_4t_24w_lsx(uint8_t *src, int32_t src_stride,
+                          uint8_t *dst, int32_t dst_stride,
+                          const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    uint8_t *_src;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i src11, filt0, filt1;
+    __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+    __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
+    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
+    __m128i out1, out2, out3, out4;
+
+    src -= src_stride;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
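+    /* src covers the left 16 columns, _src the remaining 8 */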
+    _src = src + 16;
+
+    /* 16 width */
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+    /* 8 width */
+    src6 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+    src += src_stride_3x;
+    _src += src_stride_3x;
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
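+    /* four output rows per iteration; the fixed count of 8 gives 32 rows */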
+    for (loop_cnt = 8; loop_cnt--;) {
+        /* 16 width */
+        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 8 width */
+        src += src_stride_2x;
+        _src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+
+        /* 16 width */
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+                  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
+                  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
+                  out0_r, out0_l, out1_r, out1_l);
+
+        /* 8 width */
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+                  out2_r, out3_r);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
+                  src109_r, filt1, out2_r, out3_r);
+
+        /* 16 + 8 width */
+        DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
+                  out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4);
+        __lsx_vst(out1, dst, 0);
+        __lsx_vstelm_d(out2, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vst(out4, dst, 0);
+        __lsx_vstelm_d(out3, dst, 16, 0);
+        dst += dst_stride;
+
+        /* 16 width */
+        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+        DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+        DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+        /* 8 width */
+        src += src_stride_2x;
+        _src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+
+        /* 16 width */
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+                  filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l,
+                  filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1,
+                  out0_r, out0_l, out1_r, out1_l);
+
+        /* 8 width */
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+                  out2_r, out3_r);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r,
+                  src87_r, filt1, out2_r, out3_r);
+
+        /* 16 + 8 width */
+        DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
+                  out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);
+
+        __lsx_vst(out1, dst, 0);
+        __lsx_vstelm_d(out2, dst, 16, 0);
+        dst += dst_stride;
+        __lsx_vst(out3, dst, 0);
+        __lsx_vstelm_d(out4, dst, 16, 0);
+        dst += dst_stride;
+    }
+}
+
+static av_always_inline
+void common_vt_4t_32w_lsx(uint8_t *src, int32_t src_stride,
+                          uint8_t *dst, int32_t dst_stride,
+                          const int8_t *filter, int32_t height)
+{
+    uint32_t loop_cnt;
+    int32_t src_stride_2x = (src_stride << 1);
+    int32_t dst_stride_2x = (dst_stride << 1);
+    int32_t src_stride_3x = src_stride_2x + src_stride;
+    uint8_t *_src;
+
+    __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+    __m128i src10_r, src32_r, src76_r, src98_r;
+    __m128i src21_r, src43_r, src87_r, src109_r;
+    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+    __m128i src10_l, src32_l, src76_l, src98_l;
+    __m128i src21_l, src43_l, src87_l, src109_l;
+    __m128i filt0, filt1;
+    __m128i out1, out2;
+
+    src -= src_stride;
+    DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
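+    /* src covers the left 16 columns, _src the right 16 */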
+    _src = src + 16;
+
+    /* 16 width */
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+    /* next 16 width */
+    src6 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+    src += src_stride_3x;
+    _src += src_stride_3x;
+
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+    for (loop_cnt = (height >> 1); loop_cnt--;) {
+        /* 16 width */
+        DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+        /* 16 width */
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+                  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
+                  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
+                  out0_r, out0_l, out1_r, out1_l);
+
+        DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
+                  out1, out2);
+        __lsx_vst(out1, dst, 0);
+        __lsx_vstx(out2, dst, dst_stride);
+
+        src10_r = src32_r;
+        src21_r = src43_r;
+        src10_l = src32_l;
+        src21_l = src43_l;
+        src2 = src4;
+
+        /* next 16 width */
+        src += src_stride_2x;
+        _src += src_stride_2x;
+        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+        DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
+
+        /* next 16 width */
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
+                  filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
+                  filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
+                  out2_r, out2_l, out3_r, out3_l);
+
+        /* next 16 width */
+        DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
+                  out1, out2);
+        __lsx_vst(out1, dst, 16);
+        __lsx_vst(out2, dst + dst_stride, 16);
+
+        dst += dst_stride_2x;
+
+        src76_r = src98_r;
+        src87_r = src109_r;
+        src76_l = src98_l;
+        src87_l = src109_l;
+        src8 = src10;
+    }
+}
+
+static av_always_inline
+void hevc_hv_4t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y)
+{
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    __m128i out;
+    __m128i src0, src1, src2, src3, src4;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1, filter_vec;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    __m128i dst0, dst1, dst2, dst3, dst4;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+    __m128i out0_r, out1_r;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+    src0 = __lsx_vld(src, 0);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+              filt0, dst0, dst1, dst2, dst3);
+    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+              dst10_r, dst21_r, dst32_r, dst43_r);
+    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+              dst10_l, dst21_l, dst32_l, dst43_l);
+    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+              dst0_r, dst0_l, dst1_r, dst1_l);
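+    /* the truncating >>6 narrows the 32-bit sums to the 16-bit intermediate;
+     * the saturating rounding >>6 below yields the final 8-bit pixels */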
+    DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+              out0_r, out1_r);
+    out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
+    __lsx_vstelm_d(out, dst, 0, 0);
+    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
+}
+
+static av_always_inline
+void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                            int32_t dst_stride, const int8_t *filter_x,
+                            const int8_t *filter_y, int32_t width8mult)
+{
+    uint32_t cnt;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+    __m128i out0, out1;
+    __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    for (cnt = width8mult; cnt--;) {
+        src0 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+        src3 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src4 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
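+        /* step back up four rows and move 8 pixels right for the next
+         * 8-column pass */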
+        src += (8 - src_stride_4x);
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+                  vec4, vec5);
+
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+                  dst0, dst1);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+        DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+                  vec4, vec5);
+        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+                  vec6, vec7);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                  vec6, filt0, dst3, dst4, dst5, dst6);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+                  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+
+        DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
+                  dst5, dst32_r, dst43_r, dst54_r, dst65_r);
+        DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
+                  dst5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+
+        DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+                  dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
+        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+        __lsx_vstelm_d(out0, dst, 0, 0);
+        __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+        dst += 8;
+    }
+}
+
+static av_always_inline
+void hevc_hv_4t_8x6_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y)
+{
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    __m128i out0, out1, out2;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1, filter_vec;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+    __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    src0 = __lsx_vld(src, 0);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+    src += src_stride_4x;
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+              src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
+              mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
+    DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
+              mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
+    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+              filt0, dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
+              filt0, dst4, dst5, dst6, dst7);
+    dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
+              vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
+    dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
+
+    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+              dst10_r, dst21_r, dst32_r, dst43_r);
+    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+              dst10_l, dst21_l, dst32_l, dst43_l);
+    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+              dst54_r, dst65_r, dst76_r, dst87_r);
+    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+              dst54_l, dst65_l, dst76_l, dst87_l);
+
+    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+              dst0_r, dst0_l, dst1_r, dst1_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+              dst2_r, dst2_l, dst3_r, dst3_l);
+    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+              dst4_r, dst4_l, dst5_r, dst5_l);
+
+    DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
+              dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
+    DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
+              out4_r, out5_r);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
+              out0, out1);
+    out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+    dst += dst_stride_4x;
+    __lsx_vstelm_d(out2, dst, 0, 0);
+    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+}
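+
+/* A note on the 8x6 block above: with a 4-tap vertical filter, 6 output
+ * rows need 6 + 4 - 1 = 9 input rows, so src0..src8 are all loaded up
+ * front and the whole block is computed without a row loop. */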
+
+static av_always_inline
+void hevc_hv_4t_8multx4mult_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                                int32_t dst_stride, const int8_t *filter_x,
+                                const int8_t *filter_y, int32_t height,
+                                int32_t width8mult)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+    __m128i out0, out1;
+    __m128i src0, src1, src2, src3, src4, src5, src6;
+    __m128i filt0, filt1;
+    __m128i filt_h0, filt_h1, filter_vec;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    __m128i mask1;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+    __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
+    __m128i out0_r, out1_r, out2_r, out3_r;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    for (cnt = width8mult; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        src0 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src1, src2);
+        src_tmp += src_stride_3x;
+
+        DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+                  vec0, vec1);
+        DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+                  vec2, vec3);
+        DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+                  vec4, vec5);
+
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+                  dst0, dst1);
+        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+        for (loop_cnt = (height >> 2); loop_cnt--;) {
+            src3 = __lsx_vld(src_tmp, 0);
+            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                      src4, src5);
+            src6 = __lsx_vldx(src_tmp, src_stride_3x);
+            src_tmp += src_stride_4x;
+
+            DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+                      src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+            DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+                      src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                      vec6, filt0, dst3, dst4, dst5, dst6);
+            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+                      filt1, dst5, vec5, filt1, dst6, vec7, filt1,
+                      dst3, dst4, dst5, dst6);
+
+            DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
+                      dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
+            DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
+                      dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+            DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+                      dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
+                      out2_r, out3_r);
+            DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
+                      6, out0, out1);
+            __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+            __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+            __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+            __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+            dst_tmp += dst_stride_4x;
+
+            dst10_r = dst54_r;
+            dst10_l = dst54_l;
+            dst21_r = dst65_r;
+            dst21_l = dst65_l;
+            dst2 = dst6;
+        }
+        src += 8;
+        dst += 8;
+    }
+}
+
+static
+void hevc_hv_4t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                       int32_t dst_stride, const int8_t *filter_x,
+                       const int8_t *filter_y, int32_t height)
+{
+    if (2 == height) {
+        hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
+    } else if (4 == height) {
+        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, 1);
+    } else if (6 == height) {
+        hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
+    } else if (0 == (height & 0x03)) {
+        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 1);
+    }
+}
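+
+/* The dispatch above covers every height produced for 8-wide blocks
+ * (2, 4, 6, or a multiple of 4); any other value would store nothing. */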
+
+static av_always_inline
+void hevc_hv_4t_12w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height)
+{
+    uint32_t loop_cnt;
+    uint8_t *src_tmp, *dst_tmp;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t dst_stride_4x = (dst_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+    __m128i out0, out1;
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i mask0, mask1, mask2, mask3;
+    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
+    __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
+    __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+    __m128i dst76_r, dst98_r, dst87_r, dst109_r;
+    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    src -= (src_stride + 1);
+    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+    mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+    mask1 = __lsx_vaddi_bu(mask0, 2);
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+              src1, src2);
+    src_tmp += src_stride_3x;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
+    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
+              dsth0, dsth1);
+    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
+
+    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
+    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
+
+    for (loop_cnt = 4; loop_cnt--;) {
+        src3 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src4, src5);
+        src6 = __lsx_vldx(src_tmp, src_stride_3x);
+        src_tmp += src_stride_4x;
+
+        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+                  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+                  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                  vec6, filt0, dsth3, dsth4, dsth5, dsth6);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
+                  vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
+                  dsth3, dsth4, dsth5, dsth6);
+
+        DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
+                  dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
+        DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
+                  dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+                  dst0_r, dst0_l, dst1_r, dst1_l);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+                  dst2_r, dst2_l, dst3_r, dst3_l);
+
+        DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
+                  dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
+        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+
+        __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+        __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+        __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+        __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+        dst_tmp += dst_stride_4x;
+
+        dst10_r = dst54_r;
+        dst10_l = dst54_l;
+        dst21_r = dst65_r;
+        dst21_l = dst65_l;
+        dsth2 = dsth6;
+    }
+
+    src += 8;
+    dst += 8;
+
+    mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
+    mask3 = __lsx_vaddi_bu(mask2, 2);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+    src += src_stride_3x;
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
+    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
+              dst10, dst21);
+
+    dst10_r = __lsx_vilvl_h(dst21, dst10);
+    dst21_r = __lsx_vilvh_h(dst21, dst10);
+    dst22 = __lsx_vreplvei_d(dst21, 1);
+
+    for (loop_cnt = 2; loop_cnt--;) {
+        src3 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
+        src6 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        src7 = __lsx_vld(src, 0);
+        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+        src10 = __lsx_vldx(src, src_stride_3x);
+        src += src_stride_4x;
+        DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8,
+                  src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10,
+                  src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7);
+
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+                  vec6, filt0, dst73, dst84, dst95, dst106);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
+                  filt1, dst95, vec5, filt1, dst106, vec7, filt1,
+                  dst73, dst84, dst95, dst106);
+
+        dst32_r = __lsx_vilvl_h(dst73, dst22);
+        DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r);
+        DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
+        dst65_r = __lsx_vilvl_h(dst106, dst95);
+        dst109_r = __lsx_vilvh_h(dst106, dst95);
+        dst22 = __lsx_vreplvei_d(dst73, 1);
+        dst76_r = __lsx_vilvl_h(dst22, dst106);
+
+        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+                  filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3);
+        DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
+                  filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r,
+                  filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1,
+                  dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r,
+                  filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1,
+                  dst4, dst5, dst6, dst7);
+
+        DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4,
+                  6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3);
+        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+
+        __lsx_vstelm_w(out0, dst, 0, 0);
+        __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
+        __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
+        __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
+        dst += dst_stride_4x;
+        __lsx_vstelm_w(out1, dst, 0, 0);
+        __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
+        __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
+        __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
+        dst += dst_stride_4x;
+
+        dst10_r = dst98_r;
+        dst21_r = dst109_r;
+        dst22 = __lsx_vreplvei_d(dst106, 1);
+    }
+}
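+
+/* The 12-wide case above is handled as an 8-wide column (first loop, using
+ * mask0/mask1) followed by a 4-wide column (second loop, using mask2/mask3,
+ * which pack two 4-pixel rows into one vector, hence the eight input rows
+ * consumed per iteration). */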
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                               int32_t dst_stride, const int8_t *filter_x,
+                               const int8_t *filter_y, int32_t height)
+{
+    if (4 == height) {
+        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, filter_x,
+                               filter_y, 2);
+    } else {
+        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+                                   filter_x, filter_y, height, 2);
+    }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                               int32_t dst_stride, const int8_t *filter_x,
+                               const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 3);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                               int32_t dst_stride, const int8_t *filter_x,
+                               const int8_t *filter_y, int32_t height)
+{
+    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+                               filter_x, filter_y, height, 4);
+}
+
+#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,           \
+                                                       ptrdiff_t dst_stride,   \
+                                                       uint8_t *src,           \
+                                                       ptrdiff_t src_stride,   \
+                                                       int height,             \
+                                                       intptr_t mx,            \
+                                                       intptr_t my,            \
+                                                       int width)              \
+{                                                                              \
+    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
+                                                                               \
+    common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride,  \
+                                            filter, height);                   \
+}
+
+UNI_MC(qpel, h, 64, 8, hz, mx);
+
+UNI_MC(qpel, v, 24, 8, vt, my);
+UNI_MC(qpel, v, 32, 8, vt, my);
+UNI_MC(qpel, v, 48, 8, vt, my);
+UNI_MC(qpel, v, 64, 8, vt, my);
+
+UNI_MC(epel, v, 24, 4, vt, my);
+UNI_MC(epel, v, 32, 4, vt, my);
+
+#undef UNI_MC
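+
+/* For illustration, one instantiation hand-expanded (a sketch of the code
+ * the macro generates): UNI_MC(qpel, v, 24, 8, vt, my) defines
+ *
+ *   void ff_hevc_put_hevc_uni_qpel_v24_8_lsx(uint8_t *dst,
+ *                                            ptrdiff_t dst_stride,
+ *                                            uint8_t *src,
+ *                                            ptrdiff_t src_stride,
+ *                                            int height, intptr_t mx,
+ *                                            intptr_t my, int width)
+ *   {
+ *       const int8_t *filter = ff_hevc_qpel_filters[my - 1];
+ *
+ *       common_vt_8t_24w_lsx(src, src_stride, dst, dst_stride,
+ *                            filter, height);
+ *   }
+ */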
+
+#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
+void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,          \
+                                                    ptrdiff_t dst_stride,  \
+                                                    uint8_t *src,          \
+                                                    ptrdiff_t src_stride,  \
+                                                    int height,            \
+                                                    intptr_t mx,           \
+                                                    intptr_t my,           \
+                                                    int width)             \
+{                                                                          \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
+                                                                           \
+    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride,      \
+                                    filter_x, filter_y, height);           \
+}
+
+UNI_MC_HV(qpel, 8, 8);
+UNI_MC_HV(qpel, 16, 8);
+UNI_MC_HV(qpel, 24, 8);
+UNI_MC_HV(qpel, 32, 8);
+UNI_MC_HV(qpel, 48, 8);
+UNI_MC_HV(qpel, 64, 8);
+
+UNI_MC_HV(epel, 8, 4);
+UNI_MC_HV(epel, 12, 4);
+UNI_MC_HV(epel, 16, 4);
+UNI_MC_HV(epel, 24, 4);
+UNI_MC_HV(epel, 32, 4);
+
+#undef UNI_MC_HV
diff --git a/libavcodec/loongarch/hevc_mc_uniw_lsx.c b/libavcodec/loongarch/hevc_mc_uniw_lsx.c
new file mode 100644
index 0000000000..118f5b820e
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_uniw_lsx.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *                Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+    /* 8 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    /* 4 width cases */
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
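+
+/* mask0 pairs each pixel with its right neighbour ({0,1, 1,2, 2,3, ...}),
+ * so one __lsx_vshuf_b followed by __lsx_vdp2_h_bu_b yields a 2-tap partial
+ * dot product per output pixel; mask1..mask3 (mask0 + 2/4/6) slide that
+ * window so four such products cover all 8 taps of the qpel filter. */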
+
+static av_always_inline
+void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val, int32_t width)
+{
+    uint32_t loop_cnt, cnt;
+    uint8_t *src_tmp;
+    uint8_t *dst_tmp;
+    const int32_t src_stride_2x = (src_stride << 1);
+    const int32_t dst_stride_2x = (dst_stride << 1);
+    const int32_t src_stride_4x = (src_stride << 2);
+    const int32_t src_stride_3x = src_stride_2x + src_stride;
+
+    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m128i filt0, filt1, filt2, filt3;
+    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+    __m128i mask1, mask2, mask3;
+    __m128i filter_vec;
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+    __m128i dst21_r, dst43_r, dst65_r, dst87_r;
+    __m128i dst21_l, dst43_l, dst65_l, dst87_l;
+    __m128i weight_vec, offset_vec, rnd_vec;
+    __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+    src -= (src_stride_3x + 3);
+    weight_vec = __lsx_vreplgr2vr_w(weight);
+    offset_vec = __lsx_vreplgr2vr_w(offset);
+    rnd_vec = __lsx_vreplgr2vr_w(rnd_val);
+
+    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+              filter_x, 6, filt0, filt1, filt2, filt3);
+    filter_vec = __lsx_vld(filter_y, 0);
+    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+    mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (cnt = width >> 3; cnt--;) {
+        src_tmp = src;
+        dst_tmp = dst;
+
+        src0 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src1, src2);
+        src3 = __lsx_vldx(src_tmp, src_stride_3x);
+        src_tmp += src_stride_4x;
+        src4 = __lsx_vld(src_tmp, 0);
+        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+                  src5, src6);
+        src_tmp += src_stride_3x;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+                  vec12, filt0, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+        DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+                  dst1, dst10_r, dst32_r, dst54_r, dst21_r);
+        DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+        DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+                  dst1, dst10_l, dst32_l, dst54_l, dst21_l);
+        DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+        for (loop_cnt = height >> 1; loop_cnt--;) {
+            src7 = __lsx_vld(src_tmp, 0);
+            src8 = __lsx_vldx(src_tmp, src_stride);
+            src_tmp += src_stride_2x;
+            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+                      filt2, dst7, dst7);
+            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+            dst76_r = __lsx_vilvl_h(dst7, dst6);
+            dst76_l = __lsx_vilvh_h(dst7, dst6);
+            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+                      dst0_r, dst0_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+                      dst76_l, filt_h3, dst0_r, dst0_l);
+            DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
+
+            /* row 8 */
+            DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
+                      src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
+            dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
+            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
+                      filt2, dst8, dst8);
+            dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
+
+            dst87_r = __lsx_vilvl_h(dst8, dst7);
+            dst87_l = __lsx_vilvh_h(dst8, dst7);
+            DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
+                      dst1_r, dst1_l);
+            DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
+                      dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
+                      dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
+            DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
+                      dst87_l, filt_h3, dst1_r, dst1_l);
+            DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
+
+            DUP2_ARG2(__lsx_vmul_w, dst0_r, weight_vec, dst0_l, weight_vec,
+                      dst0_r, dst0_l);
+            DUP2_ARG2(__lsx_vmul_w, dst1_r, weight_vec, dst1_l, weight_vec,
+                      dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vsrar_w, dst0_r, rnd_vec, dst1_r, rnd_vec, dst0_l,
+                      rnd_vec, dst1_l, rnd_vec, dst0_r, dst1_r, dst0_l, dst1_l);
+
+            DUP2_ARG2(__lsx_vadd_w, dst0_r, offset_vec, dst0_l, offset_vec,
+                      dst0_r, dst0_l);
+            DUP2_ARG2(__lsx_vadd_w, dst1_r, offset_vec, dst1_l, offset_vec,
+                      dst1_r, dst1_l);
+            DUP4_ARG1(__lsx_vclip255_w, dst0_r, dst1_r, dst0_l, dst1_l, dst0_r,
+                      dst1_r, dst0_l, dst1_l);
+            DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+                      dst0_r, dst1_r);
+            dst0_r = __lsx_vpickev_b(dst1_r, dst0_r);
+
+            __lsx_vstelm_d(dst0_r, dst_tmp, 0, 0);
+            __lsx_vstelm_d(dst0_r, dst_tmp + dst_stride, 0, 1);
+            dst_tmp += dst_stride_2x;
+
+            dst10_r = dst32_r;
+            dst32_r = dst54_r;
+            dst54_r = dst76_r;
+            dst10_l = dst32_l;
+            dst32_l = dst54_l;
+            dst54_l = dst76_l;
+            dst21_r = dst43_r;
+            dst43_r = dst65_r;
+            dst65_r = dst87_r;
+            dst21_l = dst43_l;
+            dst43_l = dst65_l;
+            dst65_l = dst87_l;
+            dst6 = dst8;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+}
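+
+/* The inner loop above keeps a sliding window over the 16-bit horizontal
+ * results: seven seed rows (src0..src6) are filtered once, then each
+ * iteration filters two new rows (src7, src8), emits two weighted output
+ * rows, and shifts the dst10_r..dst65_l history down by two rows. */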
+
+static
+void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                       int32_t dst_stride, const int8_t *filter_x,
+                       const int8_t *filter_y, int32_t height, int32_t weight,
+                       int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 8);
+}
+
+static
+void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 16);
+}
+
+static
+void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 24);
+}
+
+static
+void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 32);
+}
+
+static
+void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 48);
+}
+
+static
+void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+                        int32_t dst_stride, const int8_t *filter_x,
+                        const int8_t *filter_y, int32_t height, int32_t weight,
+                        int32_t offset, int32_t rnd_val)
+{
+    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+                       filter_y, height, weight, offset, rnd_val, 64);
+}
+
+#define UNI_W_MC_HV(PEL, WIDTH, TAP)                                           \
+void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst,            \
+                                                      ptrdiff_t dst_stride,    \
+                                                      uint8_t *src,            \
+                                                      ptrdiff_t src_stride,    \
+                                                      int height,              \
+                                                      int denom,               \
+                                                      int weight,              \
+                                                      int offset,              \
+                                                      intptr_t mx,             \
+                                                      intptr_t my,             \
+                                                      int width)               \
+{                                                                              \
+    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                  \
+    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                  \
+    int shift = denom + 14 - 8;                                                \
+                                                                               \
+    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, filter_x,\
+                                    filter_y, height, weight, offset, shift);  \
+}
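+
+/* Shift arithmetic, for reference: the kernels above first narrow the
+ * vertical sum to HEVC's 14-bit intermediate precision (the >> 6), then
+ * apply uni-directional weighted prediction as
+ *   out = clip255(((inter * weight) >> (denom + 14 - 8)) + offset)
+ * with a rounded shift, so the value passed down as rnd_val is denom + 6
+ * for 8-bit output. */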
+
+UNI_W_MC_HV(qpel, 8, 8);
+UNI_W_MC_HV(qpel, 16, 8);
+UNI_W_MC_HV(qpel, 24, 8);
+UNI_W_MC_HV(qpel, 32, 8);
+UNI_W_MC_HV(qpel, 48, 8);
+UNI_W_MC_HV(qpel, 64, 8);
+
+#undef UNI_W_MC_HV
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index e05fe780c2..22739c6f5b 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -137,6 +137,36 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
             c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
 
+            c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lsx;
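+
+            /* Indexing: put_hevc_*[i][!!my][!!mx], where i encodes the block
+             * width (3:8, 4:12, 5:16, 6:24, 7:32, 8:48, 9:64, cf.
+             * ff_hevc_pel_weight[]); [1][0] is vertical-only, [1][1] is hv. */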
+
+            c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_lsx;
+            c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_lsx;
+            c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_lsx;
+            c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_lsx;
+
+            c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_lsx;
+            c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_lsx;
+            c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_lsx;
+            c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_lsx;
+            c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_lsx;
+            c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_lsx;
+
+            c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_lsx;
+            c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_lsx;
+
+            c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_lsx;
+            c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_lsx;
+            c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_lsx;
+            c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
+            c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
+
+            c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
+            c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv64_8_lsx;
+
             c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
 
             c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 6a834290ab..0c517af887 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -142,6 +142,65 @@ BI_MC(epel, hv, 32);
 
 #undef BI_MC
 
+#define UNI_MC(PEL, DIR, WIDTH)                                              \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,         \
+                                                       ptrdiff_t dst_stride, \
+                                                       uint8_t *src,         \
+                                                       ptrdiff_t src_stride, \
+                                                       int height,           \
+                                                       intptr_t mx,          \
+                                                       intptr_t my,          \
+                                                       int width)
+
+UNI_MC(qpel, h, 64);
+
+UNI_MC(qpel, v, 24);
+UNI_MC(qpel, v, 32);
+UNI_MC(qpel, v, 48);
+UNI_MC(qpel, v, 64);
+
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
+UNI_MC(epel, v, 24);
+UNI_MC(epel, v, 32);
+
+UNI_MC(epel, hv, 8);
+UNI_MC(epel, hv, 12);
+UNI_MC(epel, hv, 16);
+UNI_MC(epel, hv, 24);
+UNI_MC(epel, hv, 32);
+
+#undef UNI_MC
+
+#define UNI_W_MC(PEL, DIR, WIDTH)                                       \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst,  \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         uint8_t *src,  \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         int denom,     \
+                                                         int weight,    \
+                                                         int offset,    \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)
+
+UNI_W_MC(qpel, hv, 8);
+UNI_W_MC(qpel, hv, 16);
+UNI_W_MC(qpel, hv, 24);
+UNI_W_MC(qpel, hv, 32);
+UNI_W_MC(qpel, hv, 48);
+UNI_W_MC(qpel, hv, 64);
+
+#undef UNI_W_MC
+
 void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
                                       int32_t beta, int32_t *tc,
                                       uint8_t *p_is_pcm, uint8_t *q_is_pcm);
-- 
2.20.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] Optimize HEVC decoding for loongarch
  2022-02-14  8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
                   ` (4 preceding siblings ...)
  2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w " Hao Chen
@ 2022-02-15  3:30 ` 殷时友
  5 siblings, 0 replies; 7+ messages in thread
From: 殷时友 @ 2022-02-15  3:30 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



> On 14 Feb 2022, at 16:25, Hao Chen <chenhao@loongson.cn> wrote:
> 
> ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
> before: 94fps
> after : 191fps
> 
> [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to
> [PATCH v1 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
> [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
> [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
> [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
> 
LGTM

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-02-15  3:30 UTC | newest]

Thread overview: 7+ messages
2022-02-14  8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX Hao Chen
2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf " Hao Chen
2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi " Hao Chen
2022-02-14  8:25 ` [FFmpeg-devel] [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w " Hao Chen
2022-02-15  3:30 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
