* [FFmpeg-devel] [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
From: Hao Chen @ 2022-02-14 8:25 UTC
To: ffmpeg-devel
The loongson_intrinsics.h file is updated from v1.0.3 to v1.1.0. Some
spelling mistakes are fixed and new functions are added.
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
1 file changed, 884 insertions(+), 813 deletions(-)
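
(Note, not part of the commit message: a minimal usage sketch of one of the
newly added helpers, __lsx_vdp2add_h_bu_b, paired with the DUP2_ARG3 wrapper
that this patch reindents. The function, pointer and variable names below are
assumptions made for illustration only and do not appear in the patch.)

#include <stdint.h>
#include "libavutil/loongarch/loongson_intrinsics.h"

/* Accumulate two (unsigned sample x signed coefficient) dot products into
 * signed halfword accumulators, done pairwise through DUP2_ARG3. */
static void dp2add_sketch(uint8_t *src, int8_t *filt, int16_t *dst)
{
    __m128i s0   = __lsx_vld(src,  0);  /* unsigned byte samples           */
    __m128i s1   = __lsx_vld(src, 16);
    __m128i c0   = __lsx_vld(filt, 0);  /* signed byte filter coefficients */
    __m128i c1   = __lsx_vld(filt, 16);
    __m128i acc0 = __lsx_vldi(0);       /* signed halfword accumulators    */
    __m128i acc1 = __lsx_vldi(0);

    /* Expands to: acc0 = __lsx_vdp2add_h_bu_b(acc0, s0, c0);
     *             acc1 = __lsx_vdp2add_h_bu_b(acc1, s1, c1); */
    DUP2_ARG3(__lsx_vdp2add_h_bu_b, acc0, s0, c0, acc1, s1, c1, acc0, acc1);

    __lsx_vst(acc0, dst,  0);
    __lsx_vst(acc1, dst, 16);
}
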
diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
index 6e0439f829..eb256863c8 100644
--- a/libavutil/loongarch/loongson_intrinsics.h
+++ b/libavutil/loongarch/loongson_intrinsics.h
@@ -33,7 +33,7 @@
* Xiwei Gu <guxiwei-hf@loongson.cn>
* Lu Wang <wanglu@loongson.cn>
*
- * This file is a header file for loongarch builtin extention.
+ * This file is a header file for loongarch builtin extension.
*
*/
@@ -42,50 +42,50 @@
/**
* MAJOR version: Macro usage changes.
- * MINOR version: Add new functions, or bug fix.
+ * MINOR version: Add new functions, or bug fixes.
* MICRO version: Comment changes or implementation changes.
*/
#define LSOM_VERSION_MAJOR 1
-#define LSOM_VERSION_MINOR 0
-#define LSOM_VERSION_MICRO 3
+#define LSOM_VERSION_MINOR 1
+#define LSOM_VERSION_MICRO 0
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
-{ \
- _OUT0 = _INS(_IN0); \
- _OUT1 = _INS(_IN1); \
-}
+ { \
+ _OUT0 = _INS(_IN0); \
+ _OUT1 = _INS(_IN1); \
+ }
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
-{ \
- _OUT0 = _INS(_IN0, _IN1); \
- _OUT1 = _INS(_IN2, _IN3); \
-}
+ { \
+ _OUT0 = _INS(_IN0, _IN1); \
+ _OUT1 = _INS(_IN2, _IN3); \
+ }
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
-{ \
- _OUT0 = _INS(_IN0, _IN1, _IN2); \
- _OUT1 = _INS(_IN3, _IN4, _IN5); \
-}
+ { \
+ _OUT0 = _INS(_IN0, _IN1, _IN2); \
+ _OUT1 = _INS(_IN3, _IN4, _IN5); \
+ }
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
- DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
- DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
-}
-
-#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
- _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
- DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
- DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
-}
-
-#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
- _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
- DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
- DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
-}
+ { \
+ DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
+ DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+ _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
+ DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+ _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
+ DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
+ }
#ifdef __loongarch_sx
#include <lsxintrin.h>
@@ -94,11 +94,11 @@
* Description : Dot product & addition of byte vector elements
* Arguments : Inputs - in_c, in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from in_h are multiplied by
* signed byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
- * Then the results plus to signed half word elements from in_c.
+ * Then the results are added to the signed half-word elements from in_c.
* Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
* in_c : 1,2,3,4, 1,2,3,4
* in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
@@ -106,13 +106,13 @@
* out : 23,40,41,26, 23,40,41,26
* =============================================================================
*/
-static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
- out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
- out = __lsx_vmaddwod_h_b(out, in_h, in_l);
- return out;
+ out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
}
/*
@@ -120,35 +120,61 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l
* Description : Dot product & addition of byte vector elements
* Arguments : Inputs - in_c, in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Unsigned byte elements from in_h are multiplied by
* unsigned byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
- * The results plus to signed half word elements from in_c.
- * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ * The results are added to the signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
* in_c : 1,2,3,4, 1,2,3,4
* in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
* in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
* out : 23,40,41,26, 23,40,41,26
* =============================================================================
*/
-static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and then added adjacent to
+ * each other to get results with the twice size of input.
+ * The results are added to the signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ * in_c : 1,1,1,1, 1,1,1,1
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ * out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
- out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
- out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
- return out;
+ out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
}
/*
* =============================================================================
- * Description : Dot product & addition of half word vector elements
+ * Description : Dot product & addition of half-word vector elements
* Arguments : Inputs - in_c, in_h, in_l
* Outputs - out
- * Retrun Type - __m128i
- * Details : Signed half word elements from in_h are multiplied by
- * signed half word elements from in_l, and then added adjacent to
+ * Return Type - __m128i
+ * Details : Signed half-word elements from in_h are multiplied by
+ * signed half-word elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
* Then the results plus to signed word elements from in_c.
* Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
@@ -158,13 +184,13 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_
* out : 23,40,41,26
* =============================================================================
*/
-static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
- out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
- out = __lsx_vmaddwod_w_h(out, in_h, in_l);
- return out;
+ out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
}
/*
@@ -172,7 +198,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
* Description : Dot product of byte vector elements
* Arguments : Inputs - in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from in_h are multiplied by
* signed byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
@@ -182,13 +208,12 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
* out : 22,38,38,22, 22,38,38,22
* =============================================================================
*/
-static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
- out = __lsx_vmulwev_h_b(in_h, in_l);
- out = __lsx_vmaddwod_h_b(out, in_h, in_l);
- return out;
+ out = __lsx_vmulwev_h_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
}
/*
@@ -196,7 +221,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
* Description : Dot product of byte vector elements
* Arguments : Inputs - in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Unsigned byte elements from in_h are multiplied by
* unsigned byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
@@ -206,13 +231,12 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
* out : 22,38,38,22, 22,38,38,22
* =============================================================================
*/
-static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+ __m128i out;
- out = __lsx_vmulwev_h_bu(in_h, in_l);
- out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
- return out;
+ out = __lsx_vmulwev_h_bu(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
}
/*
@@ -220,7 +244,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
* Description : Dot product of byte vector elements
* Arguments : Inputs - in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Unsigned byte elements from in_h are multiplied by
* signed byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
@@ -230,13 +254,12 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
* out : 22,38,38,22, 22,38,38,6
* =============================================================================
*/
-static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
- out = __lsx_vmulwev_h_bu_b(in_h, in_l);
- out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
- return out;
+ out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
}
/*
@@ -244,7 +267,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
* Description : Dot product of byte vector elements
* Arguments : Inputs - in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from in_h are multiplied by
* signed byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
@@ -254,19 +277,19 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
* out : 22,38,38,22
* =============================================================================
*/
-static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+ __m128i out;
- out = __lsx_vmulwev_w_h(in_h, in_l);
- out = __lsx_vmaddwod_w_h(out, in_h, in_l);
- return out;
+ out = __lsx_vmulwev_w_h(in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
}
/*
* =============================================================================
* Description : Clip all halfword elements of input vector between min & max
- * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
+ * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
+ * (_in))
* Arguments : Inputs - _in (input vector)
* - min (min threshold)
* - max (max threshold)
@@ -279,13 +302,12 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
* out : 1,2,9,9, 1,9,9,9
* =============================================================================
*/
-static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
-{
- __m128i out;
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+ __m128i out;
- out = __lsx_vmax_h(min, _in);
- out = __lsx_vmin_h(max, out);
- return out;
+ out = __lsx_vmax_h(min, _in);
+ out = __lsx_vmin_h(max, out);
+ return out;
}
/*
@@ -293,20 +315,19 @@ static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
* Description : Set each element of vector between 0 and 255
* Arguments : Inputs - _in
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from _in are clamped between 0 and 255.
* Example : out = __lsx_vclip255_h(_in)
* _in : -8,255,280,249, -8,255,280,249
* out : 0,255,255,249, 0,255,255,249
* =============================================================================
*/
-static inline __m128i __lsx_vclip255_h(__m128i _in)
-{
- __m128i out;
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+ __m128i out;
- out = __lsx_vmaxi_h(_in, 0);
- out = __lsx_vsat_hu(out, 7);
- return out;
+ out = __lsx_vmaxi_h(_in, 0);
+ out = __lsx_vsat_hu(out, 7);
+ return out;
}
/*
@@ -314,20 +335,19 @@ static inline __m128i __lsx_vclip255_h(__m128i _in)
* Description : Set each element of vector between 0 and 255
* Arguments : Inputs - _in
* Outputs - out
- * Retrun Type - word
+ * Return Type - word
* Details : Signed byte elements from _in are clamped between 0 and 255.
* Example : out = __lsx_vclip255_w(_in)
* _in : -8,255,280,249
* out : 0,255,255,249
* =============================================================================
*/
-static inline __m128i __lsx_vclip255_w(__m128i _in)
-{
- __m128i out;
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+ __m128i out;
- out = __lsx_vmaxi_w(_in, 0);
- out = __lsx_vsat_wu(out, 7);
- return out;
+ out = __lsx_vmaxi_w(_in, 0);
+ out = __lsx_vsat_wu(out, 7);
+ return out;
}
/*
@@ -343,12 +363,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* _in1(out) : 1,2,3,4
* =============================================================================
*/
-#define LSX_SWAP(_in0, _in1) \
-{ \
- _in0 = __lsx_vxor_v(_in0, _in1); \
- _in1 = __lsx_vxor_v(_in0, _in1); \
- _in0 = __lsx_vxor_v(_in0, _in1); \
-} \
+#define LSX_SWAP(_in0, _in1) \
+ { \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ _in1 = __lsx_vxor_v(_in0, _in1); \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ }
/*
* =============================================================================
@@ -364,25 +384,27 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* =============================================================================
*/
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
__m128i _t0, _t1, _t2, _t3; \
\
- _t0 = __lsx_vilvl_w(_in1, _in0); \
- _t1 = __lsx_vilvh_w(_in1, _in0); \
- _t2 = __lsx_vilvl_w(_in3, _in2); \
- _t3 = __lsx_vilvh_w(_in3, _in2); \
+ _t0 = __lsx_vilvl_w(_in1, _in0); \
+ _t1 = __lsx_vilvh_w(_in1, _in0); \
+ _t2 = __lsx_vilvl_w(_in3, _in2); \
+ _t3 = __lsx_vilvh_w(_in3, _in2); \
_out0 = __lsx_vilvl_d(_t2, _t0); \
_out1 = __lsx_vilvh_d(_t2, _t0); \
_out2 = __lsx_vilvl_d(_t3, _t1); \
_out3 = __lsx_vilvh_d(_t3, _t1); \
-}
+ }
/*
* =============================================================================
* Description : Transpose 8x8 block with byte elements in vectors
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Details : The rows of the matrix become columns, and the columns
+ * become rows.
* Example : LSX_TRANSPOSE8x8_B
* _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
* _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
@@ -403,34 +425,35 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
* =============================================================================
*/
-#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- __m128i zero = {0}; \
- __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
- __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
- \
- _t0 = __lsx_vilvl_b(_in2, _in0); \
- _t1 = __lsx_vilvl_b(_in3, _in1); \
- _t2 = __lsx_vilvl_b(_in6, _in4); \
- _t3 = __lsx_vilvl_b(_in7, _in5); \
- _t4 = __lsx_vilvl_b(_t1, _t0); \
- _t5 = __lsx_vilvh_b(_t1, _t0); \
- _t6 = __lsx_vilvl_b(_t3, _t2); \
- _t7 = __lsx_vilvh_b(_t3, _t2); \
- _out0 = __lsx_vilvl_w(_t6, _t4); \
- _out2 = __lsx_vilvh_w(_t6, _t4); \
- _out4 = __lsx_vilvl_w(_t7, _t5); \
- _out6 = __lsx_vilvh_w(_t7, _t5); \
- _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
- _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
- _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
- _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
-}
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i zero = { 0 }; \
+ __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _t0 = __lsx_vilvl_b(_in2, _in0); \
+ _t1 = __lsx_vilvl_b(_in3, _in1); \
+ _t2 = __lsx_vilvl_b(_in6, _in4); \
+ _t3 = __lsx_vilvl_b(_in7, _in5); \
+ _t4 = __lsx_vilvl_b(_t1, _t0); \
+ _t5 = __lsx_vilvh_b(_t1, _t0); \
+ _t6 = __lsx_vilvl_b(_t3, _t2); \
+ _t7 = __lsx_vilvh_b(_t3, _t2); \
+ _out0 = __lsx_vilvl_w(_t6, _t4); \
+ _out2 = __lsx_vilvh_w(_t6, _t4); \
+ _out4 = __lsx_vilvl_w(_t7, _t5); \
+ _out6 = __lsx_vilvh_w(_t7, _t5); \
+ _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
+ _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
+ _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
+ _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
+ }
/*
* =============================================================================
- * Description : Transpose 8x8 block with half word elements in vectors
+ * Description : Transpose 8x8 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
* Details :
@@ -445,37 +468,38 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77
* =============================================================================
*/
-#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
- \
- _s0 = __lsx_vilvl_h(_in6, _in4); \
- _s1 = __lsx_vilvl_h(_in7, _in5); \
- _t0 = __lsx_vilvl_h(_s1, _s0); \
- _t1 = __lsx_vilvh_h(_s1, _s0); \
- _s0 = __lsx_vilvh_h(_in6, _in4); \
- _s1 = __lsx_vilvh_h(_in7, _in5); \
- _t2 = __lsx_vilvl_h(_s1, _s0); \
- _t3 = __lsx_vilvh_h(_s1, _s0); \
- _s0 = __lsx_vilvl_h(_in2, _in0); \
- _s1 = __lsx_vilvl_h(_in3, _in1); \
- _t4 = __lsx_vilvl_h(_s1, _s0); \
- _t5 = __lsx_vilvh_h(_s1, _s0); \
- _s0 = __lsx_vilvh_h(_in2, _in0); \
- _s1 = __lsx_vilvh_h(_in3, _in1); \
- _t6 = __lsx_vilvl_h(_s1, _s0); \
- _t7 = __lsx_vilvh_h(_s1, _s0); \
- \
- _out0 = __lsx_vpickev_d(_t0, _t4); \
- _out2 = __lsx_vpickev_d(_t1, _t5); \
- _out4 = __lsx_vpickev_d(_t2, _t6); \
- _out6 = __lsx_vpickev_d(_t3, _t7); \
- _out1 = __lsx_vpickod_d(_t0, _t4); \
- _out3 = __lsx_vpickod_d(_t1, _t5); \
- _out5 = __lsx_vpickod_d(_t2, _t6); \
- _out7 = __lsx_vpickod_d(_t3, _t7); \
-}
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _s0 = __lsx_vilvl_h(_in6, _in4); \
+ _s1 = __lsx_vilvl_h(_in7, _in5); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in6, _in4); \
+ _s1 = __lsx_vilvh_h(_in7, _in5); \
+ _t2 = __lsx_vilvl_h(_s1, _s0); \
+ _t3 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvl_h(_in2, _in0); \
+ _s1 = __lsx_vilvl_h(_in3, _in1); \
+ _t4 = __lsx_vilvl_h(_s1, _s0); \
+ _t5 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in2, _in0); \
+ _s1 = __lsx_vilvh_h(_in3, _in1); \
+ _t6 = __lsx_vilvl_h(_s1, _s0); \
+ _t7 = __lsx_vilvh_h(_s1, _s0); \
+ \
+ _out0 = __lsx_vpickev_d(_t0, _t4); \
+ _out2 = __lsx_vpickev_d(_t1, _t5); \
+ _out4 = __lsx_vpickev_d(_t2, _t6); \
+ _out6 = __lsx_vpickev_d(_t3, _t7); \
+ _out1 = __lsx_vpickod_d(_t0, _t4); \
+ _out3 = __lsx_vpickod_d(_t1, _t5); \
+ _out5 = __lsx_vpickod_d(_t2, _t6); \
+ _out7 = __lsx_vpickod_d(_t3, _t7); \
+ }
/*
* =============================================================================
@@ -483,7 +507,8 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block)
* Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
* Return Type - as per RTYPE
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : LSX_TRANSPOSE8x4_B
* _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
* _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
@@ -500,26 +525,26 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
* =============================================================================
*/
-#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3) \
-{ \
- __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- \
- _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
- _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
- _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
- _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
- _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
- \
- _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
- _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
- _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
- \
- _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
- _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
- _out1 = __lsx_vilvh_d(_out2, _out0); \
- _out3 = __lsx_vilvh_d(_out0, _out2); \
-}
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ \
+ _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
+ _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
+ _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
+ _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
+ \
+ _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
+ _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
+ \
+ _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
+ _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
+ _out1 = __lsx_vilvh_d(_out2, _out0); \
+ _out3 = __lsx_vilvh_d(_out0, _out2); \
+ }
/*
* =============================================================================
@@ -547,29 +572,30 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* 120,121,122,123,124,125,126,127
* =============================================================================
*/
-#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8, \
- _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
- _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
- __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
- DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
- _tmp0, _tmp1, _tmp2, _tmp3); \
- DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
- _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
- DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
- DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
- DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
- DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
- DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
- DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
- DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
- DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
- DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
- DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
- DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
- DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
-}
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+ _tmp0, _tmp1, _tmp2, _tmp3); \
+ DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
+ _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
+ DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
+ DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
+ DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
+ DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
+ }
/*
* =============================================================================
@@ -585,33 +611,33 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* =============================================================================
*/
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
_out0 = __lsx_vadd_b(_in0, _in3); \
_out1 = __lsx_vadd_b(_in1, _in2); \
_out2 = __lsx_vsub_b(_in1, _in2); \
_out3 = __lsx_vsub_b(_in0, _in3); \
-}
+ }
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
_out0 = __lsx_vadd_h(_in0, _in3); \
_out1 = __lsx_vadd_h(_in1, _in2); \
_out2 = __lsx_vsub_h(_in1, _in2); \
_out3 = __lsx_vsub_h(_in0, _in3); \
-}
+ }
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
_out0 = __lsx_vadd_w(_in0, _in3); \
_out1 = __lsx_vadd_w(_in1, _in2); \
_out2 = __lsx_vsub_w(_in1, _in2); \
_out3 = __lsx_vsub_w(_in0, _in3); \
-}
+ }
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
_out0 = __lsx_vadd_d(_in0, _in3); \
_out1 = __lsx_vadd_d(_in1, _in2); \
_out2 = __lsx_vsub_d(_in1, _in2); \
_out3 = __lsx_vsub_d(_in0, _in3); \
-}
+ }
/*
* =============================================================================
@@ -630,59 +656,63 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* _out7 = _in0 - _in7;
* =============================================================================
*/
-#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lsx_vadd_b(_in0, _in7); \
- _out1 = __lsx_vadd_b(_in1, _in6); \
- _out2 = __lsx_vadd_b(_in2, _in5); \
- _out3 = __lsx_vadd_b(_in3, _in4); \
- _out4 = __lsx_vsub_b(_in3, _in4); \
- _out5 = __lsx_vsub_b(_in2, _in5); \
- _out6 = __lsx_vsub_b(_in1, _in6); \
- _out7 = __lsx_vsub_b(_in0, _in7); \
-}
-
-#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lsx_vadd_h(_in0, _in7); \
- _out1 = __lsx_vadd_h(_in1, _in6); \
- _out2 = __lsx_vadd_h(_in2, _in5); \
- _out3 = __lsx_vadd_h(_in3, _in4); \
- _out4 = __lsx_vsub_h(_in3, _in4); \
- _out5 = __lsx_vsub_h(_in2, _in5); \
- _out6 = __lsx_vsub_h(_in1, _in6); \
- _out7 = __lsx_vsub_h(_in0, _in7); \
-}
-
-#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lsx_vadd_w(_in0, _in7); \
- _out1 = __lsx_vadd_w(_in1, _in6); \
- _out2 = __lsx_vadd_w(_in2, _in5); \
- _out3 = __lsx_vadd_w(_in3, _in4); \
- _out4 = __lsx_vsub_w(_in3, _in4); \
- _out5 = __lsx_vsub_w(_in2, _in5); \
- _out6 = __lsx_vsub_w(_in1, _in6); \
- _out7 = __lsx_vsub_w(_in0, _in7); \
-}
-
-#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lsx_vadd_d(_in0, _in7); \
- _out1 = __lsx_vadd_d(_in1, _in6); \
- _out2 = __lsx_vadd_d(_in2, _in5); \
- _out3 = __lsx_vadd_d(_in3, _in4); \
- _out4 = __lsx_vsub_d(_in3, _in4); \
- _out5 = __lsx_vsub_d(_in2, _in5); \
- _out6 = __lsx_vsub_d(_in1, _in6); \
- _out7 = __lsx_vsub_d(_in0, _in7); \
-}
-
-#endif //LSX
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in7); \
+ _out1 = __lsx_vadd_b(_in1, _in6); \
+ _out2 = __lsx_vadd_b(_in2, _in5); \
+ _out3 = __lsx_vadd_b(_in3, _in4); \
+ _out4 = __lsx_vsub_b(_in3, _in4); \
+ _out5 = __lsx_vsub_b(_in2, _in5); \
+ _out6 = __lsx_vsub_b(_in1, _in6); \
+ _out7 = __lsx_vsub_b(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in7); \
+ _out1 = __lsx_vadd_h(_in1, _in6); \
+ _out2 = __lsx_vadd_h(_in2, _in5); \
+ _out3 = __lsx_vadd_h(_in3, _in4); \
+ _out4 = __lsx_vsub_h(_in3, _in4); \
+ _out5 = __lsx_vsub_h(_in2, _in5); \
+ _out6 = __lsx_vsub_h(_in1, _in6); \
+ _out7 = __lsx_vsub_h(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in7); \
+ _out1 = __lsx_vadd_w(_in1, _in6); \
+ _out2 = __lsx_vadd_w(_in2, _in5); \
+ _out3 = __lsx_vadd_w(_in3, _in4); \
+ _out4 = __lsx_vsub_w(_in3, _in4); \
+ _out5 = __lsx_vsub_w(_in2, _in5); \
+ _out6 = __lsx_vsub_w(_in1, _in6); \
+ _out7 = __lsx_vsub_w(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in7); \
+ _out1 = __lsx_vadd_d(_in1, _in6); \
+ _out2 = __lsx_vadd_d(_in2, _in5); \
+ _out3 = __lsx_vadd_d(_in3, _in4); \
+ _out4 = __lsx_vsub_d(_in3, _in4); \
+ _out5 = __lsx_vsub_d(_in2, _in5); \
+ _out6 = __lsx_vsub_d(_in1, _in6); \
+ _out7 = __lsx_vsub_d(_in0, _in7); \
+ }
+
+#endif // LSX
#ifdef __loongarch_asx
#include <lasxintrin.h>
@@ -700,13 +730,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_h_bu(in_h, in_l);
- out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
}
/*
@@ -718,18 +747,17 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
* Details : Signed byte elements from in_h are multiplied with
* signed byte elements from in_l producing a result
* twice the size of input i.e. signed halfword.
- * Then this iniplication results of adjacent odd-even elements
+ * Then this multiplication results of adjacent odd-even elements
* are added to the out vector
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_h_b(in_h, in_l);
- out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_h_b(in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
}
/*
@@ -749,13 +777,12 @@ static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
* out : 22,38,38,22, 22,38,38,22
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_w_h(in_h, in_l);
- out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
}
/*
@@ -763,22 +790,21 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
* Description : Dot product of word vector elements
* Arguments : Inputs - in_h, in_l
* Output - out
- * Retrun Type - signed double
+ * Return Type - signed double
* Details : Signed word elements from in_h are multiplied with
* signed word elements from in_l producing a result
- * twice the size of input i.e. signed double word.
+ * twice the size of input i.e. signed double-word.
* Then this multiplied results of adjacent odd-even elements
* are added to the out vector.
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_d_w(in_h, in_l);
- out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_d_w(in_h, in_l);
+ out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+ return out;
}
/*
@@ -795,13 +821,12 @@ static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
- out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
}
/*
@@ -809,7 +834,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
* Description : Dot product & addition of byte vector elements
* Arguments : Inputs - in_h, in_l
* Output - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from in_h are multiplied with
* signed byte elements from in_l producing a result
* twice the size of input i.e. signed halfword.
@@ -818,13 +843,59 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then this multiplied results of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then this multiplied results of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
- out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
- return out;
+ out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
}
/*
@@ -845,13 +916,13 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_
* out : 23,40,41,26, 23,40,41,26
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
- out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
- return out;
+ out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
}
/*
@@ -868,13 +939,13 @@ static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
- out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
- return out;
+ out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+ return out;
}
/*
@@ -891,13 +962,13 @@ static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i i
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
- out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
- return out;
+ out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
}
/*
@@ -915,14 +986,14 @@ static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i
* Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
-
- out = __lasx_xvmulwev_h_bu(in_h, in_l);
- out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
- out = __lasx_xvsub_h(in_c, out);
- return out;
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ out = __lasx_xvsub_h(in_c, out);
+ return out;
}
/*
@@ -944,14 +1015,14 @@ static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i i
* out : -7,-3,0,0, 0,-1,0,-1
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
-
- out = __lasx_xvmulwev_w_h(in_h, in_l);
- out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
- out = __lasx_xvsub_w(in_c, out);
- return out;
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvsub_w(in_c, out);
+ return out;
}
/*
@@ -960,10 +1031,10 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
* Arguments : Inputs - in_h, in_l
* Output - out
* Return Type - signed word
- * Details : Signed halfword elements from in_h are iniplied with
+ * Details : Signed halfword elements from in_h are multiplied with
* signed halfword elements from in_l producing a result
* four times the size of input i.e. signed doubleword.
- * Then this iniplication results of four adjacent elements
+ * Then this multiplication results of four adjacent elements
* are added together and stored to the out vector.
* Example : out = __lasx_xvdp4_d_h(in_h, in_l)
* in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
@@ -971,14 +1042,13 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
* out : -2,0,1,1
* =============================================================================
*/
-static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
-
- out = __lasx_xvmulwev_w_h(in_h, in_l);
- out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
- out = __lasx_xvhaddw_d_w(out, out);
- return out;
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvhaddw_d_w(out, out);
+ return out;
}
/*
@@ -993,13 +1063,12 @@ static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvh_b(in_h, in_l);
- out = __lasx_xvhaddw_h_b(out, out);
- return out;
+ out = __lasx_xvilvh_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
}
/*
@@ -1017,13 +1086,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
* out : 1,0,0,-1, 1,0,0, 2
* =============================================================================
*/
- static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvh_h(in_h, in_l);
- out = __lasx_xvhaddw_w_h(out, out);
- return out;
+ out = __lasx_xvilvh_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
}
/*
@@ -1038,13 +1106,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvl_b(in_h, in_l);
- out = __lasx_xvhaddw_h_b(out, out);
- return out;
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
}
/*
@@ -1062,13 +1129,12 @@ static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
* out : 5,-1,4,2, 1,0,2,-1
* =============================================================================
*/
-static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvl_h(in_h, in_l);
- out = __lasx_xvhaddw_w_h(out, out);
- return out;
+ out = __lasx_xvilvl_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
}
/*
@@ -1083,13 +1149,12 @@ static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvl_b(in_h, in_l);
- out = __lasx_xvhaddw_hu_bu(out, out);
- return out;
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_hu_bu(out, out);
+ return out;
}
/*
@@ -1103,13 +1168,12 @@ static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvsllwil_hu_bu(in_l, 0);
- out = __lasx_xvadd_h(in_h, out);
- return out;
+ out = __lasx_xvsllwil_hu_bu(in_l, 0);
+ out = __lasx_xvadd_h(in_h, out);
+ return out;
}
/*
@@ -1126,13 +1190,12 @@ static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
* out : 2, 0,1,2, -1,0,1,1,
* =============================================================================
*/
-static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvsllwil_w_h(in_l, 0);
- out = __lasx_xvadd_w(in_h, out);
- return out;
+ out = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvadd_w(in_h, out);
+ return out;
}
/*
@@ -1153,15 +1216,15 @@ static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
* out : 201, 602,1203,2004, -995, -1794,-2793,-3992
* =============================================================================
*/
-static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i tmp0, tmp1, out;
-
- tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
- tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
- tmp0 = __lasx_xvmul_w(tmp0, tmp1);
- out = __lasx_xvadd_w(tmp0, in_c);
- return out;
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
}
/*
@@ -1177,15 +1240,15 @@ static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in
* Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i tmp0, tmp1, out;
-
- tmp0 = __lasx_xvilvh_h(in_h, in_h);
- tmp1 = __lasx_xvilvh_h(in_l, in_l);
- tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
- out = __lasx_xvadd_w(tmp0, in_c);
- return out;
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
}
/*
@@ -1203,14 +1266,13 @@ static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in
* out : 6,1,3,0, 0,0,1,0
* =============================================================================
*/
-static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i tmp0, tmp1, out;
-
- tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
- tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
- out = __lasx_xvmul_w(tmp0, tmp1);
- return out;
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvmul_w(tmp0, tmp1);
+ return out;
}
/*
@@ -1228,39 +1290,39 @@ static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
* out : 0,0,0,0, 0,0,0,1
* =============================================================================
*/
-static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i tmp0, tmp1, out;
-
- tmp0 = __lasx_xvilvh_h(in_h, in_h);
- tmp1 = __lasx_xvilvh_h(in_l, in_l);
- out = __lasx_xvmulwev_w_h(tmp0, tmp1);
- return out;
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ return out;
}
/*
* =============================================================================
- * Description : The low half of the vector elements are expanded and
- * added saturately after being doubled.
+ * Description : The low half of the vector elements are added to the high half
+ * after being doubled, then saturated.
* Arguments : Inputs - in_h, in_l
* Output - out
- * Details : The in_h vector adds the in_l vector saturately after the lower
- * half of the two-fold zero extension (unsigned byte to unsigned
- * halfword) and the results are stored to the out vector.
+ * Details : The in_h vector adds the in_l vector after the lower half of
+ * the two-fold zero extension (unsigned byte to unsigned
+ * halfword) and then saturated. The results are stored to the out
+ * vector.
* Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
* in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
- * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
- * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ * 0,0,0,1
+ * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
* =============================================================================
*/
-static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
-{
- __m256i tmp1, out;
- __m256i zero = {0};
-
- tmp1 = __lasx_xvilvl_b(zero, in_l);
- out = __lasx_xvsadd_hu(in_h, tmp1);
- return out;
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+ __m256i tmp1, out;
+ __m256i zero = { 0 };
+
+ tmp1 = __lasx_xvilvl_b(zero, in_l);
+ out = __lasx_xvsadd_hu(in_h, tmp1);
+ return out;
}
/*
@@ -1279,13 +1341,12 @@ static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
* out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
* =============================================================================
*/
-static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
-{
- __m256i out;
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+ __m256i out;
- out = __lasx_xvmax_h(min, in);
- out = __lasx_xvmin_h(max, out);
- return out;
+ out = __lasx_xvmax_h(min, in);
+ out = __lasx_xvmin_h(max, out);
+ return out;
}
/*
@@ -1298,13 +1359,12 @@ static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
* Example : See out = __lasx_xvclip255_w(in)
* =============================================================================
*/
-static inline __m256i __lasx_xvclip255_h(__m256i in)
-{
- __m256i out;
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+ __m256i out;
- out = __lasx_xvmaxi_h(in, 0);
- out = __lasx_xvsat_hu(out, 7);
- return out;
+ out = __lasx_xvmaxi_h(in, 0);
+ out = __lasx_xvsat_hu(out, 7);
+ return out;
}
/*
@@ -1319,20 +1379,19 @@ static inline __m256i __lasx_xvclip255_h(__m256i in)
* out : 0,255,255,249, 0,255,255,249
* =============================================================================
*/
-static inline __m256i __lasx_xvclip255_w(__m256i in)
-{
- __m256i out;
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+ __m256i out;
- out = __lasx_xvmaxi_w(in, 0);
- out = __lasx_xvsat_wu(out, 7);
- return out;
+ out = __lasx_xvmaxi_w(in, 0);
+ out = __lasx_xvsat_wu(out, 7);
+ return out;
}
/*
* =============================================================================
* Description : Indexed halfword element values are replicated to all
- * elements in output vector. If 'indx < 8' use xvsplati_l_*,
- * if 'indx >= 8' use xvsplati_h_*.
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
* Arguments : Inputs - in, idx
* Output - out
* Details : Idx element value from in vector is replicated to all
@@ -1344,20 +1403,19 @@ static inline __m256i __lasx_xvclip255_w(__m256i in)
* out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
* =============================================================================
*/
-static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
-{
- __m256i out;
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+ __m256i out;
- out = __lasx_xvpermi_q(in, in, 0x02);
- out = __lasx_xvreplve_h(out, idx);
- return out;
+ out = __lasx_xvpermi_q(in, in, 0x02);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
}
/*
* =============================================================================
* Description : Indexed halfword element values are replicated to all
- * elements in output vector. If 'indx < 8' use xvsplati_l_*,
- * if 'indx >= 8' use xvsplati_h_*.
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
* Arguments : Inputs - in, idx
* Output - out
* Details : Idx element value from in vector is replicated to all
@@ -1369,18 +1427,17 @@ static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
* out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
* =============================================================================
*/
-static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
-{
- __m256i out;
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+ __m256i out;
- out = __lasx_xvpermi_q(in, in, 0x13);
- out = __lasx_xvreplve_h(out, idx);
- return out;
+ out = __lasx_xvpermi_q(in, in, 0x13);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
}
/*
* =============================================================================
- * Description : Transpose 4x4 block with double word elements in vectors
+ * Description : Transpose 4x4 block with double-word elements in vectors
* Arguments : Inputs - _in0, _in1, _in2, _in3
* Outputs - _out0, _out1, _out2, _out3
* Example : LASX_TRANSPOSE4x4_D
@@ -1395,24 +1452,26 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out3 : 4,4,4,4
* =============================================================================
*/
-#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
- _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
- _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
- _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
- _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
- _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
- _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
- _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
- _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
-}
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
+ _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
+ _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
+ _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
+ _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
+ _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
+ }
/*
* =============================================================================
* Description : Transpose 8x8 block with word elements in vectors
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
* Example : LASX_TRANSPOSE8x8_W
* _in0 : 1,2,3,4,5,6,7,8
* _in1 : 2,2,3,4,5,6,7,8
@@ -1433,38 +1492,39 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out7 : 8,8,8,8,8,8,8,8
* =============================================================================
*/
-#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m256i _s0_m, _s1_m; \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- \
- _s0_m = __lasx_xvilvl_w(_in2, _in0); \
- _s1_m = __lasx_xvilvl_w(_in3, _in1); \
- _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvh_w(_in2, _in0); \
- _s1_m = __lasx_xvilvh_w(_in3, _in1); \
- _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvl_w(_in6, _in4); \
- _s1_m = __lasx_xvilvl_w(_in7, _in5); \
- _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvh_w(_in6, _in4); \
- _s1_m = __lasx_xvilvh_w(_in7, _in5); \
- _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
- _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
- _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
- _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
- _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
- _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
- _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
- _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
-}
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_w(_in3, _in1); \
+ _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_w(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvl_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_w(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_w(_in7, _in5); \
+ _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
+ _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
+ _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
+ _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
+ }
/*
* =============================================================================
@@ -1472,52 +1532,54 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
* _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
* (input 16x8 byte block)
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * (output 8x16 byte block)
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 byte block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : See LASX_TRANSPOSE16x8_H
* =============================================================================
*/
-#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- \
- _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
- _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
- _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
- _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
- _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
- _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
- _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
- _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
- _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
- _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
- _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
- _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
- _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
- _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
- _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
- _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
- _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
- _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
- _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
- _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
- _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
- _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
- _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
- _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
- _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
- _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
- _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
- _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
- _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
- _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
- _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
- _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
-}
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
+ _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
+ _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
+ _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
+ _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
+ _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
+ _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
+ _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
+ _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
+ _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
+ }
/*
* =============================================================================
@@ -1525,9 +1587,10 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
* _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
* (input 16x8 byte block)
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * (output 8x16 byte block)
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 byte block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : LASX_TRANSPOSE16x8_H
* _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
* _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
@@ -1556,72 +1619,73 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
* =============================================================================
*/
-#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
- { \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
- \
- _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
- _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
- _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
- _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
- _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
- _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
- _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
- _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
- _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
- _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
- _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
- _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
- _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
- _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
- _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
- _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
- _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
- _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
- _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
- _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
- _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
- _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
- _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
- _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
- _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
- _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
- _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
- _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
- \
- _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
- _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
- _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
- _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
- _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
- _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
- _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
- _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
- _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
- _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
- _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
- _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
- _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
- _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
- _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
- _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
- _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
- _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
- _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
- _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
- _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
- _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
- _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
- _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
- _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
- _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
- _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
- _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
-}
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ \
+ _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ }
/*
* =============================================================================
@@ -1629,61 +1693,65 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* Arguments : Inputs - _in0, _in1, _in2, _in3
* Outputs - _out0, _out1, _out2, _out3
* Return Type - signed halfword
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : See LASX_TRANSPOSE8x8_H
* =============================================================================
*/
-#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- __m256i _s0_m, _s1_m; \
- \
- _s0_m = __lasx_xvilvl_h(_in1, _in0); \
- _s1_m = __lasx_xvilvl_h(_in3, _in2); \
- _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _out1 = __lasx_xvilvh_d(_out0, _out0); \
- _out3 = __lasx_xvilvh_d(_out2, _out2); \
-}
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in1, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in2); \
+ _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out1 = __lasx_xvilvh_d(_out0, _out0); \
+ _out3 = __lasx_xvilvh_d(_out2, _out2); \
+ }
/*
* =============================================================================
* Description : Transpose input 8x8 byte block
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
* (input 8x8 byte block)
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * (output 8x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x8 byte block)
* Example : See LASX_TRANSPOSE8x8_H
* =============================================================================
*/
-#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
- _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
- _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
- _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
- _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
- _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
- _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
- _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
- _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
- _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
- _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
- _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
- _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
- _out1 = __lasx_xvbsrl_v(_out0, 8); \
- _out3 = __lasx_xvbsrl_v(_out2, 8); \
- _out5 = __lasx_xvbsrl_v(_out4, 8); \
- _out7 = __lasx_xvbsrl_v(_out6, 8); \
-}
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
+ _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
+ _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
+ _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
+ _out1 = __lasx_xvbsrl_v(_out0, 8); \
+ _out3 = __lasx_xvbsrl_v(_out2, 8); \
+ _out5 = __lasx_xvbsrl_v(_out4, 8); \
+ _out7 = __lasx_xvbsrl_v(_out6, 8); \
+ }
/*
* =============================================================================
* Description : Transpose 8x8 block with halfword elements in vectors.
* Arguments : Inputs - _in0, _in1, ~
* Outputs - _out0, _out1, ~
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : LASX_TRANSPOSE8x8_H
* _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
* _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
@@ -1704,40 +1772,41 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
* =============================================================================
*/
-#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
- _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m256i _s0_m, _s1_m; \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- \
- _s0_m = __lasx_xvilvl_h(_in6, _in4); \
- _s1_m = __lasx_xvilvl_h(_in7, _in5); \
- _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
- _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvh_h(_in6, _in4); \
- _s1_m = __lasx_xvilvh_h(_in7, _in5); \
- _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
- _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
- \
- _s0_m = __lasx_xvilvl_h(_in2, _in0); \
- _s1_m = __lasx_xvilvl_h(_in3, _in1); \
- _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
- _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvh_h(_in2, _in0); \
- _s1_m = __lasx_xvilvh_h(_in3, _in1); \
- _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
- _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
- \
- _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
- _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
- _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
- _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
- _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
- _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
- _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
- _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
-}
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _s0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
+ _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
+ _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
+ _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
+ _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
+ _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
+ _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
+ _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
+ }
/*
* =============================================================================
@@ -1752,34 +1821,34 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out3 = _in0 - _in3;
* =============================================================================
*/
-#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- _out0 = __lasx_xvadd_b(_in0, _in3); \
- _out1 = __lasx_xvadd_b(_in1, _in2); \
- _out2 = __lasx_xvsub_b(_in1, _in2); \
- _out3 = __lasx_xvsub_b(_in0, _in3); \
-}
-#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- _out0 = __lasx_xvadd_h(_in0, _in3); \
- _out1 = __lasx_xvadd_h(_in1, _in2); \
- _out2 = __lasx_xvsub_h(_in1, _in2); \
- _out3 = __lasx_xvsub_h(_in0, _in3); \
-}
-#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- _out0 = __lasx_xvadd_w(_in0, _in3); \
- _out1 = __lasx_xvadd_w(_in1, _in2); \
- _out2 = __lasx_xvsub_w(_in1, _in2); \
- _out3 = __lasx_xvsub_w(_in0, _in3); \
-}
-#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- _out0 = __lasx_xvadd_d(_in0, _in3); \
- _out1 = __lasx_xvadd_d(_in1, _in2); \
- _out2 = __lasx_xvsub_d(_in1, _in2); \
- _out3 = __lasx_xvsub_d(_in0, _in3); \
-}
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in3); \
+ _out1 = __lasx_xvadd_b(_in1, _in2); \
+ _out2 = __lasx_xvsub_b(_in1, _in2); \
+ _out3 = __lasx_xvsub_b(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in3); \
+ _out1 = __lasx_xvadd_h(_in1, _in2); \
+ _out2 = __lasx_xvsub_h(_in1, _in2); \
+ _out3 = __lasx_xvsub_h(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in3); \
+ _out1 = __lasx_xvadd_w(_in1, _in2); \
+ _out2 = __lasx_xvsub_w(_in1, _in2); \
+ _out3 = __lasx_xvsub_w(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in3); \
+ _out1 = __lasx_xvadd_d(_in1, _in2); \
+ _out2 = __lasx_xvsub_d(_in1, _in2); \
+ _out3 = __lasx_xvsub_d(_in0, _in3); \
+ }
/*
* =============================================================================
@@ -1798,59 +1867,63 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out7 = _in0 - _in7;
* =============================================================================
*/
-#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lasx_xvadd_b(_in0, _in7); \
- _out1 = __lasx_xvadd_b(_in1, _in6); \
- _out2 = __lasx_xvadd_b(_in2, _in5); \
- _out3 = __lasx_xvadd_b(_in3, _in4); \
- _out4 = __lasx_xvsub_b(_in3, _in4); \
- _out5 = __lasx_xvsub_b(_in2, _in5); \
- _out6 = __lasx_xvsub_b(_in1, _in6); \
- _out7 = __lasx_xvsub_b(_in0, _in7); \
-}
-
-#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lasx_xvadd_h(_in0, _in7); \
- _out1 = __lasx_xvadd_h(_in1, _in6); \
- _out2 = __lasx_xvadd_h(_in2, _in5); \
- _out3 = __lasx_xvadd_h(_in3, _in4); \
- _out4 = __lasx_xvsub_h(_in3, _in4); \
- _out5 = __lasx_xvsub_h(_in2, _in5); \
- _out6 = __lasx_xvsub_h(_in1, _in6); \
- _out7 = __lasx_xvsub_h(_in0, _in7); \
-}
-
-#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lasx_xvadd_w(_in0, _in7); \
- _out1 = __lasx_xvadd_w(_in1, _in6); \
- _out2 = __lasx_xvadd_w(_in2, _in5); \
- _out3 = __lasx_xvadd_w(_in3, _in4); \
- _out4 = __lasx_xvsub_w(_in3, _in4); \
- _out5 = __lasx_xvsub_w(_in2, _in5); \
- _out6 = __lasx_xvsub_w(_in1, _in6); \
- _out7 = __lasx_xvsub_w(_in0, _in7); \
-}
-
-#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lasx_xvadd_d(_in0, _in7); \
- _out1 = __lasx_xvadd_d(_in1, _in6); \
- _out2 = __lasx_xvadd_d(_in2, _in5); \
- _out3 = __lasx_xvadd_d(_in3, _in4); \
- _out4 = __lasx_xvsub_d(_in3, _in4); \
- _out5 = __lasx_xvsub_d(_in2, _in5); \
- _out6 = __lasx_xvsub_d(_in1, _in6); \
- _out7 = __lasx_xvsub_d(_in0, _in7); \
-}
-
-#endif //LASX
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in7); \
+ _out1 = __lasx_xvadd_b(_in1, _in6); \
+ _out2 = __lasx_xvadd_b(_in2, _in5); \
+ _out3 = __lasx_xvadd_b(_in3, _in4); \
+ _out4 = __lasx_xvsub_b(_in3, _in4); \
+ _out5 = __lasx_xvsub_b(_in2, _in5); \
+ _out6 = __lasx_xvsub_b(_in1, _in6); \
+ _out7 = __lasx_xvsub_b(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in7); \
+ _out1 = __lasx_xvadd_h(_in1, _in6); \
+ _out2 = __lasx_xvadd_h(_in2, _in5); \
+ _out3 = __lasx_xvadd_h(_in3, _in4); \
+ _out4 = __lasx_xvsub_h(_in3, _in4); \
+ _out5 = __lasx_xvsub_h(_in2, _in5); \
+ _out6 = __lasx_xvsub_h(_in1, _in6); \
+ _out7 = __lasx_xvsub_h(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in7); \
+ _out1 = __lasx_xvadd_w(_in1, _in6); \
+ _out2 = __lasx_xvadd_w(_in2, _in5); \
+ _out3 = __lasx_xvadd_w(_in3, _in4); \
+ _out4 = __lasx_xvsub_w(_in3, _in4); \
+ _out5 = __lasx_xvsub_w(_in2, _in5); \
+ _out6 = __lasx_xvsub_w(_in1, _in6); \
+ _out7 = __lasx_xvsub_w(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in7); \
+ _out1 = __lasx_xvadd_d(_in1, _in6); \
+ _out2 = __lasx_xvadd_d(_in2, _in5); \
+ _out3 = __lasx_xvadd_d(_in3, _in4); \
+ _out4 = __lasx_xvsub_d(_in3, _in4); \
+ _out5 = __lasx_xvsub_d(_in2, _in5); \
+ _out6 = __lasx_xvsub_d(_in1, _in6); \
+ _out7 = __lasx_xvsub_d(_in0, _in7); \
+ }
+
+#endif // LASX
/*
* =============================================================================
@@ -1863,15 +1936,13 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* VP:1,2,3,4,
* =============================================================================
*/
-#define VECT_PRINT(RTYPE, element_num, in0, enter) \
-{ \
- RTYPE _tmp0 = (RTYPE)in0; \
- int _i = 0; \
- if (enter) \
- printf("\nVP:"); \
- for(_i = 0; _i < element_num; _i++) \
- printf("%d,",_tmp0[_i]); \
-}
+#define VECT_PRINT(RTYPE, element_num, in0, enter) \
+ { \
+ RTYPE _tmp0 = (RTYPE)in0; \
+ int _i = 0; \
+ if (enter) printf("\nVP:"); \
+ for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
+ }
#endif /* LOONGSON_INTRINSICS_H */
#endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
--
2.20.1
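
Not part of the patch above — a minimal usage sketch of the reformatted LASX helpers, assuming a LoongArch toolchain with LASX enabled (-mlasx) and this header on the include path. The function name transpose_butterfly_4x4_d is illustrative only; it combines the 4x4 double-word transpose with the matching add/sub butterfly exactly as the macro descriptions in the header state:

#include "libavutil/loongarch/loongson_intrinsics.h"

/* Transpose a 4x4 block of double-word elements, then apply the
 * butterfly: out[0] = t0 + t3, out[1] = t1 + t2,
 *            out[2] = t1 - t2, out[3] = t0 - t3. */
static void transpose_butterfly_4x4_d(__m256i in0, __m256i in1,
                                      __m256i in2, __m256i in3,
                                      __m256i out[4])
{
    __m256i t0, t1, t2, t3;

    LASX_TRANSPOSE4x4_D(in0, in1, in2, in3, t0, t1, t2, t3);
    LASX_BUTTERFLY_4_D(t0, t1, t2, t3, out[0], out[1], out[2], out[3]);
}

Each macro expands to a brace-enclosed block of __lasx_* intrinsics, so the temporaries t0..t3 and the out[] operands must all be __m256i lvalues.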
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* [FFmpeg-devel] [PATCH v1 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
2022-02-14 8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
@ 2022-02-14 8:25 ` Hao Chen
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf " Hao Chen
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Hao Chen @ 2022-02-14 8:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lu Wang
From: Lu Wang <wanglu@loongson.cn>
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 94fps
after : 110fps
---
libavcodec/hevcdsp.c | 2 +
libavcodec/hevcdsp.h | 1 +
libavcodec/loongarch/Makefile | 2 +
libavcodec/loongarch/hevcdsp_init_loongarch.c | 90 +
libavcodec/loongarch/hevcdsp_lsx.c | 3299 +++++++++++++++++
libavcodec/loongarch/hevcdsp_lsx.h | 88 +
6 files changed, 3482 insertions(+)
create mode 100644 libavcodec/loongarch/hevcdsp_init_loongarch.c
create mode 100644 libavcodec/loongarch/hevcdsp_lsx.c
create mode 100644 libavcodec/loongarch/hevcdsp_lsx.h
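
As a reference point only (not part of the patch), the operation that the hevc_copy_*_lsx kernels below vectorize is the usual HEVC widen-and-shift into the 16-bit intermediate buffer, i.e. dst = src << 6. A scalar sketch, with the hypothetical name hevc_copy_ref and strides counted in elements:

#include <stdint.h>
#include <stddef.h>

/* Each 8-bit source pixel is widened and shifted left by 6 into the
 * signed 16-bit intermediate destination. */
static void hevc_copy_ref(const uint8_t *src, ptrdiff_t src_stride,
                          int16_t *dst, ptrdiff_t dst_stride,
                          int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = (int16_t)(src[x] << 6);
        src += src_stride;
        dst += dst_stride;
    }
}

The LSX versions replace the inner loop with __lsx_vsllwil_hu_bu and related intrinsics, handling block widths from 4 to 64 pixels per specialized function.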
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index fe272ac1ce..b6c3f00819 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -267,4 +267,6 @@ int i = 0;
ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
if (ARCH_MIPS)
ff_hevc_dsp_init_mips(hevcdsp, bit_depth);
+ if (ARCH_LOONGARCH)
+ ff_hevc_dsp_init_loongarch(hevcdsp, bit_depth);
}
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 0e013a8328..d642ecf9c6 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -134,5 +134,6 @@ void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
+void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth);
#endif /* AVCODEC_HEVCDSP_H */
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 3c15c2edeb..9f416d0c6e 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -8,6 +8,7 @@ OBJS-$(CONFIG_VC1DSP) += loongarch/vc1dsp_init_loongarch.o
OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_init_loongarch.o
OBJS-$(CONFIG_IDCTDSP) += loongarch/idctdsp_init_loongarch.o
OBJS-$(CONFIG_VIDEODSP) += loongarch/videodsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
@@ -24,3 +25,4 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
loongarch/vp9_intra_lsx.o \
loongarch/vp9_lpf_lsx.o \
loongarch/vp9_idct_lsx.o
+LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
new file mode 100644
index 0000000000..fc0e8fb0df
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "hevcdsp_lsx.h"
+
+void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_lsx(cpu_flags)) {
+ if (bit_depth == 8) {
+ c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_lsx;
+ c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_lsx;
+ c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_lsx;
+ c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_lsx;
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_lsx;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_lsx;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_lsx;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_lsx;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_lsx;
+
+ c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_lsx;
+ c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_lsx;
+ c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_lsx;
+ c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_lsx;
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_lsx;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_lsx;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_lsx;
+
+ c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_lsx;
+ c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_lsx;
+ c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_lsx;
+ c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_lsx;
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_lsx;
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_lsx;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_lsx;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_lsx;
+
+ c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_lsx;
+ c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_lsx;
+ c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_lsx;
+ c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_lsx;
+ c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_lsx;
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_lsx;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_lsx;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_lsx;
+
+ c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_lsx;
+ c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_lsx;
+ c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_lsx;
+ c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_lsx;
+ c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_lsx;
+ c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_lsx;
+ c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_lsx;
+ c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_lsx;
+
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_lsx;
+
+ c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_lsx;
+ c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_lsx;
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_lsx;
+
+ c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_lsx;
+ c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_lsx;
+ c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
+ c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+ }
+ }
+}
diff --git a/libavcodec/loongarch/hevcdsp_lsx.c b/libavcodec/loongarch/hevcdsp_lsx.c
new file mode 100644
index 0000000000..a520f02bd1
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lsx.c
@@ -0,0 +1,3299 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+/* hevc_copy: dst = src << 6 */
+static void hevc_copy_4w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t loop_cnt = height >> 3;
+ int32_t res = height & 0x07;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in2, in3;
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src5, src4, src7, src6,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0, in1, in2, in3);
+
+ __lsx_vstelm_d(in0, dst, 0, 0);
+ __lsx_vstelm_d(in0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(in1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(in1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(in2, dst, 0, 0);
+ __lsx_vstelm_d(in2, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(in3, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(in3, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ in0 = __lsx_vsllwil_hu_bu(src0, 6);
+ __lsx_vstelm_d(in0, dst, 0, 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_6w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t loop_cnt = (height >> 3);
+ int32_t res = height & 0x07;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in4, in5, in6, in7);
+
+ __lsx_vstelm_d(in0, dst, 0, 0);
+ __lsx_vstelm_w(in0, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 0, 0);
+ __lsx_vstelm_w(in1, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in2, dst, 0, 0);
+ __lsx_vstelm_w(in2, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in3, dst, 0, 0);
+ __lsx_vstelm_w(in3, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in4, dst, 0, 0);
+ __lsx_vstelm_w(in4, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in5, dst, 0, 0);
+ __lsx_vstelm_w(in5, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in6, dst, 0, 0);
+ __lsx_vstelm_w(in6, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in7, dst, 0, 0);
+ __lsx_vstelm_w(in7, dst, 8, 2);
+ dst += dst_stride;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ in0 = __lsx_vsllwil_hu_bu(src0, 6);
+ src += src_stride;
+ __lsx_vstelm_d(in0, dst, 0, 0);
+ __lsx_vstelm_w(in0, dst, 8, 2);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_8w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride_x << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ int32_t loop_cnt = height >> 3;
+ int32_t res = height & 0x07;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in4, in5, in6, in7);
+ __lsx_vst(in0, dst, 0);
+ __lsx_vstx(in1, dst, dst_stride_x);
+ __lsx_vstx(in2, dst, dst_stride_2x);
+ __lsx_vstx(in3, dst, dst_stride_3x);
+ dst += dst_stride_2x;
+ __lsx_vst(in4, dst, 0);
+ __lsx_vstx(in5, dst, dst_stride_x);
+ __lsx_vstx(in6, dst, dst_stride_2x);
+ __lsx_vstx(in7, dst, dst_stride_3x);
+ dst += dst_stride_2x;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ in0 = __lsx_vsllwil_hu_bu(src0, 6);
+ __lsx_vst(in0, dst, 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_12w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ uint32_t res = height & 0x07;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride_x << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in0_r, in1_r, in2_r, in3_r;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ __lsx_vstelm_d(in0, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(in0, dst, 16, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 16, 1);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP2_ARG2(__lsx_vilvh_w, src5, src4, src7, src6, src0, src1);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ __lsx_vstelm_d(in0, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(in0, dst, 16, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 16, 1);
+ dst += dst_stride;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ in0 = __lsx_vsllwil_hu_bu(src0, 6);
+ src1 = __lsx_vilvh_b(zero, src0);
+ in1 = __lsx_vslli_h(src1, 6);
+ __lsx_vst(in0, dst, 0);
+ __lsx_vstelm_d(in1, dst, 16, 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_16w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ __m128i zero = __lsx_vldi(0);
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ int32_t loop_cnt = height >> 3;
+ int32_t res = height & 0x07;
+ int16_t* dst1 = dst + 8;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ __lsx_vst(in0_l, dst1, 0);
+ __lsx_vstx(in1_l, dst1, dst_stride_x);
+ __lsx_vstx(in2_l, dst1, dst_stride_2x);
+ __lsx_vstx(in3_l, dst1, dst_stride_3x);
+ dst += dst_stride_2x;
+ dst1 += dst_stride_2x;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ __lsx_vst(in0_l, dst1, 0);
+ __lsx_vstx(in1_l, dst1, dst_stride_x);
+ __lsx_vstx(in2_l, dst1, dst_stride_2x);
+ __lsx_vstx(in3_l, dst1, dst_stride_3x);
+ dst += dst_stride_2x;
+ dst1 += dst_stride_2x;
+ }
+ if (res) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ dst += 8;
+ __lsx_vst(in0_l, dst, 0);
+ __lsx_vstx(in1_l, dst, dst_stride_x);
+ __lsx_vstx(in2_l, dst, dst_stride_2x);
+ __lsx_vstx(in3_l, dst, dst_stride_3x);
+ }
+}
+
+static void hevc_copy_24w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ uint8_t *_src = src + 16;
+ int16_t *dst1 = dst;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(_src, src_stride_3x);
+ _src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+ in0_l, in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ dst1 = dst + 8;
+ __lsx_vst(in0_l, dst1, 0);
+ __lsx_vstx(in1_l, dst1, dst_stride_x);
+ __lsx_vstx(in2_l, dst1, dst_stride_2x);
+ __lsx_vstx(in3_l, dst1, dst_stride_3x);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ dst1 = dst1 + 8;
+ __lsx_vst(in0_r, dst1, 0);
+ __lsx_vstx(in1_r, dst1, dst_stride_x);
+ __lsx_vstx(in2_r, dst1, dst_stride_2x);
+ __lsx_vstx(in3_r, dst1, dst_stride_3x);
+ dst += dst_stride_2x;
+ }
+}
+
+static void hevc_copy_32w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src = src + 16;
+ __m128i zero = {0};
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src2, src4);
+ src6 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src1 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+ src3, src5);
+ src7 = __lsx_vldx(_src, src_stride_3x);
+ _src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+ in0_l, in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(in2_r, dst, 0);
+ __lsx_vst(in2_l, dst, 16);
+ __lsx_vst(in3_r, dst, 32);
+ __lsx_vst(in3_l, dst, 48);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(in2_r, dst, 0);
+ __lsx_vst(in2_l, dst, 16);
+ __lsx_vst(in3_r, dst, 32);
+ __lsx_vst(in3_l, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_48w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i zero = {0};
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i src8, src9, src10, src11;
+ __m128i in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
+ __m128i in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src2 = __lsx_vld(src, 32);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src3, src4);
+ src5 = __lsx_vld(src, 32);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src6, src7);
+ src8 = __lsx_vld(src, 32);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src9, src10);
+ src11 = __lsx_vld(src, 32);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, in0_l, in1_l, in2_l, in3_l);
+ DUP2_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, in4_l, in5_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, in4_r, in5_r);
+ DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ __lsx_vst(in2_r, dst, 64);
+ __lsx_vst(in2_l, dst, 80);
+ dst += dst_stride;
+ __lsx_vst(in3_r, dst, 0);
+ __lsx_vst(in3_l, dst, 16);
+ __lsx_vst(in4_r, dst, 32);
+ __lsx_vst(in4_l, dst, 48);
+ __lsx_vst(in5_r, dst, 64);
+ __lsx_vst(in5_l, dst, 80);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src6, zero, src7, zero, src8, zero, src9,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP2_ARG2(__lsx_vilvh_b, zero, src10, zero, src11, in4_l, in5_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src6, 6, src7, 6, src8, 6, src9, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src10, 6, src11, 6, in4_r, in5_r);
+ DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ __lsx_vst(in2_r, dst, 64);
+ __lsx_vst(in2_l, dst, 80);
+ dst += dst_stride;
+ __lsx_vst(in3_r, dst, 0);
+ __lsx_vst(in3_l, dst, 16);
+ __lsx_vst(in4_r, dst, 32);
+ __lsx_vst(in4_l, dst, 48);
+ __lsx_vst(in5_r, dst, 64);
+ __lsx_vst(in5_l, dst, 80);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_64w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i zero = {0};
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ src0, src1, src2, src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ src4, src5, src6, src7);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+ in0_l, in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ __lsx_vst(in2_r, dst, 64);
+ __lsx_vst(in2_l, dst, 80);
+ __lsx_vst(in3_r, dst, 96);
+ __lsx_vst(in3_l, dst, 112);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero,
+ src7, in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ __lsx_vst(in2_r, dst, 64);
+ __lsx_vst(in2_l, dst, 80);
+ __lsx_vst(in3_r, dst, 96);
+ __lsx_vst(in3_l, dst, 112);
+ dst += dst_stride;
+ }
+}
+
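+/* Horizontal 8-tap luma filter, 4-pixel width: two input rows are packed into
+ * one vector by the shuffle masks, so each dot-product chain yields two output
+ * rows; eight rows per main iteration, leftover rows handled two at a time. */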
+static void hevc_hz_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt = height >> 3;
+ uint32_t res = (height & 0x7) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 16);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (;loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1,
+ src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask0, src3, src2, mask1, src3,
+ src2, mask2, src3, src2, mask3, vec0, vec1, vec2, vec3);
+ dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src4, mask0, src5, src4, mask1, src5,
+ src4, mask2, src5, src4, mask3, vec0, vec1, vec2, vec3);
+ dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src7, src6, mask0, src7, src6, mask1, src7,
+ src6, mask2, src7, src6, mask3, vec0, vec1, vec2, vec3);
+ dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(dst2, dst, 0, 0);
+ __lsx_vstelm_d(dst2, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst3, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst3, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ src1 = __lsx_vldx(src, src_stride);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1,
+ src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ src += src_stride_2x;
+ dst += dst_stride_2x;
+ }
+}
+
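+/* Horizontal 8-tap luma filter, 8-pixel width, four rows per iteration.
+ * dst_stride_x/_2x/_3x hold byte offsets of 1/2/3 output rows for __lsx_vstx
+ * (dst is int16_t), so dst += dst_stride_2x advances four rows. */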
+static void hevc_hz_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+ dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+ dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
+ dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride_x);
+ __lsx_vstx(dst2, dst, dst_stride_2x);
+ __lsx_vstx(dst3, dst, dst_stride_3x);
+ dst += dst_stride_2x;
+ }
+}
+
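+/* Horizontal 8-tap luma filter, 12-pixel width: the left 8 columns use the
+ * 8-wide masks, the right 4 columns use the paired-row masks (mask4..mask7)
+ * on a second source pointer offset by 8; four rows per iteration. */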
+static void hevc_hz_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
+
+ src -= 3;
+ _src = src + 8;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ mask4 = __lsx_vld(ff_hevc_mask_arr, 16);
+ DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask4, 6);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(_src, src_stride_3x);
+ src += src_stride_4x;
+ _src += src_stride_4x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask4, src7, src6, mask4,
+ vec4, vec5);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask5, src7, src6, mask5,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+ dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask6, src7, src6, mask6,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+ dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask7, src7, src6, mask7,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+ dst4, dst5);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstelm_d(dst4, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(dst1, dst, 0);
+ __lsx_vstelm_d(dst4, dst, 16, 1);
+ dst += dst_stride;
+ __lsx_vst(dst2, dst, 0);
+ __lsx_vstelm_d(dst5, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(dst3, dst, 0);
+ __lsx_vstelm_d(dst5, dst, 16, 1);
+ dst += dst_stride;
+ }
+}
+
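+/* Horizontal 8-tap luma filter, 16-pixel width: two 8-pixel groups per row
+ * from loads at offsets 0 and 8, two rows per iteration. */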
+static void hevc_hz_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i mask0;
+
+ src -= 3;
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ dst += dst_stride;
+ __lsx_vst(dst2, dst, 0);
+ __lsx_vst(dst3, dst, 16);
+ dst += dst_stride;
+ }
+}
+
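+/* Horizontal 8-tap luma filter, 24-pixel width: three 8-pixel dot-product
+ * groups per row built from two 16-byte loads, two rows per iteration. */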
+static void hevc_hz_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+ mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src2, src3);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
+ src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src3, mask0,
+ vec4, vec5);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1,
+ src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask5, src3, src3, mask1,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+ dst4, dst5);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1,
+ src1, mask2, src2, src2, mask2, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask6, src3, src3, mask2,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+ dst4, dst5);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1,
+ src1, mask3, src2, src2, mask3, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask7, src3, src3, mask3,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+ dst4, dst5);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ dst += dst_stride;
+ __lsx_vst(dst3, dst, 0);
+ __lsx_vst(dst4, dst, 16);
+ __lsx_vst(dst5, dst, 32);
+ dst += dst_stride;
+ }
+}
+
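+/* Horizontal 8-tap luma filter, 32-pixel width, one row per iteration; the
+ * third load at offset 24 supplies the extra taps for pixels 24..31. */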
+static void hevc_hz_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8,
+ mask1, mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src2 = __lsx_vld(src, 24);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1,
+ src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3);
+ dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1,
+ mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+ dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2,
+ mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+ dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+ dst += dst_stride;
+ }
+}
+
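+/* Horizontal 8-tap luma filter, 48-pixel width, one row per iteration: the
+ * first 32 pixels come from the loads at 0/16, the last 16 pixels from the
+ * loads at 32/40. */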
+static void hevc_hz_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+ mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src2 = __lsx_vld(src, 32);
+ src3 = __lsx_vld(src, 40);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1, src1,
+ mask0, src2, src1, mask4, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1,
+ src1, mask1, src2, src1, mask5, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1,
+ src1, mask2, src2, src1, mask6, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1,
+ src1, mask3, src2, src1, mask7, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec4, vec5);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+ dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+ dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+ dst4, dst5);
+ __lsx_vst(dst4, dst, 64);
+ __lsx_vst(dst5, dst, 80);
+ dst += dst_stride;
+ }
+}
+
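+/* Horizontal 8-tap luma filter, 64-pixel width, one row per iteration; the
+ * fifth load at offset 56 supplies the extra taps for the last 8 pixels. */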
+static void hevc_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3, src4;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+ mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ src0, src1, src2, src3);
+ src4 = __lsx_vld(src, 56);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ __lsx_vst(dst0, dst, 0);
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1,
+ src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3);
+ dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+ __lsx_vst(dst1, dst, 16);
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+ dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+ __lsx_vst(dst2, dst, 32);
+
+ DUP4_ARG3(__lsx_vshuf_b, src2, src1, mask4, src2, src1, mask5, src2,
+ src1, mask6, src2, src1, mask7, vec0, vec1, vec2, vec3);
+ dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+ __lsx_vst(dst3, dst, 48);
+
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+ dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+ dst4, dst4);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+ __lsx_vst(dst4, dst, 64);
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src2, mask5, src3,
+ src2, mask6, src3, src2, mask7, vec0, vec1, vec2, vec3);
+ dst5 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec1, filt1, dst5, vec2, filt2,
+ dst5, dst5);
+ dst5 = __lsx_vdp2add_h_bu_b(dst5, vec3, filt3);
+ __lsx_vst(dst5, dst, 80);
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
+ dst6 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec1, filt1, dst6, vec2, filt2,
+ dst6, dst6);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec3, filt3);
+ __lsx_vst(dst6, dst, 96);
+
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
+ dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ __lsx_vst(dst7, dst, 112);
+ dst += dst_stride;
+ }
+}
+
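+/* Vertical 8-tap luma filter, 4-pixel width: consecutive rows are byte
+ * interleaved and paired so two output rows share one vector; eight rows per
+ * main iteration, leftover (height % 8) rows handled two at a time. */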
+static void hevc_vt_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t res = (height & 0x07) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i src9, src10, src11, src12, src13, src14;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i src1110_r, src1211_r, src1312_r, src1413_r;
+ __m128i src2110, src4332, src6554, src8776, src10998;
+ __m128i src12111110, src14131312;
+ __m128i dst10, dst32, dst54, dst76;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r,
+ src2110, src4332);
+ src6554 = __lsx_vilvl_d(src65_r, src54_r);
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src11 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src12, src13);
+ src14 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vilvl_b, src11, src10, src12, src11, src13, src12, src14,
+ src13, src1110_r, src1211_r, src1312_r, src1413_r);
+ DUP4_ARG2(__lsx_vilvl_d, src87_r, src76_r, src109_r, src98_r, src1211_r,
+ src1110_r, src1413_r, src1312_r, src8776, src10998,
+ src12111110, src14131312);
+
+ dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
+ filt2, dst10, dst10);
+ dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
+ dst32 = __lsx_vdp2_h_bu_b(src4332, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst32, src6554, filt1, dst32, src8776,
+ filt2, dst32, dst32);
+ dst32 = __lsx_vdp2add_h_bu_b(dst32, src10998, filt3);
+ dst54 = __lsx_vdp2_h_bu_b(src6554, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst54, src8776, filt1,
+ dst54, src10998, filt2, dst54, dst54);
+ dst54 = __lsx_vdp2add_h_bu_b(dst54, src12111110, filt3);
+ dst76 = __lsx_vdp2_h_bu_b(src8776, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst76, src10998, filt1, dst76,
+ src12111110, filt2, dst76, dst76);
+ dst76 = __lsx_vdp2add_h_bu_b(dst76, src14131312, filt3);
+
+ __lsx_vstelm_d(dst10, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst10, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst32, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst32, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst54, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst54, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst76, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst76, dst, 0, 1);
+ dst += dst_stride;
+
+ src2110 = src10998;
+ src4332 = src12111110;
+ src6554 = src14131312;
+ src6 = src14;
+ }
+ for (;res--;) {
+ src7 = __lsx_vld(src, 0);
+ src8 = __lsx_vldx(src, src_stride);
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+ src += src_stride_2x;
+ src8776 = __lsx_vilvl_d(src87_r, src76_r);
+
+ dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
+ filt2, dst10, dst10);
+ dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
+
+ __lsx_vstelm_d(dst10, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst10, dst, 0, 1);
+ dst += dst_stride;
+
+ src2110 = src4332;
+ src4332 = src6554;
+ src6554 = src8776;
+ src6 = src8;
+ }
+}
+
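+/* Vertical 8-tap luma filter, 8-pixel width, four rows per iteration using a
+ * sliding window of byte-interleaved row pairs; dst_stride_x/_2x/_3x are byte
+ * offsets of 1/2/3 output rows for __lsx_vstx. */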
+static void hevc_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_r, src87_r, src98_r, src109_r);
+
+ dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+ src54_r, filt2, dst0_r, dst0_r);
+ dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+ dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+ src65_r, filt2, dst1_r, dst1_r);
+ dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+ dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+ src76_r, filt2, dst2_r, dst2_r);
+ dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+ dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+ src87_r, filt2, dst3_r, dst3_r);
+ dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride_x);
+ __lsx_vstx(dst2_r, dst, dst_stride_2x);
+ __lsx_vstx(dst3_r, dst, dst_stride_3x);
+ dst += dst_stride_2x;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
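+/* Vertical 8-tap luma filter, 12-pixel width: the low halves of the
+ * interleaved row pairs cover the left 8 columns, the packed high halves
+ * cover the remaining 4 columns. */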
+static void hevc_vt_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
+ __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
+ __m128i src2110, src4332, src6554, src8776, src10998;
+ __m128i dst0_l, dst1_l;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_l, src32_l, src54_l, src21_l);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+ DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l,
+ src2110, src4332);
+ src6554 = __lsx_vilvl_d(src65_l, src54_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_l, src87_l, src98_l, src109_l);
+ DUP2_ARG2(__lsx_vilvl_d, src87_l, src76_l, src109_l, src98_l,
+ src8776, src10998);
+
+ dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+ src54_r, filt2, dst0_r, dst0_r);
+ dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+ dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+ src65_r, filt2, dst1_r, dst1_r);
+ dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+ dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+ src76_r, filt2, dst2_r, dst2_r);
+ dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+ dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+ src87_r, filt2, dst3_r, dst3_r);
+ dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+ dst0_l = __lsx_vdp2_h_bu_b(src2110, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src4332, filt1, dst0_l,
+ src6554, filt2, dst0_l, dst0_l);
+ dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src8776, filt3);
+ dst1_l = __lsx_vdp2_h_bu_b(src4332, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src6554, filt1, dst1_l,
+ src8776, filt2, dst1_l, dst1_l);
+ dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src10998, filt3);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstelm_d(dst0_l, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vstelm_d(dst0_l, dst, 16, 1);
+ dst += dst_stride;
+ __lsx_vst(dst2_r, dst, 0);
+ __lsx_vstelm_d(dst1_l, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(dst3_r, dst, 0);
+ __lsx_vstelm_d(dst1_l, dst, 16, 1);
+ dst += dst_stride;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
+
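+/* Vertical 8-tap luma filter for widths that are multiples of 16: the outer
+ * loop walks 16-column stripes, the inner loop produces four rows at a time
+ * from low/high byte-interleaved row pairs. */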
+static void hevc_vt_8t_16multx4mult_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height,
+ int32_t width)
+{
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t loop_cnt, cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
+ __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
+ __m128i dst0_l, dst1_l, dst2_l, dst3_l;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ for (cnt = width >> 4; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_l, src32_l, src54_l, src21_l);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src8, src9);
+ src10 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
+ src10, src9, src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
+ src10, src9, src76_l, src87_l, src98_l, src109_l);
+
+ dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+ src54_r, filt2, dst0_r, dst0_r);
+ dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+ dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+ src65_r, filt2, dst1_r, dst1_r);
+ dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+ dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+ src76_r, filt2, dst2_r, dst2_r);
+ dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+ dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+ src87_r, filt2, dst3_r, dst3_r);
+ dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+ dst0_l = __lsx_vdp2_h_bu_b(src10_l, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src32_l, filt1, dst0_l,
+ src54_l, filt2, dst0_l, dst0_l);
+ dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src76_l, filt3);
+ dst1_l = __lsx_vdp2_h_bu_b(src21_l, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src43_l, filt1, dst1_l,
+ src65_l, filt2, dst1_l, dst1_l);
+ dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src87_l, filt3);
+ dst2_l = __lsx_vdp2_h_bu_b(src32_l, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_l, src54_l, filt1, dst2_l,
+ src76_l, filt2, dst2_l, dst2_l);
+ dst2_l = __lsx_vdp2add_h_bu_b(dst2_l, src98_l, filt3);
+ dst3_l = __lsx_vdp2_h_bu_b(src43_l, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_l, src65_l, filt1, dst3_l,
+ src87_l, filt2, dst3_l, dst3_l);
+ dst3_l = __lsx_vdp2add_h_bu_b(dst3_l, src109_l, filt3);
+
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ __lsx_vst(dst0_l, dst_tmp, 16);
+ dst_tmp += dst_stride;
+ __lsx_vst(dst1_r, dst_tmp, 0);
+ __lsx_vst(dst1_l, dst_tmp, 16);
+ dst_tmp += dst_stride;
+ __lsx_vst(dst2_r, dst_tmp, 0);
+ __lsx_vst(dst2_l, dst_tmp, 16);
+ dst_tmp += dst_stride;
+ __lsx_vst(dst3_r, dst_tmp, 0);
+ __lsx_vst(dst3_l, dst_tmp, 16);
+ dst_tmp += dst_stride;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void hevc_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+}
+
+static void hevc_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+ hevc_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride,
+ filter, height);
+}
+
+static void hevc_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 32);
+}
+
+static void hevc_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 48);
+}
+
+static void hevc_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 64);
+}
+
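+/* 2D (horizontal + vertical) 8-tap luma filter, 4-pixel width: the horizontal
+ * pass produces 16-bit paired-row results, the vertical pass accumulates them
+ * in 32 bits and shifts right by 6 before packing back to 16 bits. */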
+static void hevc_hv_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask1, mask2, mask3;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+ __m128i dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+ __m128i mask0;
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 16);
+
+ src -= src_stride_3x + 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask0, src3, src0, mask1, src3, src0,
+ mask2, src3, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask0, src4, src1, mask1, src4, src1,
+ mask2, src4, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask0, src5, src2, mask1, src5, src2,
+ mask2, src5, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask0, src6, src3, mask1, src6, src3,
+ mask2, src6, src3, mask3, vec12, vec13, vec14, vec15);
+ dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
+ dst30, dst30);
+ dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
+ dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
+ dst41, dst41);
+ dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
+ dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
+ dst52, dst52);
+ dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
+ dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
+ dst63, dst63);
+ dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
+ dst32_r = __lsx_vilvl_h(dst63, dst52);
+ dst65_r = __lsx_vilvh_h(dst63, dst52);
+ dst66 = __lsx_vreplvei_d(dst63, 1);
+
+ for (loop_cnt = height >> 2; loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask0, src9, src7, mask1, src9, src7,
+ mask2, src9, src7, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask0, src10, src8, mask1, src10, src8,
+ mask2, src10, src8, mask3, vec4, vec5, vec6, vec7);
+
+ dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
+ dst97, dst97);
+ dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
+ dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
+ filt2, dst108, dst108);
+ dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
+ dst109_r = __lsx_vilvh_h(dst108, dst97);
+ dst66 = __lsx_vreplvei_d(dst97, 1);
+ dst98_r = __lsx_vilvl_h(dst66, dst108);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+ filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
+ filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
+ filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
+ filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
+ __lsx_vstelm_d(dst0_r, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst0_r, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2_r, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2_r, dst, 0, 1);
+ dst += dst_stride;
+
+ dst10_r = dst54_r;
+ dst32_r = dst76_r;
+ dst54_r = dst98_r;
+ dst21_r = dst65_r;
+ dst43_r = dst87_r;
+ dst65_r = dst109_r;
+ dst66 = __lsx_vreplvei_d(dst108, 1);
+ }
+}
+
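+/* 2D 8-tap luma filter for widths that are multiples of 8: per 8-column
+ * stripe, seven rows are pre-filtered horizontally, then each loop iteration
+ * filters one new row and runs the vertical pass over the sliding window. */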
+static void hevc_hv_8t_8multx1mult_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height,
+ int32_t width)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask1, mask2, mask3;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i dst0_r, dst0_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+ __m128i mask0 = {0x403030202010100, 0x807070606050504};
+
+ src -= src_stride_3x + 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (cnt = width >> 3; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+
+ /* row 0 row 1 row 2 row 3 */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
+ dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
+ dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
+
+ /* row 4 row 5 row 6 */
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+ src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+ src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+ dst4, dst4);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+ dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
+ dst5, dst5);
+ dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
+ dst6, dst6);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ src_tmp += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+ filt2, dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+ dst6, dst10_r, dst32_r, dst54_r, dst76_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+ dst6, dst10_l, dst32_l, dst54_l, dst76_l);
+
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+ dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+ dst76_l, filt_h3, dst0_r, dst0_l);
+ dst0_r = __lsx_vsrai_w(dst0_r, 6);
+ dst0_l = __lsx_vsrai_w(dst0_l, 6);
+
+ dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ dst_tmp += dst_stride;
+
+ dst0 = dst1;
+ dst1 = dst2;
+ dst2 = dst3;
+ dst3 = dst4;
+ dst4 = dst5;
+ dst5 = dst6;
+ dst6 = dst7;
+ }
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 8);
+}
+
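+/* 2D 8-tap luma filter, 12-pixel width: the left 8 columns follow the
+ * 8-column path above, then the remaining 4 columns are filtered with the
+ * paired-row masks loaded from ff_hevc_mask_arr + 16. */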
+static void hevc_hv_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i filter_vec;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
+ __m128i dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
+ __m128i dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
+
+ src -= src_stride_3x + 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+
+ /* row 0 row 1 row 2 row 3 */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, src0,
+ mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1,
+ mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2,
+ mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, src3,
+ mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
+ dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
+ dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
+
+ /* row 4 row 5 row 6 */
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, src4,
+ mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, src5,
+ mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, src6,
+ mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+ dst4, dst4);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+ dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
+ dst5, dst5);
+ dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
+ dst6, dst6);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ src_tmp += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
+ dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+ dst10_r, dst32_r, dst54_r, dst76_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+ dst10_l, dst32_l, dst54_l, dst76_l);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, dst54_l, filt_h2,
+ dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, dst76_l,
+ filt_h3, dst0_r, dst0_l);
+ dst0_r = __lsx_vsrai_w(dst0_r, 6);
+ dst0_l = __lsx_vsrai_w(dst0_l, 6);
+
+ dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ dst_tmp += dst_stride;
+
+ dst0 = dst1;
+ dst1 = dst2;
+ dst2 = dst3;
+ dst3 = dst4;
+ dst4 = dst5;
+ dst5 = dst6;
+ dst6 = dst7;
+ }
+ src += 8;
+ dst += 8;
+
+ mask4 = __lsx_vld(ff_hevc_mask_arr, 16);
+ DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask4, 6);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask4, src3, src0, mask5, src3, src0,
+ mask6, src3, src0, mask7, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask4, src4, src1, mask5, src4, src1,
+ mask6, src4, src1, mask7, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask4, src5, src2, mask5, src5, src2,
+ mask6, src5, src2, mask7, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask4, src6, src3, mask5, src6, src3,
+ mask6, src6, src3, mask7, vec12, vec13, vec14, vec15);
+ dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
+ dst30, dst30);
+ dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
+ dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
+ dst41, dst41);
+ dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
+ dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
+ dst52, dst52);
+ dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
+ dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
+ dst63, dst63);
+ dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
+ dst32_r = __lsx_vilvl_h(dst63, dst52);
+ dst65_r = __lsx_vilvh_h(dst63, dst52);
+
+ dst66 = __lsx_vreplvei_d(dst63, 1);
+
+ for (loop_cnt = height >> 2; loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask4, src9, src7, mask5, src9,
+ src7, mask6, src9, src7, mask7, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask4, src10, src8, mask5, src10,
+ src8, mask6, src10, src8, mask7, vec4, vec5, vec6, vec7);
+ dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
+ dst97, dst97);
+ dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
+ dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
+ filt2, dst108, dst108);
+ dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
+ dst109_r = __lsx_vilvh_h(dst108, dst97);
+ dst66 = __lsx_vreplvei_d(dst97, 1);
+ dst98_r = __lsx_vilvl_h(dst66, dst108);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+ filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
+ filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
+ filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
+ filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r,
+ dst0_r, dst2_r);
+ __lsx_vstelm_d(dst0_r, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst0_r, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2_r, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2_r, dst, 0, 1);
+ dst += dst_stride;
+
+ dst10_r = dst54_r;
+ dst32_r = dst76_r;
+ dst54_r = dst98_r;
+ dst21_r = dst65_r;
+ dst43_r = dst87_r;
+ dst65_r = dst109_r;
+ dst66 = __lsx_vreplvei_d(dst108, 1);
+ }
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 64);
+}
+
+static void hevc_hz_4t_32w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2;
+ __m128i filt0, filt1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, mask2, mask3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3;
+
+ src -= 1;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 10);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src2 = __lsx_vld(src, 24);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src2, src2, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask1, src2, src2, mask1,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_vt_4t_16w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src10_r, src32_r, src21_r, src43_r;
+ __m128i src10_l, src32_l, src21_l, src43_l;
+ __m128i dst0_r, dst1_r, dst0_l, dst1_l;
+ __m128i filt0, filt1;
+
+ src -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src, 0);
+ src4 = __lsx_vldx(src, src_stride);
+ src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+ src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ dst += dst_stride;
+
+ src5 = __lsx_vld(src, 0);
+ src2 = __lsx_vldx(src, src_stride);
+ src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+ src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_vt_4t_24w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10, src11;
+ __m128i src10_r, src32_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src21_l, src43_l;
+ __m128i dst0_l, dst1_l;
+ __m128i filt0, filt1;
+
+ src -= src_stride;
+ _src = src + 16;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ src6 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+ src += src_stride_3x;
+ _src += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+ src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+ dst2_r, dst3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
+ src109_r, filt1, dst2_r, dst3_r);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ __lsx_vst(dst2_r, dst, 32);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ __lsx_vst(dst3_r, dst, 32);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+ DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, src10_l,
+ filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, filt1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+ dst2_r, dst3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r, src87_r,
+ filt1, dst2_r, dst3_r);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ __lsx_vst(dst2_r, dst, 32);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ __lsx_vst(dst3_r, dst, 32);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_vt_4t_32w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10, src11;
+ __m128i src10_r, src32_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src76_l, src98_l;
+ __m128i src21_l, src43_l, src87_l, src109_l;
+ __m128i dst0_l, dst1_l, dst2_l, dst3_l;
+ __m128i filt0, filt1;
+
+ src -= src_stride;
+ _src = src + 16;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ src6 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+ src += src_stride_3x;
+ _src += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+ DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+ src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
+ filt0, src87_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst2_l, src98_l,
+ filt1, dst3_r, src109_r, filt1, dst3_l, src109_l, filt1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ __lsx_vst(dst2_r, dst, 32);
+ __lsx_vst(dst2_l, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ __lsx_vst(dst3_r, dst, 32);
+ __lsx_vst(dst3_l, dst, 48);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+ DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvh_b, src11, src10, src8, src11, src76_l, src87_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+ src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src98_l, filt0, src109_r,
+ filt0, src109_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst2_l, src76_l,
+ filt1, dst3_r, src87_r, filt1, dst3_l, src87_l, filt1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ __lsx_vst(dst2_r, dst, 32);
+ __lsx_vst(dst2_l, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ __lsx_vst(dst3_r, dst, 32);
+ __lsx_vst(dst3_l, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_hv_4t_8x2_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i src0, src1, src2, src3, src4;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src4 = __lsx_vldx(src, src_stride_4x);
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst1_r, dst + dst_stride, 0);
+}
+
+static void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y, int32_t width8mult)
+{
+ int32_t cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+ src += src_stride_4x;
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += (8 - src_stride_4x);
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+ dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+ dst0_r, dst1_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst2_l, dst2_r, dst3_l, dst3_r,
+ dst2_r, dst3_r);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride_x);
+ __lsx_vstx(dst2_r, dst, dst_stride_2x);
+ __lsx_vstx(dst3_r, dst, dst_stride_3x);
+ dst += 8;
+ }
+}
+
+static void hevc_hv_4t_8x6_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+ __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+ __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+ __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+ src += src_stride_4x;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+ mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+ mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
+ mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
+ mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
+ DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1,
+ vec16, vec17);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0,
+ vec14, filt0, dst4, dst5, dst6, dst7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
+ vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
+ dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
+ dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_r, dst21_r, dst32_r, dst43_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_l, dst21_l, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_r, dst65_r, dst76_r, dst87_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_l, dst65_l, dst76_l, dst87_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+ filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+ filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+ dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, dst0_r,
+ dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, dst2_r,
+ dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6, dst4_r,
+ dst4_l, dst5_r, dst5_l);
+
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+ dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride_2x);
+ dst += dst_stride_2x;
+ __lsx_vst(dst2_r, dst, 0);
+ __lsx_vstx(dst3_r, dst, dst_stride_2x);
+ dst += dst_stride_2x;
+ __lsx_vst(dst4_r, dst, 0);
+ __lsx_vstx(dst5_r, dst, dst_stride_2x);
+}
+
+static void hevc_hv_4t_8multx4mult_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height,
+ int32_t width8mult)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ for (loop_cnt = height >> 2; loop_cnt--;) {
+ src3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+ filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
+ dst4, dst5, dst6);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+ dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+ dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+ dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
+ __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
+ __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
+ dst_tmp += dst_stride_2x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dst2 = dst6;
+ }
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void hevc_hv_4t_8w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+
+ if (2 == height) {
+ hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y);
+ } else if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, 1);
+ } else if (6 == height) {
+ hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y);
+ } else if (0 == (height & 0x03)) {
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 1);
+ }
+}
+
+static void hevc_hv_4t_12w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, dst0;
+ __m128i dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
+ __m128i dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+ filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
+ dst4, dst5, dst6);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+ dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
+ __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
+ __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
+ dst_tmp += dst_stride_2x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dst2 = dst6;
+ }
+
+ src += 8;
+ dst += 8;
+
+ mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
+ mask3 = __lsx_vaddi_bu(mask2, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src += src_stride_3x;
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
+ dst10, dst21);
+ dst10_r = __lsx_vilvl_h(dst21, dst10);
+ dst21_r = __lsx_vilvh_h(dst21, dst10);
+ dst22 = __lsx_vreplvei_d(dst21, 1);
+
+ for (loop_cnt = 2; loop_cnt--;) {
+ src3 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
+ src6 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP2_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src8, src4, mask2, src8, src4, mask3,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src10, src6, mask2, src10, src6, mask3,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst73, dst84, dst95, dst106);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
+ filt1, dst95, vec5, filt1, dst106, vec7, filt1, dst73,
+ dst84, dst95, dst106);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst73, dst22, dst84, dst73, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
+ DUP2_ARG2(__lsx_vilvl_h, dst95, dst84, dst106, dst95, dst54_r, dst65_r);
+ dst109_r = __lsx_vilvh_h(dst106, dst95);
+ dst22 = __lsx_vreplvei_d(dst73, 1);
+ dst76_r = __lsx_vilvl_h(dst22, dst106);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+ filt_h0, dst43_r, filt_h0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
+ filt_h0, dst87_r, filt_h0, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vdp2add_w_h, tmp0, dst32_r, filt_h1, tmp1, dst43_r,
+ filt_h1, tmp2, dst54_r, filt_h1, tmp3, dst65_r, filt_h1,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vdp2add_w_h, tmp4, dst76_r, filt_h1, tmp5, dst87_r,
+ filt_h1, tmp6, dst98_r, filt_h1, tmp7, dst109_r, filt_h1,
+ tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vsrai_w, tmp0, 6, tmp1, 6, tmp2, 6, tmp3, 6,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsrai_w, tmp4, 6, tmp5, 6, tmp6, 6, tmp7, 6,
+ tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4,
+ tmp7, tmp6, tmp0, tmp1, tmp2, tmp3);
+
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp3, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp3, dst, 0, 1);
+ dst += dst_stride;
+
+ dst10_r = dst98_r;
+ dst21_r = dst109_r;
+ dst22 = __lsx_vreplvei_d(dst106, 1);
+ }
+}
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+ if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, 2);
+ } else {
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 2);
+ }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 3);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 4);
+}
+
+#define MC_COPY(WIDTH) \
+void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_lsx(int16_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ hevc_copy_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, height); \
+}
+
+MC_COPY(4);
+MC_COPY(6);
+MC_COPY(8);
+MC_COPY(12);
+MC_COPY(16);
+MC_COPY(24);
+MC_COPY(32);
+MC_COPY(48);
+MC_COPY(64);
+
+#undef MC_COPY
+
+#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
+ \
+ hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, \
+ MAX_PB_SIZE, filter, height); \
+}
+
+MC(qpel, h, 4, 8, hz, mx);
+MC(qpel, h, 8, 8, hz, mx);
+MC(qpel, h, 12, 8, hz, mx);
+MC(qpel, h, 16, 8, hz, mx);
+MC(qpel, h, 24, 8, hz, mx);
+MC(qpel, h, 32, 8, hz, mx);
+MC(qpel, h, 48, 8, hz, mx);
+MC(qpel, h, 64, 8, hz, mx);
+
+MC(qpel, v, 4, 8, vt, my);
+MC(qpel, v, 8, 8, vt, my);
+MC(qpel, v, 12, 8, vt, my);
+MC(qpel, v, 16, 8, vt, my);
+MC(qpel, v, 24, 8, vt, my);
+MC(qpel, v, 32, 8, vt, my);
+MC(qpel, v, 48, 8, vt, my);
+MC(qpel, v, 64, 8, vt, my);
+
+MC(epel, h, 32, 4, hz, mx);
+
+MC(epel, v, 16, 4, vt, my);
+MC(epel, v, 24, 4, vt, my);
+MC(epel, v, 32, 4, vt, my);
+
+#undef MC
+
+#define MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_lsx(int16_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ \
+ hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, \
+ filter_x, filter_y, height); \
+}
+
+MC_HV(qpel, 4, 8);
+MC_HV(qpel, 8, 8);
+MC_HV(qpel, 12, 8);
+MC_HV(qpel, 16, 8);
+MC_HV(qpel, 24, 8);
+MC_HV(qpel, 32, 8);
+MC_HV(qpel, 48, 8);
+MC_HV(qpel, 64, 8);
+
+MC_HV(epel, 8, 4);
+MC_HV(epel, 12, 4);
+MC_HV(epel, 16, 4);
+MC_HV(epel, 24, 4);
+MC_HV(epel, 32, 4);
+
+#undef MC_HV
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
new file mode 100644
index 0000000000..3259e03f13
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LSX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+MC(pel, pixels, 4);
+MC(pel, pixels, 6);
+MC(pel, pixels, 8);
+MC(pel, pixels, 12);
+MC(pel, pixels, 16);
+MC(pel, pixels, 24);
+MC(pel, pixels, 32);
+MC(pel, pixels, 48);
+MC(pel, pixels, 64);
+
+MC(qpel, h, 4);
+MC(qpel, h, 8);
+MC(qpel, h, 12);
+MC(qpel, h, 16);
+MC(qpel, h, 24);
+MC(qpel, h, 32);
+MC(qpel, h, 48);
+MC(qpel, h, 64);
+
+MC(qpel, v, 4);
+MC(qpel, v, 8);
+MC(qpel, v, 12);
+MC(qpel, v, 16);
+MC(qpel, v, 24);
+MC(qpel, v, 32);
+MC(qpel, v, 48);
+MC(qpel, v, 64);
+
+MC(qpel, hv, 4);
+MC(qpel, hv, 8);
+MC(qpel, hv, 12);
+MC(qpel, hv, 16);
+MC(qpel, hv, 24);
+MC(qpel, hv, 32);
+MC(qpel, hv, 48);
+MC(qpel, hv, 64);
+
+MC(epel, h, 32);
+
+MC(epel, v, 16);
+MC(epel, v, 24);
+MC(epel, v, 32);
+
+MC(epel, hv, 8);
+MC(epel, hv, 12);
+MC(epel, hv, 16);
+MC(epel, hv, 24);
+MC(epel, hv, 32);
+
+#undef MC
+
+#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1
* [FFmpeg-devel] [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
2022-02-14 8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX Hao Chen
@ 2022-02-14 8:25 ` Hao Chen
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi " Hao Chen
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Hao Chen @ 2022-02-14 8:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lu Wang
From: Lu Wang <wanglu@loongson.cn>
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 110fps
after : 124fps
---
libavcodec/loongarch/Makefile | 2 +
libavcodec/loongarch/hevc_idct_lsx.c | 842 ++++++
libavcodec/loongarch/hevc_lpf_sao_lsx.c | 2485 +++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 19 +
libavcodec/loongarch/hevcdsp_lsx.h | 26 +
5 files changed, 3374 insertions(+)
create mode 100644 libavcodec/loongarch/hevc_idct_lsx.c
create mode 100644 libavcodec/loongarch/hevc_lpf_sao_lsx.c
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 9f416d0c6e..cfc8e3aaff 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -26,3 +26,5 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
loongarch/vp9_lpf_lsx.o \
loongarch/vp9_idct_lsx.o
LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
+ loongarch/hevc_idct_lsx.o \
+ loongarch/hevc_lpf_sao_lsx.o
diff --git a/libavcodec/loongarch/hevc_idct_lsx.c b/libavcodec/loongarch/hevc_idct_lsx.c
new file mode 100644
index 0000000000..2193b27546
--- /dev/null
+++ b/libavcodec/loongarch/hevc_idct_lsx.c
@@ -0,0 +1,842 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
+ 64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
+};
+
+static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
+ 64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
+ 64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
+ 64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
+ 64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
+};
+
+static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
+ 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+ 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+ 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+ 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+ 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+ 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+ 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+ 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+ 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+ 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+ 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+ 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+ 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+ 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+ 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+};
+
+static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
+ 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
+ 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
+ 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
+ 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
+};
+
+static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
+ 89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
+};
+
+#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \
+ sum0, sum1, sum2, sum3, shift) \
+{ \
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5; \
+ __m128i cnst64 = __lsx_vldi(0x0840); \
+ __m128i cnst83 = __lsx_vldi(0x0853); \
+ __m128i cnst36 = __lsx_vldi(0x0824); \
+ \
+ vec0 = __lsx_vdp2_w_h(in_r0, cnst64); \
+ vec1 = __lsx_vdp2_w_h(in_l0, cnst83); \
+ vec2 = __lsx_vdp2_w_h(in_r1, cnst64); \
+ vec3 = __lsx_vdp2_w_h(in_l1, cnst36); \
+ vec4 = __lsx_vdp2_w_h(in_l0, cnst36); \
+ vec5 = __lsx_vdp2_w_h(in_l1, cnst83); \
+ \
+ sum0 = __lsx_vadd_w(vec0, vec2); \
+ sum1 = __lsx_vsub_w(vec0, vec2); \
+ vec1 = __lsx_vadd_w(vec1, vec3); \
+ vec4 = __lsx_vsub_w(vec4, vec5); \
+ sum2 = __lsx_vsub_w(sum1, vec4); \
+ sum3 = __lsx_vsub_w(sum0, vec1); \
+ sum0 = __lsx_vadd_w(sum0, vec1); \
+ sum1 = __lsx_vadd_w(sum1, vec4); \
+ \
+ sum0 = __lsx_vsrari_w(sum0, shift); \
+ sum1 = __lsx_vsrari_w(sum1, shift); \
+ sum2 = __lsx_vsrari_w(sum2, shift); \
+ sum3 = __lsx_vsrari_w(sum3, shift); \
+ sum0 = __lsx_vsat_w(sum0, 15); \
+ sum1 = __lsx_vsat_w(sum1, 15); \
+ sum2 = __lsx_vsat_w(sum2, 15); \
+ sum3 = __lsx_vsat_w(sum3, 15); \
+}
+
+#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \
+{ \
+ __m128i src0_r, src1_r, src2_r, src3_r; \
+ __m128i src0_l, src1_l, src2_l, src3_l; \
+ __m128i filter0, filter1, filter2, filter3; \
+ __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \
+ __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \
+ __m128i sum0_r, sum1_r, sum2_r, sum3_r; \
+ __m128i sum0_l, sum1_l, sum2_l, sum3_l; \
+ \
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7, \
+ src0_r, src1_r, src2_r, src3_r); \
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7, \
+ src0_l, src1_l, src2_l, src3_l); \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8, \
+ filter, 12, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
+ temp1_r, temp1_l); \
+ \
+ LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+ sum1_l, sum1_r); \
+ sum2_r = sum1_r; \
+ sum2_l = sum1_l; \
+ sum3_r = sum0_r; \
+ sum3_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
+ src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
+ temp3_r, temp3_l); \
+ temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
+ temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
+ sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
+ sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
+ sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
+ \
+ in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
+ in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
+ src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
+ temp5_r, temp5_l); \
+ temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
+ temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
+ sum1_r = __lsx_vadd_w(sum1_r, temp4_r); \
+ sum1_l = __lsx_vadd_w(sum1_l, temp4_l); \
+ sum2_r = __lsx_vsub_w(sum2_r, temp4_r); \
+ sum2_l = __lsx_vsub_w(sum2_l, temp4_l); \
+ \
+ in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
+ in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24, \
+ filter, 28, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
+ temp1_r, temp1_l); \
+ \
+ LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+ sum1_l, sum1_r); \
+ sum2_r = sum1_r; \
+ sum2_l = sum1_l; \
+ sum3_r = sum0_r; \
+ sum3_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
+ src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
+ temp3_r, temp3_l); \
+ temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
+ temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
+ sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
+ sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
+ sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
+ \
+ in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
+ in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
+ src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
+ temp5_r, temp5_l); \
+ temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
+ temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp4_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp4_l); \
+ sum2_r = __lsx_vadd_w(sum2_r, temp4_r); \
+ sum2_l = __lsx_vadd_w(sum2_l, temp4_l); \
+ \
+ in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
+ in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
+}
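
HEVC_IDCT8x8_COL is a table-driven vectorization (through the implicit `filter` pointer) of the usual 8-point even/odd decomposition. The scalar reference below only illustrates the underlying math; the exact layout of the constant table is not shown in this hunk, so treat it purely as an illustration, with av_clip_int16() again assumed from libavutil/common.h:

    /* Standard HEVC 8-point inverse partial butterfly, scalar reference. */
    static void idct8_col_ref(const int16_t x[8], int16_t out[8], int shift)
    {
        int ee0 = 64 * (x[0] + x[4]);
        int ee1 = 64 * (x[0] - x[4]);
        int eo0 = 83 * x[2] + 36 * x[6];
        int eo1 = 36 * x[2] - 83 * x[6];
        int e[4] = { ee0 + eo0, ee1 + eo1, ee1 - eo1, ee0 - eo0 };
        int o[4] = {
            89 * x[1] + 75 * x[3] + 50 * x[5] + 18 * x[7],
            75 * x[1] - 18 * x[3] - 89 * x[5] - 50 * x[7],
            50 * x[1] - 89 * x[3] + 18 * x[5] + 75 * x[7],
            18 * x[1] - 50 * x[3] + 75 * x[5] - 89 * x[7],
        };
        int rnd = 1 << (shift - 1);

        for (int k = 0; k < 4; k++) {
            out[k]     = av_clip_int16((e[k] + o[k] + rnd) >> shift);
            out[7 - k] = av_clip_int16((e[k] - o[k] + rnd) >> shift);
        }
    }
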
+
+#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \
+ src4_r, src5_r, src6_r, src7_r, \
+ src0_l, src1_l, src2_l, src3_l, \
+ src4_l, src5_l, src6_l, src7_l, shift) \
+{ \
+ int16_t *ptr0, *ptr1; \
+ __m128i dst0, dst1; \
+ __m128i filter0, filter1, filter2, filter3; \
+ __m128i temp0_r, temp1_r, temp0_l, temp1_l; \
+ __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \
+ __m128i sum3_l, res0_r, res1_r, res0_l, res1_l; \
+ \
+ ptr0 = (buf_ptr + 112); \
+ ptr1 = (buf_ptr + 128); \
+ k = -1; \
+ \
+ for (j = 0; j < 4; j++) \
+ { \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16, \
+ filter, 20, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src4_r, filter2, src4_l, filter2, sum0_r, sum0_l, \
+ sum2_r, sum2_l); \
+ DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2, \
+ sum3_r, sum3_l); \
+ DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l, \
+ src1_l, filter1, sum2_r, src5_r, filter3, sum2_l, \
+ src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \
+ DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l, \
+ src6_l, filter3, sum3_r, sum3_l); \
+ \
+ sum1_r = sum0_r; \
+ sum1_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24, \
+ filter, 28, filter0, filter1, filter2, filter3); \
+ filter += 16; \
+ DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0, \
+ temp0_r, temp0_l); \
+ DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l, \
+ src6_l, filter2, sum2_r, sum2_l); \
+ DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2, \
+ temp1_r, temp1_l); \
+ \
+ sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
+ sum3_r = __lsx_vsub_w(temp1_r, sum3_r); \
+ sum3_l = __lsx_vsub_w(temp1_l, sum3_l); \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1, \
+ temp0_r, temp0_l); \
+ DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l, \
+ src7_l, filter3, sum3_r, src4_r, filter3, sum3_l, \
+ src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \
+ \
+ sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
+ \
+ LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \
+ res1_l, res1_r); \
+ dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
+ dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
+ __lsx_vst(dst0, buf_ptr, 0); \
+ __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0); \
+ \
+ LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \
+ res1_l, res1_r); \
+ \
+ dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
+ dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
+ __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
+ __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
+ \
+ k *= -1; \
+ buf_ptr += 16; \
+ } \
+}
+
+#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \
+{ \
+ tmp0_r = __lsx_vld(input + load_idx * 8, 0); \
+ tmp0_l = __lsx_vld(input + load_idx * 8, 16); \
+ tmp1_r = sum0_r; \
+ tmp1_l = sum0_l; \
+ sum0_r = __lsx_vadd_w(sum0_r, tmp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, tmp0_l); \
+ __lsx_vst(sum0_r, (input + load_idx * 8), 0); \
+ __lsx_vst(sum0_l, (input + load_idx * 8), 16); \
+ tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r); \
+ tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l); \
+ __lsx_vst(tmp1_r, (input + store_idx * 8), 0); \
+ __lsx_vst(tmp1_l, (input + store_idx * 8), 16); \
+}
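
HEVC_EVEN16_CALC is an in-place add/sub butterfly on the 32-bit scratch buffer: the vector pair stored at load_idx is reloaded, the running even-part sum plus that value overwrites the slot at load_idx, and the difference goes to store_idx. A scalar sketch of one invocation (names are illustrative only, not part of the patch):

    /* Scalar sketch of HEVC_EVEN16_CALC: "input" is the int32 scratch buffer,
     * "sum" holds the 8 running even-part values of one vector register pair. */
    static void even16_calc_scalar(int32_t *input, const int32_t sum[8],
                                   int load_idx, int store_idx)
    {
        for (int i = 0; i < 8; i++) {
            int32_t loaded = input[load_idx * 8 + i];
            input[load_idx  * 8 + i] = sum[i] + loaded;  /* sum + stored value */
            input[store_idx * 8 + i] = sum[i] - loaded;  /* sum - stored value */
        }
    }
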
+
+#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \
+ res0, res1, res2, res3, shift) \
+{ \
+ __m128i vec0, vec1, vec2, vec3; \
+ __m128i cnst74 = __lsx_vldi(0x84a); \
+ __m128i cnst55 = __lsx_vldi(0x837); \
+ __m128i cnst29 = __lsx_vldi(0x81d); \
+ \
+ vec0 = __lsx_vadd_w(in_r0, in_r1); \
+ vec2 = __lsx_vsub_w(in_r0, in_l1); \
+ res0 = __lsx_vmul_w(vec0, cnst29); \
+ res1 = __lsx_vmul_w(vec2, cnst55); \
+ res2 = __lsx_vsub_w(in_r0, in_r1); \
+ vec1 = __lsx_vadd_w(in_r1, in_l1); \
+ res2 = __lsx_vadd_w(res2, in_l1); \
+ vec3 = __lsx_vmul_w(in_l0, cnst74); \
+ res3 = __lsx_vmul_w(vec0, cnst55); \
+ \
+ res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55)); \
+ res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29)); \
+ res2 = __lsx_vmul_w(res2, cnst74); \
+ res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29)); \
+ \
+ res0 = __lsx_vadd_w(res0, vec3); \
+ res1 = __lsx_vadd_w(res1, vec3); \
+ res3 = __lsx_vsub_w(res3, vec3); \
+ \
+ res0 = __lsx_vsrari_w(res0, shift); \
+ res1 = __lsx_vsrari_w(res1, shift); \
+ res2 = __lsx_vsrari_w(res2, shift); \
+ res3 = __lsx_vsrari_w(res3, shift); \
+ res0 = __lsx_vsat_w(res0, 15); \
+ res1 = __lsx_vsat_w(res1, 15); \
+ res2 = __lsx_vsat_w(res2, 15); \
+ res3 = __lsx_vsat_w(res3, 15); \
+}
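
HEVC_IDCT_LUMA4x4_COL implements the alternate 4x4 luma transform (constants 29/55/74). A scalar model of one column, again assuming av_clip_int16() from libavutil/common.h and x0..x3 mapped to in_r0, in_l0, in_r1, in_l1 (illustration only):

    /* Scalar sketch of one column of HEVC_IDCT_LUMA4x4_COL. */
    static void idct_luma4x4_col_scalar(int x0, int x1, int x2, int x3,
                                        int out[4], int shift)
    {
        int c0 = x0 + x2;
        int c1 = x2 + x3;
        int c2 = x0 - x3;
        int c3 = 74 * x1;
        int r0 = 29 * c0 + 55 * c1 + c3;
        int r1 = 55 * c2 - 29 * c1 + c3;
        int r2 = 74 * (x0 - x2 + x3);
        int r3 = 55 * c0 + 29 * c2 - c3;
        int rnd = 1 << (shift - 1);

        out[0] = av_clip_int16((r0 + rnd) >> shift);
        out[1] = av_clip_int16((r1 + rnd) >> shift);
        out[2] = av_clip_int16((r2 + rnd) >> shift);
        out[3] = av_clip_int16((r3 + rnd) >> shift);
    }
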
+
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit)
+{
+ __m128i in0, in1;
+ __m128i in_r0, in_l0, in_r1, in_l1;
+ __m128i sum0, sum1, sum2, sum3;
+ __m128i zero = __lsx_vldi(0x00);
+
+ in0 = __lsx_vld(coeffs, 0);
+ in1 = __lsx_vld(coeffs, 16);
+ in_r0 = __lsx_vilvl_h(zero, in0);
+ in_l0 = __lsx_vilvh_h(zero, in0);
+ in_r1 = __lsx_vilvl_h(zero, in1);
+ in_l1 = __lsx_vilvh_h(zero, in1);
+
+ HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
+ LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
+ HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
+
+ /* Pack and transpose */
+ in0 = __lsx_vpickev_h(sum2, sum0);
+ in1 = __lsx_vpickev_h(sum3, sum1);
+ sum0 = __lsx_vilvl_h(in1, in0);
+ sum1 = __lsx_vilvh_h(in1, in0);
+ in0 = __lsx_vilvl_w(sum1, sum0);
+ in1 = __lsx_vilvh_w(sum1, sum0);
+
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 16);
+}
+
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit)
+{
+ const int16_t *filter = &gt8x8_cnst[0];
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32,
+ coeffs, 48, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96,
+ coeffs, 112, in4, in5, in6, in7);
+ HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 16);
+ __lsx_vst(in2, coeffs, 32);
+ __lsx_vst(in3, coeffs, 48);
+ __lsx_vst(in4, coeffs, 64);
+ __lsx_vst(in5, coeffs, 80);
+ __lsx_vst(in6, coeffs, 96);
+ __lsx_vst(in7, coeffs, 112);
+}
+
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit)
+{
+ int16_t i, j, k;
+ int16_t buf[256];
+ int16_t *buf_ptr = &buf[0];
+ int16_t *src = coeffs;
+ const int16_t *filter = &gt16x16_cnst[0];
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+ __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+
+ for (i = 2; i--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vld, src, 256, src, 288, src, 320, src, 352,
+ in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, src, 384, src, 416, src, 448, src, 480,
+ in12, in13, in14, in15);
+
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_l, src1_l, src2_l, src3_l);
+ DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_l, src5_l, src6_l, src7_l);
+
+ HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+ src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+ src4_l, src5_l, src6_l, src7_l, 7);
+
+ src += 8;
+ buf_ptr = (&buf[0] + 8);
+ filter = &gt16x16_cnst[0];
+ }
+
+ src = &buf[0];
+ buf_ptr = coeffs;
+ filter = &gt16x16_cnst[0];
+
+ for (i = 2; i--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ in0, in8, in1, in9);
+ DUP4_ARG2(__lsx_vld, src, 64, src, 80, src, 96, src, 112,
+ in2, in10, in3, in11);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 144, src, 160, src, 176,
+ in4, in12, in5, in13);
+ DUP4_ARG2(__lsx_vld, src, 192, src, 208, src, 224, src, 240,
+ in6, in14, in7, in15);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_l, src1_l, src2_l, src3_l);
+ DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_l, src5_l, src6_l, src7_l);
+ HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+ src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+ src4_l, src5_l, src6_l, src7_l, 12);
+
+ src += 128;
+ buf_ptr = coeffs + 8;
+ filter = &gt16x16_cnst[0];
+ }
+
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ __lsx_vst(vec0, coeffs, 0);
+ __lsx_vst(vec1, coeffs, 32);
+ __lsx_vst(vec2, coeffs, 64);
+ __lsx_vst(vec3, coeffs, 96);
+ __lsx_vst(vec4, coeffs, 128);
+ __lsx_vst(vec5, coeffs, 160);
+ __lsx_vst(vec6, coeffs, 192);
+ __lsx_vst(vec7, coeffs, 224);
+
+ src = coeffs + 8;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ src = coeffs + 128;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in12, in13, in14, in15);
+
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ src = coeffs + 8;
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+
+ src = coeffs + 136;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+}
+
+static void hevc_idct_8x32_column_lsx(int16_t *coeffs, int32_t buf_pitch,
+ uint8_t round)
+{
+ uint8_t i;
+ int32_t buf_pitch_2 = buf_pitch << 1;
+ int32_t buf_pitch_4 = buf_pitch << 2;
+ int32_t buf_pitch_8 = buf_pitch << 3;
+ int32_t buf_pitch_16 = buf_pitch << 4;
+
+ const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
+ const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
+ const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
+ const int16_t *filter_ptr3 = &gt8x8_cnst[0];
+ int16_t *src0 = (coeffs + buf_pitch);
+ int16_t *src1 = (coeffs + buf_pitch_2);
+ int16_t *src2 = (coeffs + buf_pitch_4);
+ int16_t *src3 = (coeffs);
+ int32_t tmp_buf[8 * 32 + 15];
+ int32_t *tmp_buf_ptr = tmp_buf + 15;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+ __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
+
+ /* Align pointer to 64 byte boundary */
+ tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+ /* process coeff 4, 12, 20, 28 */
+ in0 = __lsx_vld(src2, 0);
+ in1 = __lsx_vld(src2 + buf_pitch_8, 0);
+ in2 = __lsx_vld(src2 + buf_pitch_16, 0);
+ in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0);
+ in4 = __lsx_vld(src3, 0);
+ in5 = __lsx_vld(src3 + buf_pitch_8, 0);
+ in6 = __lsx_vld(src3 + buf_pitch_16, 0);
+ in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0);
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5,
+ src0_l, src1_l, src2_l, src3_l);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 4);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 0);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 16);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 8);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 32);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 48);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 20);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 64);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 80);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 24);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 28);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 96);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 112);
+
+ /* process coeff 0, 8, 16, 24 */
+ filter0 = __lsx_vldrepl_w(filter_ptr3, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr3, 4);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+ src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+ sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+ sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7);
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr3, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr3, 20);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+ src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+ sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+ sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6);
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5);
+
+ /* process coeff 2 6 10 14 18 22 26 30 */
+ in0 = __lsx_vld(src1, 0);
+ in1 = __lsx_vld(src1 + buf_pitch_4, 0);
+ in2 = __lsx_vld(src1 + buf_pitch_8, 0);
+ in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0);
+ in4 = __lsx_vld(src1 + buf_pitch_16, 0);
+ in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0);
+ in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0);
+ in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0);
+
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_l, src1_l, src2_l, src3_l);
+
+ /* loop for all columns of constants */
+ for (i = 0; i < 8; i++) {
+ /* processing single column of constants */
+ filter0 = __lsx_vldrepl_w(filter_ptr1, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr1, 4);
+ filter2 = __lsx_vldrepl_w(filter_ptr1, 8);
+ filter3 = __lsx_vldrepl_w(filter_ptr1, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+
+ tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0);
+ tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16);
+ tmp1_r = tmp0_r;
+ tmp1_l = tmp0_l;
+ tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+ tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+ tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+ tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+ __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0);
+ __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16);
+ __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0);
+ __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16);
+
+ filter_ptr1 += 8;
+ }
+
+ /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
+ in0 = __lsx_vld(src0, 0);
+ in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+ in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+ in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+ in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+ in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+ in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+ in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+ src0 += 16 * buf_pitch;
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_l, src1_l, src2_l, src3_l);
+ in0 = __lsx_vld(src0, 0);
+ in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+ in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+ in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+ in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+ in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+ in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+ in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src4_l, src5_l, src6_l, src7_l);
+
+ /* loop for all columns of filter constants */
+ for (i = 0; i < 16; i++) {
+ /* processing single column of constants */
+ filter0 = __lsx_vldrepl_w(filter_ptr0, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr0, 4);
+ filter2 = __lsx_vldrepl_w(filter_ptr0, 8);
+ filter3 = __lsx_vldrepl_w(filter_ptr0, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+ tmp1_r = sum0_r;
+ tmp1_l = sum0_l;
+
+ filter0 = __lsx_vldrepl_w(filter_ptr0, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr0, 20);
+ filter2 = __lsx_vldrepl_w(filter_ptr0, 24);
+ filter3 = __lsx_vldrepl_w(filter_ptr0, 28);
+ sum0_r = __lsx_vdp2_w_h(src4_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src4_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0);
+ tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16);
+ tmp1_r = tmp0_r;
+ tmp1_l = tmp0_l;
+ tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+ tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+ sum1_r = __lsx_vreplgr2vr_w(round);
+ tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r);
+ tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r);
+ in0 = __lsx_vpackev_d(tmp0_l, tmp0_r);
+ __lsx_vst(in0, (coeffs + i * buf_pitch), 0);
+ tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+ tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+ tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r);
+ tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r);
+ in0 = __lsx_vpackev_d(tmp1_l, tmp1_r);
+ __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0);
+
+ filter_ptr0 += 16;
+ }
+}
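
The 8x32 column routine builds the 16 even-half values of the 32-point transform in tmp_buf (coeff 0/8/16/24, then 4/12/20/28, then the 2-6-...-30 stage) and finally combines them with the odd-coefficient dot products: output row i receives even plus odd, row 31-i receives even minus odd, each rounded by `round` and saturated to 16 bits. A scalar sketch of that last recombination, where even[][] and odd[][] are hypothetical names for the two halves of the current 8-column slice:

    /* Scalar sketch of the final recombination, illustration only. */
    for (int i = 0; i < 16; i++) {
        for (int c = 0; c < 8; c++) {
            int32_t e   = even[i][c];
            int32_t o   = odd[i][c];
            int32_t rnd = 1 << (round - 1);

            coeffs[i * buf_pitch + c]        = av_clip_int16((e + o + rnd) >> round);
            coeffs[(31 - i) * buf_pitch + c] = av_clip_int16((e - o + rnd) >> round);
        }
    }
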
+
+static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
+{
+ uint8_t i;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (i = 0; i < 4; i++) {
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128,
+ coeffs, 192, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384,
+ coeffs, 448, in4, in5, in6, in7);
+ coeffs += 8;
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ __lsx_vst(in0, tmp_buf, 0);
+ __lsx_vst(in1, tmp_buf, 16);
+ __lsx_vst(in2, tmp_buf, 32);
+ __lsx_vst(in3, tmp_buf, 48);
+ __lsx_vst(in4, tmp_buf, 64);
+ __lsx_vst(in5, tmp_buf, 80);
+ __lsx_vst(in6, tmp_buf, 96);
+ __lsx_vst(in7, tmp_buf, 112);
+ tmp_buf += 64;
+ }
+}
+
+static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
+{
+ uint8_t i;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (i = 0; i < 4; i++) {
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32,
+ tmp_buf, 48, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96,
+ tmp_buf, 112, in4, in5, in6, in7);
+ tmp_buf += 64;
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 64);
+ __lsx_vst(in2, coeffs, 128);
+ __lsx_vst(in3, coeffs, 192);
+ __lsx_vst(in4, coeffs, 256);
+ __lsx_vst(in5, coeffs, 320);
+ __lsx_vst(in6, coeffs, 384);
+ __lsx_vst(in7, coeffs, 448);
+ coeffs += 8;
+ }
+}
+
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit)
+{
+ uint8_t row_cnt, col_cnt;
+ int16_t *src = coeffs;
+ int16_t tmp_buf[8 * 32 + 31];
+ int16_t *tmp_buf_ptr = tmp_buf + 31;
+ uint8_t round;
+ int32_t buf_pitch;
+
+ /* Align pointer to 64 byte boundary */
+ tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+ /* column transform */
+ round = 7;
+ buf_pitch = 32;
+ for (col_cnt = 0; col_cnt < 4; col_cnt++) {
+ /* process 8x32 blocks */
+ hevc_idct_8x32_column_lsx((coeffs + col_cnt * 8), buf_pitch, round);
+ }
+
+ /* row transform */
+ round = 12;
+ buf_pitch = 8;
+ for (row_cnt = 0; row_cnt < 4; row_cnt++) {
+ /* process 32x8 blocks */
+ src = (coeffs + 32 * 8 * row_cnt);
+
+ hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
+ hevc_idct_8x32_column_lsx(tmp_buf_ptr, buf_pitch, round);
+ hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
+ }
+}
diff --git a/libavcodec/loongarch/hevc_lpf_sao_lsx.c b/libavcodec/loongarch/hevc_lpf_sao_lsx.c
new file mode 100644
index 0000000000..fc10e8eda8
--- /dev/null
+++ b/libavcodec/loongarch/hevc_lpf_sao_lsx.c
@@ -0,0 +1,2485 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ uint8_t *p3 = src - stride_4x;
+ uint8_t *p2 = src - stride_3x;
+ uint8_t *p1 = src - stride_2x;
+ uint8_t *p0 = src - stride;
+ uint8_t *q0 = src;
+ uint8_t *q1 = src + stride;
+ uint8_t *q2 = src + stride_2x;
+ uint8_t *q3 = src + stride_3x;
+ uint8_t flag0, flag1;
+ int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
+ int32_t dp04, dq04, dp34, dq34, d04, d34;
+ int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+ int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i temp0, temp1;
+ __m128i temp2, tc_pos, tc_neg;
+ __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
+ __m128i zero = {0};
+ __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+ dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
+ dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
+ dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
+ dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
+ d00 = dp00 + dq00;
+ d30 = dp30 + dq30;
+ dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
+ dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
+ dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
+ dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
+ d04 = dp04 + dq04;
+ d34 = dp34 + dq34;
+
+ p_is_pcm0 = p_is_pcm[0];
+ p_is_pcm4 = p_is_pcm[1];
+ q_is_pcm0 = q_is_pcm[0];
+ q_is_pcm4 = q_is_pcm[1];
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+ DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
+ cmp3 = __lsx_vpackev_w(cmp1, cmp0);
+ cmp3 = __lsx_vseqi_w(cmp3, 0);
+
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
+ p3_src, p2_src, p1_src, p0_src);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
+ tc4 = tc[1];
+ tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
+ p0_src, p3_src, p2_src, p1_src, p0_src);
+ DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
+ q0_src, q1_src, q2_src, q3_src);
+ flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+ abs(p0[0] - q0[0]) < tc250;
+ flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+ abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
+ zero, q3_src, q0_src, q1_src, q2_src, q3_src);
+
+ flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+ abs(p0[4] - q0[4]) < tc254;
+ flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+ abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+ DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
+ cmp2 = __lsx_vpackev_w(cmp1, cmp0);
+ cmp2 = __lsx_vseqi_w(cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter */
+ tc_pos = __lsx_vslli_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
+ temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
+ p1_src, p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp0 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
+ temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
+ q1_src, q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+
+ /* pack results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst2 = __lsx_vpickev_b(dst5, dst4);
+
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, q1_src);
+
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
+
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
+ __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
+ /* strong filter ends */
+ } else if (flag0 == flag1) { /* weak only */
+ /* weak filter */
+ tc_neg = __lsx_vneg_h(tc_pos);
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src,
+ __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
+ q_is_pcm_vec));
+ DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
+ q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp0 = __lsx_vseqi_d(cmp0, 0);
+ p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp0 = __lsx_vseqi_d(cmp0, 0);
+ q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
+ tc_neg, tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
+ p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
+ q1_src, abs_delta0, dst1, dst2, dst3, dst4);
+ /* pack results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
+ dst2, dst3);
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
+ dst0, dst1);
+
+ p2 += stride;
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ /* weak filter ends */
+ } else { /* strong + weak */
+ /* strong filter */
+ tc_pos = __lsx_vslli_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
+ p1_src, p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
+ q1_src, q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+
+ /* pack strong results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst2 = __lsx_vpickev_b(dst5, dst4);
+ /* strong filter ends */
+
+ /* weak filter */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
+
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
+ q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
+ q0_src, abs_delta0, delta1, delta2, temp0, temp2);
+ /* weak filter ends */
+
+ /* pack weak results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, delta2);
+
+ /* select between weak or strong */
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
+
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, q1_src);
+
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
+
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
+ __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
+ }
+ }
+}
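
The weak-filter branches above compute the standard HEVC delta for the p0/q0 pair, 8 columns at a time: diff0 and diff1 form 9*(q0-p0) and 3*(q1-p1), __lsx_vsrari_h gives the +8 rounding shift, the 10*tc threshold gates the filtering, and __lsx_vclip_h bounds the correction to the ±tc range. Per pixel column this is roughly the following, with av_clip() assumed from libavutil/common.h (illustration only):

    /* Scalar sketch of the weak-filter delta used above, per pixel column. */
    static int weak_delta(int p1, int p0, int q0, int q1, int tc)
    {
        int delta = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;

        if (abs(delta) >= tc * 10)
            return 0;                       /* p0/q0 left untouched */
        return av_clip(delta, -tc, tc);     /* p0 += delta, q0 -= delta */
    }
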
+
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ uint8_t *p3 = src;
+ uint8_t *p2 = src + stride_3x;
+ uint8_t *p1 = src + stride_4x;
+ uint8_t *p0 = src + stride_4x + stride_3x;
+ uint8_t flag0, flag1;
+ int32_t dp00, dq00, dp30, dq30, d00, d30;
+ int32_t d0030, d0434;
+ int32_t dp04, dq04, dp34, dq34, d04, d34;
+ int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+ int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i cmp3;
+ __m128i temp0, temp1;
+ __m128i temp2;
+ __m128i tc_pos, tc_neg;
+ __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
+ __m128i zero = {0};
+ __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+ dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
+ dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
+ dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
+ dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
+ d00 = dp00 + dq00;
+ d30 = dp30 + dq30;
+ p_is_pcm0 = p_is_pcm[0];
+ q_is_pcm0 = q_is_pcm[0];
+
+ dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
+ dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
+ dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
+ dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
+ d04 = dp04 + dq04;
+ d34 = dp34 + dq34;
+ p_is_pcm4 = p_is_pcm[1];
+ q_is_pcm4 = q_is_pcm[1];
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
+ cmp3 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp3 = __lsx_vseqi_d(cmp3, 0);
+
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ src -= 4;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
+ src += stride_4x;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
+ src -= stride_4x;
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
+ tc4 = tc[1];
+ tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
+ DUP2_ARG1( __lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+ q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
+ q0_src, q1_src, q2_src, q3_src);
+
+ flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
+ abs(p3[-1] - p3[0]) < tc250;
+ flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
+ abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+ cmp0 = __lsx_vreplgr2vr_d(flag0);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
+ p0_src, p3_src, p2_src, p1_src, p0_src);
+
+ flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
+ abs(p1[-1] - p1[0]) < tc254;
+ flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
+ abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
+ q3_src, q0_src, q1_src, q2_src, q3_src);
+
+ cmp1 = __lsx_vreplgr2vr_d(flag1);
+ cmp2 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp2 = __lsx_vseqi_d(cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter */
+ tc_neg = __lsx_vneg_h(tc_pos);
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
+ p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
+ q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+ /* strong filter ends */
+ } else if (flag0 == flag1) { /* weak only */
+ /* weak filter */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ tmp = ((beta + (beta >> 1)) >> 3);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
+ !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
+ (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
+ p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
+ q1_src, abs_delta0, dst0, dst1, dst2, dst3);
+ /* weak filter ends */
+
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
+ cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
+ dst0, dst1, dst2, dst3);
+ DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
+
+ /* transpose */
+ dst4 = __lsx_vilvl_b(dst1, dst0);
+ dst5 = __lsx_vilvh_b(dst1, dst0);
+ dst0 = __lsx_vilvl_h(dst5, dst4);
+ dst1 = __lsx_vilvh_h(dst5, dst4);
+
+ src += 2;
+ __lsx_vstelm_w(dst0, src, 0, 0);
+ __lsx_vstelm_w(dst0, src + stride, 0, 1);
+ __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
+ __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
+ src += stride_4x;
+ __lsx_vstelm_w(dst1, src, 0, 0);
+ __lsx_vstelm_w(dst1, src + stride, 0, 1);
+ __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
+ __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
+ return;
+ } else { /* strong + weak */
+ /* strong filter */
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
+ p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
+ q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+ /* strong filter ends */
+
+ /* weak filter */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
+ !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
+ (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
+ q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
+ q0_src, abs_delta0, delta1, delta2, temp0, temp2);
+ /* weak filter ends */
+
+ /* select between weak or strong */
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
+ cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
+ dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
+ dst4, dst5);
+ }
+
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
+ p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
+ dst4, dst5);
+
+ /* pack results to 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
+ dst5, dst0, dst1, dst2, dst3);
+
+ /* transpose */
+ DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
+ DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
+
+ src += 1;
+ __lsx_vstelm_w(dst0, src, 0, 0);
+ __lsx_vstelm_h(dst2, src, 4, 0);
+ src += stride;
+ __lsx_vstelm_w(dst0, src, 0, 1);
+ __lsx_vstelm_h(dst2, src, 4, 2);
+ src += stride;
+
+ __lsx_vstelm_w(dst0, src, 0, 2);
+ __lsx_vstelm_h(dst2, src, 4, 4);
+ src += stride;
+ __lsx_vstelm_w(dst0, src, 0, 3);
+ __lsx_vstelm_h(dst2, src, 4, 6);
+ src += stride;
+
+ __lsx_vstelm_w(dst1, src, 0, 0);
+ __lsx_vstelm_h(dst3, src, 4, 0);
+ src += stride;
+ __lsx_vstelm_w(dst1, src, 0, 1);
+ __lsx_vstelm_h(dst3, src, 4, 2);
+ src += stride;
+
+ __lsx_vstelm_w(dst1, src, 0, 2);
+ __lsx_vstelm_h(dst3, src, 4, 4);
+ src += stride;
+ __lsx_vstelm_w(dst1, src, 0, 3);
+ __lsx_vstelm_h(dst3, src, 4, 6);
+ }
+}
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm)
+{
+ uint8_t *p1_ptr = src - (stride << 1);
+ uint8_t *p0_ptr = src - stride;
+ uint8_t *q0_ptr = src;
+ uint8_t *q1_ptr = src + stride;
+ __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i p1, p0, q0, q1;
+ __m128i tc_pos, tc_neg;
+ __m128i zero = {0};
+ __m128i temp0, temp1, delta;
+
+    if (tc[0] > 0 || tc[1] > 0) {
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ tc_neg = __lsx_vneg_h(tc_pos);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
+ p1, p0, q0, q1);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
+ p1, p0, q0, q1);
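+        /* delta = clip(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc) */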
+ DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
+ temp0 = __lsx_vslli_h(temp0, 2);
+ temp0 = __lsx_vadd_h(temp0, temp1);
+ delta = __lsx_vsrari_h(temp0, 3);
+ delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
+ temp0 = __lsx_vadd_h(p0, delta);
+ temp0 = __lsx_vclip255_h(temp0);
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);
+
+ temp1 = __lsx_vsub_h(q0, delta);
+ temp1 = __lsx_vclip255_h(temp1);
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);
+
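+        /* keep the original pixels in any 4-pixel half whose tc is <= 0 */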
+ tc_pos = __lsx_vslei_d(tc_pos, 0);
+ DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
+ temp0, temp1);
+ temp0 = __lsx_vpickev_b(temp1, temp0);
+ __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
+ __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
+ }
+}
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i p1, p0, q0, q1;
+ __m128i tc_pos, tc_neg;
+ __m128i zero = {0};
+ __m128i temp0, temp1, delta;
+
+    if (tc[0] > 0 || tc[1] > 0) {
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ src -= 2;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, src0, src1, src2, src3);
+ src += stride_4x;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, src4, src5, src6, src7);
+ src -= stride_4x;
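+        /* transpose so p1, p0, q0 and q1 each hold one column of 8 pixels */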
+ LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
+ p1, p0, q0, q1);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
+ p1, p0, q0, q1);
+
+ DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
+ temp0 = __lsx_vslli_h(temp0, 2);
+ temp0 = __lsx_vadd_h(temp0, temp1);
+ delta = __lsx_vsrari_h(temp0, 3);
+ delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
+
+ temp0 = __lsx_vadd_h(p0, delta);
+ temp1 = __lsx_vsub_h(q0, delta);
+ DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
+ DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
+ q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
+ q_is_pcm_vec, temp0, temp1);
+
+ tc_pos = __lsx_vslei_d(tc_pos, 0);
+ DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
+ temp0, temp1);
+ temp0 = __lsx_vpackev_b(temp1, temp0);
+
+ src += 1;
+ __lsx_vstelm_h(temp0, src, 0, 0);
+ __lsx_vstelm_h(temp0, src + stride, 0, 1);
+ __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
+ __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
+ src += stride_4x;
+ __lsx_vstelm_h(temp0, src, 0, 4);
+ __lsx_vstelm_h(temp0, src + stride, 0, 5);
+ __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
+ __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
+ src -= stride_4x;
+ }
+}
+
+static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
+ __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
+ __m128i const1 = __lsx_vldi(1);
+ __m128i zero = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src -= 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
+ src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
+ src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
+
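+        /* EO index = 2 + sign(c - left) + sign(c - right); edge_idx remaps it
+         * and the matching sao_offset is added with saturation */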
+ DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+ offset = __lsx_vaddi_bu(offset, 2);
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
+ sao_offset, sao_offset, offset, offset, offset);
+ src0 = __lsx_vxori_b(src0, 128);
+ dst0 = __lsx_vsadd_b(src0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
+ src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
+ src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
+
+ DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
+ cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
+ cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+ offset = __lsx_vaddi_bu(offset, 2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
+ offset, offset, offset);
+ src0 = __lsx_vxori_b(src0, 128);
+ dst0 = __lsx_vsadd_b(src0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
+static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
+ __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src -= 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
+ src_minus11, shuf1, src0, src1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
+ src_minus11, shuf2, src_plus10, src_plus11);
+ DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
+ src_plus10, src_minus10, src_plus10);
+ src0 = __lsx_vpickev_d(src1, src0);
+
+ DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+ offset = __lsx_vaddi_bu(offset, 2);
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ src0 = __lsx_vxori_b(src0, 128);
+ dst0 = __lsx_vsadd_b(src0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
+ shuf1, src0, src1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_plus10, src_plus11);
+ DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
+ src_plus10, src_minus10, src_plus10);
+ src0 = __lsx_vpickev_d(src1, src0);
+
+ DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
+ cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
+ cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+ offset = __lsx_vaddi_bu(offset, 2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ src0 = __lsx_vxori_b(src0, 128);
+ dst0 = __lsx_vsadd_b(src0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
+static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_ptr, *src_minus1;
+ int32_t v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i sao_offset;
+ __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+ __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+ __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+ __m128i diff_plus13;
+ __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
+ __m128i src_minus10, src_minus11, src_minus12, src_minus13;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+ __m128i src_zero0, src_zero1, src_zero2, src_zero3;
+ __m128i src_plus10, src_plus11, src_plus12, src_plus13;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
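+    /* process 4 rows per pass, 16 pixels at a time across the width */
+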
+ for (; height; height -= 4) {
+ src_minus1 = src - 1;
+ src_minus10 = __lsx_vld(src_minus1, 0);
+ DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
+ src_stride_2x, src_minus11, src_minus12);
+ src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src_minus1 += 16;
+ dst_ptr = dst + v_cnt;
+ src10 = __lsx_vld(src_minus1, 0);
+ DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
+ src_stride_2x, src11, src12);
+ src13 = __lsx_vldx(src_minus1, src_stride_3x);
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
+ src_minus11, shuf1, src12, src_minus12, shuf1, src13,
+ src_minus13, shuf1, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
+ src_minus11, shuf2, src12, src_minus12, shuf2, src13,
+ src_minus13, shuf2, src_plus10, src_plus11,
+ src_plus12, src_plus13);
+ DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+ cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+ cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+ cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+ cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0, offset_mask0,
+ offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
+ src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+ offset_mask1, src_zero2, offset_mask2, src_zero3,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+ src_minus12 = src12;
+ src_minus13 = src13;
+
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr + dst_stride, 0);
+ __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
+ __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
+ }
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+}
+
+static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i dst0;
+ __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src_minus11, src10, src11;
+ __m128i src_zero0, src_zero1;
+ __m128i offset;
+ __m128i offset_mask0, offset_mask1;
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
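+    /* 90 degree EO: each pixel is compared with the rows above and below */
+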
+ /* load in advance */
+ DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
+ src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+    DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
+static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i src_zero0, src_zero1, dst0;
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src_minus11, src10, src11;
+ __m128i offset_mask0, offset_mask1;
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
+static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *
+ sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig = src;
+ uint8_t *dst_orig = dst;
+ int32_t h_cnt, v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+ __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+ __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+ __m128i diff_plus13;
+ __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
+ __m128i src12, dst2, src13, dst3;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src = src_orig + v_cnt;
+ dst = dst_orig + v_cnt;
+
+ DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
+ src_minus10, src_minus11);
+
+ for (h_cnt = (height >> 2); h_cnt--;) {
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src, src_stride_3x, src, src_stride_4x,
+ src10, src11, src12, src13);
+ DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
+ src10, src10, src_minus11, src10, src11, cmp_minus10,
+ cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
+ src12, src13, cmp_minus12, cmp_plus12,
+ cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
+ src10, src10, src_minus11, src10, src11, cmp_minus10,
+ cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
+ src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+                      sao_offset, sao_offset, offset_mask0, offset_mask0,
+                      offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ src_minus10 = src12;
+ DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
+ src12, 128, src_minus11, src10, src11, src12);
+ DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
+ offset_mask1, src11, offset_mask2, src12,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+ src_minus11 = src13;
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ __lsx_vstx(dst2, dst, dst_stride_2x);
+ __lsx_vstx(dst3, dst, dst_stride_3x);
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+ }
+}
+
+static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus11, src10, src11;
+ __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
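+    /* diagonal EO: neighbours are the up-left and down-right pixels */
+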
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+ src_plus0, src_plus1);
+
+ DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
+ src_minus11, src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
+ src_zero1, src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+ src_plus0, src_plus1);
+
+ DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
+static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src10, src_minus11, src11;
+ __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
+ src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+ src_plus10, src_plus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
+ src_minus11, src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src10, src11);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+ src_plus10, src_plus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
+static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *
+ sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig = src;
+ uint8_t *dst_orig = dst;
+ int32_t v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+ __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+ __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+ __m128i diff_plus13, src_minus14, src_plus13;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+ __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
+ __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
+ __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
+ __m128i src_zero3, sao_offset, src_plus12;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (; height; height -= 4) {
+ src_orig = src - 1;
+ dst_orig = dst;
+ src_minus11 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src_minus12, src_minus13);
+ src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src_minus10 = __lsx_vld(src_orig - src_stride, 0);
+ src_orig += 16;
+ src10 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
+ src_stride_2x, src11, src12);
+ src13 = __lsx_vldx(src_orig, src_stride_3x);
+ src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);
+
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
+ src_minus12, shuf1, src12, src_minus13, shuf1,
+ src13, src_minus14, shuf1, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
+ src_minus13, shuf2, src_plus10, src_plus11);
+ src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);
+
+ DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10,
+ cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12,
+ cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0, offset_mask0,
+ offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
+ 128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
+ src_zero3);
+ DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+ offset_mask1, src_zero2, offset_mask2, src_zero3,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+
+ src_minus11 = src10;
+ src_minus12 = src11;
+ src_minus13 = src12;
+ src_minus14 = src13;
+
+ __lsx_vst(dst0, dst_orig, 0);
+ __lsx_vstx(dst1, dst_orig, dst_stride);
+ __lsx_vstx(dst2, dst_orig, dst_stride_2x);
+ __lsx_vstx(dst3, dst_orig, dst_stride_3x);
+ dst_orig += 16;
+ }
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+}
+
+static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i src_zero0, src_zero1, dst0;
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src10, src_minus11, src11;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
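+    /* diagonal EO: neighbours are the up-right and down-left pixels */
+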
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+}
+
+static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src10, src_minus11, src11;
+ __m128i src_zero0, src_zero1, dst0;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
+static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig, *dst_orig;
+ int32_t v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
+ __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
+ __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
+ __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
+ __m128i src_plus10, src_plus11, src_plus12, src_plus13;
+ __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (; height; height -= 4) {
+ src_orig = src - 1;
+ dst_orig = dst;
+
+ src_minus11 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src_plus10, src_plus11);
+ src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src_minus10 = __lsx_vld(src_orig - src_stride, 2);
+ src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
+ src_orig += 16;
+ src10 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src11, src12);
+            src13 = __lsx_vldx(src_orig, src_stride_3x);
+
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
+ src_plus10, shuf1, src12, src_plus11, shuf1, src13,
+ src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
+ src_zero3);
+ src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
+ DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
+ src_plus11, shuf2, src_minus12, src_minus13);
+
+ DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+ cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+ cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0, offset_mask0,
+ offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
+ src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+ offset_mask1, src_zero2, offset_mask2, src_zero3,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+
+ src_minus11 = src10;
+ src_plus10 = src11;
+ src_plus11 = src12;
+ src_plus12 = src13;
+
+ __lsx_vst(dst0, dst_orig, 0);
+ __lsx_vstx(dst1, dst_orig, dst_stride);
+ __lsx_vstx(dst2, dst_orig, dst_stride_2x);
+ __lsx_vstx(dst3, dst_orig, dst_stride_3x);
+ dst_orig += 16;
+ }
+
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+}
+
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride_dst,
+ int16_t *sao_offset_val,
+ int eo, int width, int height)
+{
+ ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
+
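+    /* eo selects the edge-offset direction handled below (0/90/45/135
+     * degrees). Each case processes multiples of 16 columns first, then an
+     * 8-column chunk, then a final 4-column chunk for any remaining width. */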
+ switch (eo) {
+ case 0:
+ if (width >> 4) {
+ hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val,
+ width - (width & 0x0F),
+ height);
+ dst += width & 0xFFFFFFF0;
+ src += width & 0xFFFFFFF0;
+ width &= 0x0F;
+ }
+
+ if (width >> 3) {
+ hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ dst += 8;
+ src += 8;
+ width &= 0x07;
+ }
+
+ if (width) {
+ hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ }
+ break;
+
+ case 1:
+ if (width >> 4) {
+ hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val,
+ width - (width & 0x0F),
+ height);
+ dst += width & 0xFFFFFFF0;
+ src += width & 0xFFFFFFF0;
+ width &= 0x0F;
+ }
+
+ if (width >> 3) {
+ hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ dst += 8;
+ src += 8;
+ width &= 0x07;
+ }
+
+ if (width) {
+ hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ }
+ break;
+
+ case 2:
+ if (width >> 4) {
+ hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val,
+ width - (width & 0x0F),
+ height);
+ dst += width & 0xFFFFFFF0;
+ src += width & 0xFFFFFFF0;
+ width &= 0x0F;
+ }
+
+ if (width >> 3) {
+ hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ dst += 8;
+ src += 8;
+ width &= 0x07;
+ }
+
+ if (width) {
+ hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ }
+ break;
+
+ case 3:
+ if (width >> 4) {
+ hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val,
+ width - (width & 0x0F),
+ height);
+ dst += width & 0xFFFFFFF0;
+ src += width & 0xFFFFFFF0;
+ width &= 0x0F;
+ }
+
+ if (width >> 3) {
+ hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ dst += 8;
+ src += 8;
+ width &= 0x07;
+ }
+
+ if (width) {
+ hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ }
+ break;
+ }
+}
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index fc0e8fb0df..f39674be64 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -85,6 +85,25 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
+
+ c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
+ c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_lsx;
+
+ c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_lsx;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_lsx;
+
+ c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_lsx;
+ c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+ c->hevc_h_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_h_8_lsx;
+ c->hevc_v_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+ c->idct[0] = ff_hevc_idct_4x4_lsx;
+ c->idct[1] = ff_hevc_idct_8x8_lsx;
+ c->idct[2] = ff_hevc_idct_16x16_lsx;
+ c->idct[3] = ff_hevc_idct_32x32_lsx;
}
}
}
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 3259e03f13..0e73fd1f8e 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -85,4 +85,30 @@ MC(epel, hv, 32);
#undef MC
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm);
+
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride_dst,
+ int16_t *sao_offset_val,
+ int eo, int width, int height);
+
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
+
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1
* [FFmpeg-devel] [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
2022-02-14 8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
` (2 preceding siblings ...)
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf " Hao Chen
@ 2022-02-14 8:25 ` Hao Chen
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w " Hao Chen
2022-02-15 3:30 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
5 siblings, 0 replies; 7+ messages in thread
From: Hao Chen @ 2022-02-14 8:25 UTC (permalink / raw)
To: ffmpeg-devel
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 124fps
after : 182fps
---
libavcodec/loongarch/Makefile | 3 +-
libavcodec/loongarch/hevc_mc_bi_lsx.c | 2289 +++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 51 +
libavcodec/loongarch/hevcdsp_lsx.h | 57 +
4 files changed, 2399 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/loongarch/hevc_mc_bi_lsx.c
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index cfc8e3aaff..620fba7192 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -27,4 +27,5 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
loongarch/vp9_idct_lsx.o
LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_idct_lsx.o \
- loongarch/hevc_lpf_sao_lsx.o
+ loongarch/hevc_lpf_sao_lsx.o \
+ loongarch/hevc_mc_bi_lsx.o
diff --git a/libavcodec/loongarch/hevc_mc_bi_lsx.c b/libavcodec/loongarch/hevc_mc_bi_lsx.c
new file mode 100644
index 0000000000..9092fdccb2
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_bi_lsx.c
@@ -0,0 +1,2289 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
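+/* Saturating add of the bi-prediction inputs (in0/in1) to the filtered
+ * vectors (vec0/vec1), followed by a rounding arithmetic shift right by 7
+ * narrowed with clip to the unsigned 8-bit range; both halves are packed
+ * into one result vector. */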
+static av_always_inline __m128i
+hevc_bi_rnd_clip(__m128i in0, __m128i vec0, __m128i in1, __m128i vec1)
+{
+ __m128i out;
+
+ vec0 = __lsx_vsadd_h(in0, vec0);
+ vec1 = __lsx_vsadd_h(in1, vec1);
+ out = __lsx_vssrarni_bu_h(vec1, vec0, 7);
+ return out;
+}
+
+/* hevc_bi_copy: dst = av_clip_uint8(((src0 << 6) + src1) >> 7) */
+static
+void hevc_bi_copy_4w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ int32_t loop_cnt = height >> 3;
+ int32_t res = (height & 0x07) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_2x = (src2_stride << 1);
+ int32_t src2_stride_4x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride;
+ __m128i src0, src1;
+ __m128i zero = __lsx_vldi(0);
+ __m128i in0, in1, in2, in3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (;loop_cnt--;) {
+ reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
+ src0_ptr += src_stride_4x;
+ DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ src0 = __lsx_vilvl_d(tmp1, tmp0);
+ reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ src1 = __lsx_vilvl_d(tmp1, tmp0);
+ src0_ptr += src_stride_4x;
+
+ tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
+ tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+ tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+ src1_ptr += src2_stride_4x;
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in0, in1);
+ tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
+ tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+ tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+ src1_ptr += src2_stride_4x;
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in2, in3);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst0, dst2);
+ DUP2_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, dst1, dst3);
+ DUP2_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst1, dst3);
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(dst0, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(dst0, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ __lsx_vstelm_w(dst1, dst, 0, 0);
+ __lsx_vstelm_w(dst1, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(dst1, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(dst1, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ }
+    for (;res--;) {
+ reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src1_ptr, 0);
+ reg3 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ src0 = __lsx_vilvl_w(reg1, reg0);
+ in0 = __lsx_vilvl_d(reg3, reg2);
+ dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+ dst0 = __lsx_vsadd_h(dst0, in0);
+ dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7);
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
+ src0_ptr += src_stride_2x;
+ src1_ptr += src2_stride_2x;
+ dst += dst_stride_2x;
+ }
+}
+
+static
+void hevc_bi_copy_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t res = (height & 0x07) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ __m128i out0, out1, out2, out3;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, src2, src3;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i reg0, reg1, reg2, reg3;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
+ src0_ptr += src_stride_4x;
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in5, in6);
+ in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ dst0, dst2, dst4, dst6);
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+ dst1, dst3, dst5, dst7);
+ DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, dst3,
+ dst5, dst7);
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+ out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ __lsx_vstelm_w(out0, dst + dst_stride, 0, 2);
+ __lsx_vstelm_h(out0, dst, 4, 2);
+ __lsx_vstelm_h(out0, dst + dst_stride, 4, 6);
+ __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 2);
+ __lsx_vstelm_h(out1, dst + dst_stride_2x, 4, 2);
+ __lsx_vstelm_h(out1, dst + dst_stride_3x, 4, 6);
+ dst += dst_stride_4x;
+ __lsx_vstelm_w(out2, dst, 0, 0);
+ __lsx_vstelm_w(out2, dst + dst_stride, 0, 2);
+ __lsx_vstelm_h(out2, dst, 4, 2);
+ __lsx_vstelm_h(out2, dst + dst_stride, 4, 6);
+ __lsx_vstelm_w(out3, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_w(out3, dst + dst_stride_3x, 0, 2);
+ __lsx_vstelm_h(out3, dst + dst_stride_2x, 4, 2);
+ __lsx_vstelm_h(out3, dst + dst_stride_3x, 4, 6);
+ dst += dst_stride_4x;
+ }
+ for (;res--;) {
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ src0 = __lsx_vilvl_d(reg1, reg0);
+ src0_ptr += src_stride_2x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ in1 = __lsx_vldx(src1_ptr, src2_stride_x);
+ src1_ptr += src2_stride_x;
+ dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+ dst1 = __lsx_vilvh_b(zero, src0);
+ dst1 = __lsx_vslli_h(dst1, 6);
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ __lsx_vstelm_h(out0, dst, 4, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ __lsx_vstelm_h(out0, dst, 4, 6);
+ dst += dst_stride;
+ }
+}
+
+static
+void hevc_bi_copy_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ int32_t loop_cnt = height >> 3;
+ int32_t res = (height & 7) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ __m128i out0, out1, out2, out3;
+ __m128i src0, src1, src2, src3;
+ __m128i zero = __lsx_vldi(0);
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i reg0, reg1, reg2, reg3;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
+ src0_ptr += src_stride_4x;
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
+ src0_ptr += src_stride_4x;
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ dst0, dst2, dst4, dst6);
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, dst1, dst3, dst5, dst7);
+ DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1,
+ dst3, dst5, dst7);
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in5, in6);
+ in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+ out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(out2, dst, 0, 0);
+ __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out3, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out3, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ }
+ for (;res--;) {
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ src0 = __lsx_vilvl_d(reg1, reg0);
+ in0 = __lsx_vld(src1_ptr, 0);
+ in1 = __lsx_vldx(src1_ptr, src2_stride_x);
+ dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+ dst1 = __lsx_vilvh_b(zero, src0);
+ dst1 = __lsx_vslli_h(dst1, 6);
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ src0_ptr += src_stride_2x;
+ src1_ptr += src2_stride_x;
+ dst += dst_stride_2x;
+ }
+}
+
+static
+void hevc_bi_copy_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ int16_t* _src1 = src1_ptr + 8;
+ __m128i out0, out1, out2;
+ __m128i src0, src1, src2, src3;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(_src1, 0);
+ DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+ in5, in6);
+ in7 = __lsx_vldx(_src1, src2_stride_3x);
+ _src1 += src2_stride_2x;
+
+ DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  dst0, dst1, dst2, dst3);
+        DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
+        DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst4, dst5);
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ __lsx_vstelm_w(out2, dst, 8, 0);
+ __lsx_vstelm_w(out2, dst + dst_stride, 8, 1);
+ __lsx_vstelm_w(out2, dst + dst_stride_2x, 8, 2);
+ __lsx_vstelm_w(out2, dst + dst_stride_3x, 8, 3);
+ dst += dst_stride_4x;
+ }
+}
+
+static
+void hevc_bi_copy_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ int16_t *_src1 = src1_ptr + 8;
+ __m128i out0, out1, out2, out3;
+ __m128i src0, src1, src2, src3;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+ __m128i zero = {0};
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(_src1, 0);
+ DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+ in5, in6);
+ in7 = __lsx_vldx(_src1, src2_stride_3x);
+ _src1 += src2_stride_2x;
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+                  dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+ dst0_l, dst1_l, dst2_l, dst3_l);
+ DUP4_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
+ dst0_l, dst1_l, dst2_l, dst3_l);
+
+ out0 = hevc_bi_rnd_clip(in0, dst0_r, in4, dst0_l);
+ out1 = hevc_bi_rnd_clip(in1, dst1_r, in5, dst1_l);
+ out2 = hevc_bi_rnd_clip(in2, dst2_r, in6, dst2_l);
+ out3 = hevc_bi_rnd_clip(in3, dst3_r, in7, dst3_l);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vstx(out1, dst, dst_stride);
+ __lsx_vstx(out2, dst, dst_stride_2x);
+ __lsx_vstx(out3, dst, dst_stride_3x);
+ dst += dst_stride_4x;
+ }
+}
+
+static
+void hevc_bi_copy_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, height);
+ hevc_bi_copy_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, height);
+ hevc_bi_copy_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, height);
+ hevc_bi_copy_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ hevc_bi_copy_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, height);
+ hevc_bi_copy_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
+ dst + 32, dst_stride, height);
+}
+
+static void hevc_hz_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i in0, in1, in2, in3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src0_ptr -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
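+    /* mask0..mask3 gather the pairs of neighbouring source pixels for each
+     * output sample; vdp2/vdp2add accumulate the four tap pairs of the 8-tap
+     * filter into 16-bit sums, which are then combined with the 16-bit
+     * bi-prediction input and rounded down to 8 bits. */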
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src0, src1);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src2, src3);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
+ src1_ptr += src2_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ dst += dst_stride_2x;
+ }
+}
+
+static void hevc_hz_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, tmp0, tmp1;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2;
+ __m128i in0, in1, in2;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src0_ptr -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+ mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
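+    /* The 24 columns span two 16-byte source vectors; mask4..mask7 select
+     * pixel pairs that straddle the boundary between them, so those shuffles
+     * take src1 and src0 together. */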
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+ in2 = __lsx_vld(src1_ptr, 32);
+ src1_ptr += src2_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
+ src1, mask0, src0, src0, mask1, vec0, vec1, vec2, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec2, filt0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt1);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask5, src1, src1, mask1, src0,
+ src0, mask2, src1, src0, mask6, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec0, filt1, dst2, vec1, filt1,
+ dst0, vec2, filt2, dst1, vec3, filt2, dst1, dst2, dst0, dst1);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask2, src0, src0, mask3, src1, src0,
+ mask7, src1, src1, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec0, filt2, dst0, vec1, filt3,
+ dst1, vec2, filt3, dst2, vec3, filt3, dst2, dst0, dst1, dst2);
+
+ tmp0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst2 = __lsx_vsadd_h(dst2, in2);
+ tmp1 = __lsx_vssrarni_bu_h(dst2, dst2, 7);
+
+ __lsx_vst(tmp0, dst, 0);
+ __lsx_vstelm_d(tmp1, dst, 16, 0);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_hz_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height);
+ hevc_hz_8t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hz_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height);
+ hevc_hz_8t_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hz_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_hz_8t_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height);
+ hevc_hz_8t_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
+ dst + 32, dst_stride, filter, height);
+}
+
+static av_always_inline
+void hevc_vt_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+                       int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10;
+ __m128i in0, in1, in2, in3;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src0_ptr -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src5, src6);
+ src0_ptr += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
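+    /* Sliding window over rows: the interleaved pairs above cover rows 0..6;
+     * each iteration loads four new rows, computes four filtered outputs and
+     * shifts the row pairs down for the next iteration. */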
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src8, src9);
+ src10 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
+ in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src76_r, src87_r, src98_r, src109_r);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+ filt0, src43_r, filt0, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
+ filt1, dst2_r, src54_r, filt1, dst3_r, src65_r, filt1,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, src65_r,
+ filt2, dst2_r, src76_r, filt2, dst3_r, src87_r, filt2,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, src87_r,
+ filt3, dst2_r, src98_r, filt3, dst3_r, src109_r, filt3,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
+ dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
+ __lsx_vstelm_d(dst0_r, dst, 0, 0);
+ __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+
+ src6 = src10;
+ }
+}
+
+static av_always_inline
+void hevc_vt_8t_16multx2mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height,
+ int32_t width)
+{
+ uint8_t *src0_ptr_tmp;
+ int16_t *src1_ptr_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt;
+ uint32_t cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i in0, in1, in2, in3;
+ __m128i src10_r, src32_r, src54_r, src76_r;
+ __m128i src21_r, src43_r, src65_r, src87_r;
+ __m128i dst0_r, dst1_r;
+ __m128i src10_l, src32_l, src54_l, src76_l;
+ __m128i src21_l, src43_l, src65_l, src87_l;
+ __m128i dst0_l, dst1_l;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src0_ptr -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
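+    /* Columns are processed in 16-wide chunks; within a chunk two rows are
+     * filtered per iteration, with the low (_r) and high (_l) halves of the
+     * interleaved byte pairs handled separately. */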
+ for (cnt = (width >> 4); cnt--;) {
+ src0_ptr_tmp = src0_ptr;
+ src1_ptr_tmp = src1_ptr;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+ src0_ptr_tmp += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src5, src6);
+ src0_ptr_tmp += src_stride_3x;
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_l, src32_l, src54_l, src21_l);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src7 = __lsx_vld(src0_ptr_tmp, 0);
+ src8 = __lsx_vldx(src0_ptr_tmp, src_stride);
+ src0_ptr_tmp += src_stride_2x;
+ DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in0, in2);
+ src1_ptr_tmp += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in1, in3);
+ src1_ptr_tmp += src2_stride;
+
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
+ filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
+ src43_r, filt1, dst0_l, src32_l, filt1, dst1_l, src43_l,
+ filt1, dst0_r, dst1_r, dst0_l, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r,
+ src65_r, filt2, dst0_l, src54_l, filt2, dst1_l, src65_l,
+ filt2, dst0_r, dst1_r, dst0_l, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r,
+ src87_r, filt3, dst0_l, src76_l, filt3, dst1_l, src87_l,
+ filt3, dst0_r, dst1_r, dst0_l, dst1_l);
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ __lsx_vstx(dst1_r, dst_tmp, dst_stride);
+ dst_tmp += dst_stride_2x;
+
+ src10_r = src32_r;
+ src32_r = src54_r;
+ src54_r = src76_r;
+ src21_r = src43_r;
+ src43_r = src65_r;
+ src65_r = src87_r;
+ src10_l = src32_l;
+ src32_l = src54_l;
+ src54_l = src76_l;
+ src21_l = src43_l;
+ src43_l = src65_l;
+ src65_l = src87_l;
+ src6 = src8;
+ }
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ dst += 16;
+ }
+}
+
+static void hevc_vt_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 16);
+}
+
+static void hevc_vt_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 16);
+ hevc_vt_8t_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_vt_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 32);
+}
+
+static void hevc_vt_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 48);
+}
+
+static void hevc_vt_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 64);
+}
+
+static av_always_inline
+void hevc_hv_8t_8multx1mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height, int32_t width)
+{
+ uint32_t loop_cnt;
+ uint32_t cnt;
+ uint8_t *src0_ptr_tmp;
+ int16_t *src1_ptr_tmp;
+ uint8_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i out;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, tmp;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i dst0_r, dst0_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+
+ src0_ptr -= src_stride_3x + 3;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, filter_x,
+ 6, filt0, filt1, filt2, filt3);
+ filt_h3 = __lsx_vld(filter_y, 0);
+ filt_h3 = __lsx_vsllwil_h_b(filt_h3, 0);
+
+ DUP4_ARG2(__lsx_vreplvei_w, filt_h3, 0, filt_h3, 1, filt_h3, 2, filt_h3, 3,
+ filt_h0, filt_h1, filt_h2, filt_h3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
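+    /* Separable 8-tap filter: each row is first filtered horizontally into
+     * 16-bit intermediates (dst0..dst7), which are then filtered vertically
+     * with 32-bit accumulation, shifted down by 6, combined with the 16-bit
+     * bi-prediction input and clamped to 8 bits. */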
+ for (cnt = width >> 3; cnt--;) {
+ src0_ptr_tmp = src0_ptr;
+ dst_tmp = dst;
+ src1_ptr_tmp = src1_ptr;
+
+ src0 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+ src0_ptr_tmp += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src5, src6);
+ src0_ptr_tmp += src_stride_3x;
+
+ /* row 0 row 1 row 2 row 3 */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+ vec12, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+ dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+ dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+ dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+ src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+ src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+ dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+ dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src7 = __lsx_vld(src0_ptr_tmp, 0);
+ src0_ptr_tmp += src_stride;
+
+ in0 = __lsx_vld(src1_ptr_tmp, 0);
+ src1_ptr_tmp += src2_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+ filt2, dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+ dst6, dst10_r, dst32_r, dst54_r, dst76_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+ dst6, dst10_l, dst32_l, dst54_l, dst76_l);
+
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+ dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+ dst76_l, filt_h3, dst0_r, dst0_l);
+ dst0_r = __lsx_vsrli_w(dst0_r, 6);
+ dst0_l = __lsx_vsrli_w(dst0_l, 6);
+
+ tmp = __lsx_vpickev_h(dst0_l, dst0_r);
+ tmp = __lsx_vsadd_h(tmp, in0);
+ tmp = __lsx_vmaxi_h(tmp, 0);
+ out = __lsx_vssrlrni_bu_h(tmp, tmp, 7);
+ __lsx_vstelm_d(out, dst_tmp, 0, 0);
+ dst_tmp += dst_stride;
+
+ dst0 = dst1;
+ dst1 = dst2;
+ dst2 = dst3;
+ dst3 = dst4;
+ dst4 = dst5;
+ dst5 = dst6;
+ dst6 = dst7;
+ }
+
+ src0_ptr += 8;
+ dst += 8;
+ src1_ptr += 8;
+ }
+}
+
+static void hevc_hv_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 64);
+}
+
+static void hevc_hz_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int16_t *src1_ptr_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt;
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_x = src2_stride << 1;
+ int32_t src2_stride_2x = src2_stride << 2;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i filt0, filt1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ src0_ptr -= 1;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 10);
+
+ dst_tmp = dst + 16;
+ src1_ptr_tmp = src1_ptr + 16;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src3);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src5);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src6, src7);
+ src0_ptr += src_stride;
+
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in4, in5);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in6, in7);
+ src1_ptr += src2_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src2,
+ src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src2,
+ src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src4, mask2, src6,
+ src6, mask0, src7, src6, mask2, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst4, dst5, dst6, dst7);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src4, mask3, src6,
+ src6, mask1, src7, src6, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec0, filt1, dst5, vec1, filt1,
+ dst6, vec2, filt1, dst7, vec3, filt1, dst4, dst5, dst6, dst7);
+
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ dst2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+ dst3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ __lsx_vstx(dst2, dst, dst_stride_2x);
+ __lsx_vstx(dst3, dst, dst_stride_3x);
+ dst += dst_stride_4x;
+
+ in0 = __lsx_vld(src1_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
+ src1_ptr_tmp += src2_stride_2x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src3, src3, mask0, src5,
+ src5, mask0, src7, src7, mask0, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask1, src3, src3, mask1, src5,
+ src5, mask1, src7, src7, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ __lsx_vstelm_d(dst0, dst_tmp, 0, 0);
+ __lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst1, dst_tmp + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, dst_tmp + dst_stride_3x, 0, 1);
+ dst_tmp += dst_stride_4x;
+ }
+}
+
+static void hevc_hz_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2;
+ __m128i in0, in1, in2, in3;
+ __m128i filt0, filt1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, mask2, mask3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3;
+
+ src0_ptr -= 1;
+
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 10);
+
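+    /* 4-tap horizontal filter over 32 columns, one row per iteration;
+     * mask2/mask3 select the pixel pairs that cross into the second
+     * 16-byte source vector. */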
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+ src2 = __lsx_vld(src0_ptr, 24);
+ src0_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, src1_ptr, 32,
+ src1_ptr, 48, in0, in1, in2, in3);
+ src1_ptr += src2_stride;
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src1,
+ src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src1,
+ src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_vt_4t_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ int16_t *_src1 = src1_ptr + 8;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+ __m128i src2110, src4332, src6554;
+ __m128i dst0_l, dst1_l, filt0, filt1;
+
+ src0_ptr -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src0_ptr += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+ src2110 = __lsx_vilvl_d(src21_l, src10_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(_src1, 0);
+ DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+ in5, in6);
+ in7 = __lsx_vldx(_src1, src2_stride_3x);
+ _src1 += src2_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+ src4332 = __lsx_vilvl_d(src43_l, src32_l);
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src54_r, src65_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src54_l, src65_l);
+ src6554 = __lsx_vilvl_d(src65_l, src54_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src2110,
+ filt0, src32_r, filt0, dst0_r, dst1_r, dst0_l, dst2_r);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src43_r, filt0, src4332, filt0,
+ dst3_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
+ src43_r, filt1, dst0_l, src4332, filt1, dst2_r, src54_r,
+ filt1, dst0_r, dst1_r, dst0_l, dst2_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst1_l,
+ src6554, filt1, dst3_r, dst1_l);
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
+ dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
+ dst0_l = hevc_bi_rnd_clip(in4, dst0_l, in5, dst1_l);
+ __lsx_vstelm_d(dst0_r, dst, 0, 0);
+ __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
+ __lsx_vstelm_w(dst0_l, dst, 8, 0);
+ __lsx_vstelm_w(dst0_l, dst + dst_stride, 8, 1);
+ __lsx_vstelm_w(dst0_l, dst + dst_stride_2x, 8, 2);
+ __lsx_vstelm_w(dst0_l, dst + dst_stride_3x, 8, 3);
+ dst += dst_stride_4x;
+
+ src2 = src6;
+ src10_r = src54_r;
+ src21_r = src65_r;
+ src2110 = src6554;
+ }
+}
+
+static void hevc_vt_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i in0, in1, in2, in3;
+ __m128i src10_r, src32_r, src21_r, src43_r;
+ __m128i src10_l, src32_l, src21_l, src43_l;
+ __m128i dst0_r, dst1_r, dst0_l, dst1_l;
+ __m128i filt0, filt1;
+
+ src0_ptr -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src0_ptr += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src0_ptr, 0);
+ src4 = __lsx_vldx(src0_ptr, src_stride);
+ src0_ptr += src_stride_2x;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
+ filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
+ filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, filt1,
+ dst0_r, dst1_r, dst0_l, dst1_l);
+
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride);
+ dst += dst_stride_2x;
+
+ src5 = __lsx_vld(src0_ptr, 0);
+ src2 = __lsx_vldx(src0_ptr, src_stride);
+ src0_ptr += src_stride_2x;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+ src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride);
+ dst += dst_stride_2x;
+ }
+}
+
+static void hevc_vt_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t dst_stride_2x = dst_stride << 1;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10, src11;
+ __m128i in0, in1, in2, in3, in4, in5;
+ __m128i src10_r, src32_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src87_r, src109_r;
+ __m128i src10_l, src32_l, src21_l, src43_l;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i dst0_l, dst1_l;
+ __m128i filt0, filt1;
+
+ src0_ptr -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ /* 16width */
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src6);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src1, src7);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+ /* 8width */
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ /* 16width */
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src3, src9);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src10);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+ in4 = __lsx_vld(src1_ptr, 32);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+ in5 = __lsx_vld(src1_ptr, 32);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+ /* 8width */
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+ /* 16width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+ src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, filt1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ /* 8width */
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+ dst2_r, dst3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
+ src109_r, filt1, dst2_r, dst3_r);
+ /* 16width */
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+ dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride);
+ __lsx_vstelm_d(dst2_r, dst, 16, 0);
+ __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
+ dst += dst_stride_2x;
+
+ /* 16width */
+ DUP4_ARG2(__lsx_vld, src0_ptr, 0, src1_ptr, 0, src1_ptr, 16, src1_ptr,
+ 32, src5, in0, in2, in4);
+ src1_ptr += src2_stride;
+ DUP4_ARG2(__lsx_vld, src0_ptr, 16, src1_ptr, 0, src1_ptr, 16, src1_ptr,
+ 32, src11, in1, in3, in5);
+ src1_ptr += src2_stride;
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+ /* 8width */
+ DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+ /* 16width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+ src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+
+ /* 8width */
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+ dst2_r, dst3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r,
+ src87_r, filt1, dst2_r, dst3_r);
+
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+ dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride);
+ __lsx_vstelm_d(dst2_r, dst, 16, 0);
+ __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
+ dst += dst_stride_2x;
+ }
+}
+
+static void hevc_vt_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_4t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height);
+ hevc_vt_4t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hv_4t_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_2x = (src2_stride << 1);
+ int32_t src2_stride_4x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride;
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, mask1;
+ __m128i filt0, filt1, filt_h0, filt_h1;
+ __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5;
+ __m128i dsth6, dsth7, dsth8, dsth9, dsth10;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst4_r, dst5_r, dst6_r, dst7_r;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src0_ptr -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filt_h1 = __lsx_vld(filter_y, 0);
+ filt_h1 = __lsx_vsllwil_h_b(filt_h1, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filt_h1, 0, filt_h1, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src0_ptr += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
+ dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
+ dsth0, dsth1);
+ dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, tmp1, tmp3);
+
+ src3 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dsth3, dsth4, dsth5, dsth6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4, vec3, filt1, dsth5,
+ vec5, filt1, dsth6, vec7, filt1, dsth3, dsth4, dsth5, dsth6);
+
+ src3 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dsth7, dsth8, dsth9, dsth10);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth7, vec1, filt1, dsth8, vec3, filt1, dsth9,
+ vec5, filt1, dsth10, vec7, filt1, dsth7, dsth8, dsth9, dsth10);
+
+ DUP2_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, tmp4, tmp6);
+ DUP2_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, tmp5, tmp7);
+ DUP2_ARG2(__lsx_vilvl_h, dsth5, dsth4, dsth6, dsth5, dsth0, dsth2);
+ DUP2_ARG2(__lsx_vilvh_h, dsth5, dsth4, dsth6, dsth5, dsth1, dsth3);
+ DUP4_ARG2(__lsx_vdp2_w_h, tmp0, filt_h0, tmp2, filt_h0, tmp4, filt_h0,
+ tmp6, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, tmp4, filt_h1, dst1_r, tmp6,
+ filt_h1, dst2_r, dsth0, filt_h1, dst3_r, dsth2, filt_h1,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP2_ARG2(__lsx_vpickev_d, tmp3, tmp1, tmp7, tmp5, tmp0, tmp8);
+ dst0_l = __lsx_vdp2_w_h(tmp0, filt_h0);
+ dst0_l = __lsx_vdp2add_w_h(dst0_l, tmp8, filt_h1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dsth7, dsth6, dsth8, dsth7, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vilvh_h, dsth7, dsth6, dsth8, dsth7, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vilvl_h, dsth9, dsth8, dsth10, dsth9, tmp4, tmp6);
+ DUP2_ARG2(__lsx_vilvh_h, dsth9, dsth8, dsth10, dsth9, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vdp2_w_h, dsth0, filt_h0, dsth2, filt_h0, tmp0, filt_h0,
+ tmp2, filt_h0, dst4_r, dst5_r, dst6_r, dst7_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, tmp0, filt_h1, dst5_r, tmp2,
+ filt_h1, dst6_r, tmp4, filt_h1, dst7_r, tmp6, filt_h1,
+ dst4_r, dst5_r, dst6_r, dst7_r);
+ DUP2_ARG2(__lsx_vpickev_d, dsth3, dsth1, tmp3, tmp1, tmp0, tmp1);
+ tmp2 = __lsx_vpickev_d(tmp7, tmp5);
+
+ DUP2_ARG2(__lsx_vdp2_w_h, tmp8, filt_h0, tmp0, filt_h0, dst1_l, dst2_l);
+ dst3_l = __lsx_vdp2_w_h(tmp1, filt_h0);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst1_l, tmp0, filt_h1, dst2_l, tmp1, filt_h1,
+ dst1_l, dst2_l);
+ dst3_l = __lsx_vdp2add_w_h(dst3_l, tmp2, filt_h1);
+
+ DUP4_ARG2(__lsx_vsrai_d, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG2(__lsx_vsrai_d, dst4_r, 6, dst5_r, 6, dst6_r, 6, dst7_r, 6,
+ dst4_r, dst5_r, dst6_r, dst7_r);
+ DUP4_ARG2(__lsx_vsrai_d, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
+ dst0_l, dst1_l, dst2_l, dst3_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vpickev_h, dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vpickev_h, dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
+
+ reg0 = __lsx_vldrepl_d(src1_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ dsth0 = __lsx_vilvl_d(reg1, reg0);
+ reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+ reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+ dsth1 = __lsx_vilvl_d(reg1, reg0);
+ src1_ptr += src2_stride_4x;
+ reg0 = __lsx_vldrepl_d(src1_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ dsth2 = __lsx_vilvl_d(reg1, reg0);
+ reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+ reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+ dsth3 = __lsx_vilvl_d(reg1, reg0);
+
+ DUP4_ARG2(__lsx_vsadd_h, dsth0, tmp0, dsth1, tmp1, dsth2, tmp2, dsth3,
+ tmp3, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ __lsx_vstelm_w(out1, dst, 0, 0);
+ __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
+ dst -= dst_stride_4x;
+
+ src1_ptr -= src2_stride_4x;
+
+ reg0 = __lsx_vldrepl_w(src1_ptr, 8);
+ reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
+ reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
+ reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
+ DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ dsth4 = __lsx_vilvl_d(tmp1, tmp0);
+ src1_ptr += src2_stride_4x;
+
+ reg0 = __lsx_vldrepl_w(src1_ptr, 8);
+ reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
+ reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
+ reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
+ DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ dsth5 = __lsx_vilvl_d(tmp1, tmp0);
+ DUP2_ARG2(__lsx_vsadd_h, dsth4, tmp4, dsth5, tmp5, tmp4, tmp5);
+    DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
+ out0 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
+
+ __lsx_vstelm_h(out0, dst, 4, 0);
+ __lsx_vstelm_h(out0, dst + dst_stride, 4, 1);
+ __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 2);
+ __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 3);
+ dst += dst_stride_4x;
+ __lsx_vstelm_h(out0, dst, 4, 4);
+ __lsx_vstelm_h(out0, dst + dst_stride, 4, 5);
+ __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 6);
+ __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 7);
+}
+
+static av_always_inline
+void hevc_hv_4t_8x2_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+ int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i out;
+ __m128i src0, src1, src2, src3, src4;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+ __m128i tmp0, tmp1;
+ __m128i in0, in1;
+
+ src0_ptr -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
+ src1, src2, src3, src4);
+
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr + src2_stride, 0, in0, in1);
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+ vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp0, tmp1);
+ out = __lsx_vssrlrni_bu_h(tmp1, tmp0, 7);
+ __lsx_vstelm_d(out, dst, 0, 0);
+ __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
+}
+
+static av_always_inline
+void hevc_hv_4t_8multx4_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t width8mult)
+{
+ uint32_t cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+ __m128i in0, in1, in2, in3;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+ src0_ptr -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src5, src6);
+ src0_ptr += (8 - src_stride_4x);
+
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += 8;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+ dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+ dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += 8;
+ }
+}
+
+static av_always_inline
+void hevc_hv_4t_8x6_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+ int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+ __m128i out0, out1, out2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i in0, in1, in2, in3, in4, in5;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+ __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+ __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+ __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+
+ src0_ptr -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr, 0);
+ DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
+ src5, src6, src7, src8);
+
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
+ in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(src1_ptr, 0);
+ in5 = __lsx_vldx(src1_ptr, src2_stride_x);
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec10, vec11);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec12, vec13);
+ DUP2_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, vec14, vec15);
+ DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec10, filt0, vec12, filt0, vec14, filt0,
+ vec16, filt0, dst5, dst6, dst7, dst8);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+ vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec11, filt1, dst6, vec13, filt1,
+ dst7, vec15, filt1, dst8, vec17, filt1, dst5, dst6, dst7, dst8);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_r, dst21_r, dst32_r, dst43_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_l, dst21_l, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_r, dst65_r, dst76_r, dst87_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_l, dst65_l, dst76_l, dst87_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+ filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+ filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+ dst4_r, dst4_l, dst5_r, dst5_l);
+
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6,
+ dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+ dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
+ DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vsadd_h, in4, tmp4, in5, tmp5, tmp4, tmp5);
+ DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
+ DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+ out2 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(out2, dst, 0, 0);
+ __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+}
+
+static av_always_inline
+void hevc_hv_4t_8multx4mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height, int32_t width)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src0_ptr_tmp;
+ int16_t *src1_ptr_tmp;
+ uint8_t *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src2_stride_x = (src2_stride << 1);
+ const int32_t src2_stride_2x = (src2_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ const int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i in0, in1, in2, in3;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
+
+ src0_ptr -= (src_stride + 1);
+
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width >> 3; cnt--;) {
+ src0_ptr_tmp = src0_ptr;
+ dst_tmp = dst;
+ src1_ptr_tmp = src1_ptr;
+
+ src0 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src1, src2);
+ src0_ptr_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ for (loop_cnt = height >> 2; loop_cnt--;) {
+ src3 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src4, src5);
+ src6 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+ src0_ptr_tmp += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
+ src1_ptr_tmp += src2_stride_2x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+ src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+ src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+ filt1, dst5, vec5, filt1, dst6, vec7, filt1,
+ dst3, dst4, dst5, dst6);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+ dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+ dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+ dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, tmp0,
+ tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+ __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+ __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+ dst_tmp += dst_stride_4x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dst2 = dst6;
+ }
+
+ src0_ptr += 8;
+ dst += 8;
+ src1_ptr += 8;
+ }
+}
+
+static void hevc_hv_4t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ if (2 == height) {
+ hevc_hv_4t_8x2_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y);
+ } else if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, 1);
+ } else if (6 == height) {
+ hevc_hv_4t_8x6_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y);
+ } else {
+ hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 8);
+ }
+}
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, 2);
+ } else {
+ hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 16);
+ }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 32);
+}
+
+#define BI_MC_COPY(WIDTH) \
+void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ hevc_bi_copy_##WIDTH##w_lsx(src, src_stride, src_16bit, MAX_PB_SIZE, \
+ dst, dst_stride, height); \
+}
+
+BI_MC_COPY(4);
+BI_MC_COPY(6);
+BI_MC_COPY(8);
+BI_MC_COPY(12);
+BI_MC_COPY(16);
+BI_MC_COPY(24);
+BI_MC_COPY(32);
+BI_MC_COPY(48);
+BI_MC_COPY(64);
+
+#undef BI_MC_COPY
+
+#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
+ \
+ hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit, \
+ MAX_PB_SIZE, dst, dst_stride, \
+ filter, height); \
+}
+
+BI_MC(qpel, h, 16, 8, hz, mx);
+BI_MC(qpel, h, 24, 8, hz, mx);
+BI_MC(qpel, h, 32, 8, hz, mx);
+BI_MC(qpel, h, 48, 8, hz, mx);
+BI_MC(qpel, h, 64, 8, hz, mx);
+
+BI_MC(qpel, v, 8, 8, vt, my);
+BI_MC(qpel, v, 16, 8, vt, my);
+BI_MC(qpel, v, 24, 8, vt, my);
+BI_MC(qpel, v, 32, 8, vt, my);
+BI_MC(qpel, v, 48, 8, vt, my);
+BI_MC(qpel, v, 64, 8, vt, my);
+
+BI_MC(epel, h, 24, 4, hz, mx);
+BI_MC(epel, h, 32, 4, hz, mx);
+
+BI_MC(epel, v, 12, 4, vt, my);
+BI_MC(epel, v, 16, 4, vt, my);
+BI_MC(epel, v, 24, 4, vt, my);
+BI_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_MC
+
+#define BI_MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ \
+ hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit, \
+ MAX_PB_SIZE, dst, dst_stride, \
+ filter_x, filter_y, height); \
+}
+
+BI_MC_HV(qpel, 8, 8);
+BI_MC_HV(qpel, 16, 8);
+BI_MC_HV(qpel, 24, 8);
+BI_MC_HV(qpel, 32, 8);
+BI_MC_HV(qpel, 48, 8);
+BI_MC_HV(qpel, 64, 8);
+
+BI_MC_HV(epel, 8, 4);
+BI_MC_HV(epel, 6, 4);
+BI_MC_HV(epel, 16, 4);
+BI_MC_HV(epel, 24, 4);
+BI_MC_HV(epel, 32, 4);
+
+#undef BI_MC_HV
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index f39674be64..e05fe780c2 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -86,6 +86,57 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+ c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_lsx;
+ c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_lsx;
+ c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_lsx;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_lsx;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_lsx;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_lsx;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_lsx;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_lsx;
+
+ c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_lsx;
+ c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_lsx;
+ c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_lsx;
+ c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_lsx;
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_lsx;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_lsx;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_lsx;
+
+ c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_lsx;
+ c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_lsx;
+ c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_lsx;
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_lsx;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_lsx;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_lsx;
+
+ c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_lsx;
+ c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_lsx;
+ c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_lsx;
+ c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_lsx;
+ c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_lsx;
+ c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_lsx;
+
+ c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_lsx;
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_lsx;
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_lsx;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_lsx;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_lsx;
+
+ c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_lsx;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lsx;
+
+ c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_lsx;
+ c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_lsx;
+ c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_lsx;
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_lsx;
+
+ c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_lsx;
+ c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_lsx;
+ c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_lsx;
+ c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
+
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 0e73fd1f8e..6a834290ab 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -85,6 +85,63 @@ MC(epel, hv, 32);
#undef MC
+#define BI_MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+BI_MC(pel, pixels, 4);
+BI_MC(pel, pixels, 6);
+BI_MC(pel, pixels, 8);
+BI_MC(pel, pixels, 12);
+BI_MC(pel, pixels, 16);
+BI_MC(pel, pixels, 24);
+BI_MC(pel, pixels, 32);
+BI_MC(pel, pixels, 48);
+BI_MC(pel, pixels, 64);
+
+BI_MC(qpel, h, 16);
+BI_MC(qpel, h, 24);
+BI_MC(qpel, h, 32);
+BI_MC(qpel, h, 48);
+BI_MC(qpel, h, 64);
+
+BI_MC(qpel, v, 8);
+BI_MC(qpel, v, 16);
+BI_MC(qpel, v, 24);
+BI_MC(qpel, v, 32);
+BI_MC(qpel, v, 48);
+BI_MC(qpel, v, 64);
+
+BI_MC(qpel, hv, 8);
+BI_MC(qpel, hv, 16);
+BI_MC(qpel, hv, 24);
+BI_MC(qpel, hv, 32);
+BI_MC(qpel, hv, 48);
+BI_MC(qpel, hv, 64);
+
+BI_MC(epel, h, 24);
+BI_MC(epel, h, 32);
+
+BI_MC(epel, v, 12);
+BI_MC(epel, v, 16);
+BI_MC(epel, v, 24);
+BI_MC(epel, v, 32);
+
+BI_MC(epel, hv, 6);
+BI_MC(epel, hv, 8);
+BI_MC(epel, hv, 16);
+BI_MC(epel, hv, 24);
+BI_MC(epel, hv, 32);
+
+#undef BI_MC
+
void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
int32_t beta, int32_t *tc,
uint8_t *p_is_pcm, uint8_t *q_is_pcm);
--
2.20.1
* [FFmpeg-devel] [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
2022-02-14 8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
` (3 preceding siblings ...)
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi " Hao Chen
@ 2022-02-14 8:25 ` Hao Chen
2022-02-15 3:30 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
5 siblings, 0 replies; 7+ messages in thread
From: Hao Chen @ 2022-02-14 8:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lu Wang
From: Lu Wang <wanglu@loongson.cn>
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 182fps
after : 191fps
---
libavcodec/loongarch/Makefile | 4 +-
libavcodec/loongarch/hevc_mc_uni_lsx.c | 1423 +++++++++++++++++
libavcodec/loongarch/hevc_mc_uniw_lsx.c | 298 ++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 30 +
libavcodec/loongarch/hevcdsp_lsx.h | 59 +
5 files changed, 1813 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/loongarch/hevc_mc_uni_lsx.c
create mode 100644 libavcodec/loongarch/hevc_mc_uniw_lsx.c
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 620fba7192..c1b5de5c44 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,4 +28,6 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_idct_lsx.o \
loongarch/hevc_lpf_sao_lsx.o \
- loongarch/hevc_mc_bi_lsx.o
+ loongarch/hevc_mc_bi_lsx.o \
+ loongarch/hevc_mc_uni_lsx.o \
+ loongarch/hevc_mc_uniw_lsx.o
diff --git a/libavcodec/loongarch/hevc_mc_uni_lsx.c b/libavcodec/loongarch/hevc_mc_uni_lsx.c
new file mode 100644
index 0000000000..a15c86268f
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_uni_lsx.c
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static av_always_inline
+void common_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ __m128i mask0, mask1, mask2, mask3, out1, out2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i res0, res1, res2, res3;
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ src -= 3;
+
+ /* rearranging filter */
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 8, src, 16, src, 24,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src, 32, src, 40, src, 48, src, 56,
+ src4, src5, src6, src7);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
+ res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec6, vec7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
+ res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec6, vec7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
+ res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
+ out1, out2);
+ __lsx_vst(out1, dst, 0);
+ __lsx_vst(out2, dst, 16);
+
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
+ res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
+ vec6, vec7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
+ res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
+ vec6, vec7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
+ res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
+ out1, out2);
+ __lsx_vst(out1, dst, 32);
+ __lsx_vst(out2, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+static av_always_inline
+void common_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ __m128i tmp0, tmp1;
+ __m128i out0_r, out1_r, out2_r, out3_r;
+
+ src -= src_stride_3x;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+ filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
+ src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
+ filt1, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
+ src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
+ filt2, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
+ src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
+ filt3, out0_r, out1_r, out2_r, out3_r);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
+                  tmp0, tmp1);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
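+/* 8-tap vertical filter applied to 16-pixel wide strips; width must be a
+ * multiple of 16 and four output rows are produced per inner-loop iteration. */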
+static av_always_inline
+void common_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter,
+ int32_t height, int32_t width)
+{
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= src_stride_3x;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, filt0,
+ filt1, filt2, filt3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_l, src32_l, src54_l, src21_l);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src8, src9);
+ src10 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_l, src87_l, src98_l, src109_l);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+ filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
+ src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
+ filt1, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
+ src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
+ filt2, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
+ src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
+ filt3, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
+ filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
+ src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
+ filt1, out0_l, out1_l, out2_l, out3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
+ src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
+ filt2, out0_l, out1_l, out2_l, out3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
+ src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
+ filt3, out0_l, out1_l, out2_l, out3_l);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
+ 6, out2_l, out2_r, 6, out3_l, out3_r, 6,
+ tmp0, tmp1, tmp2, tmp3);
+ __lsx_vst(tmp0, dst_tmp, 0);
+ __lsx_vstx(tmp1, dst_tmp, dst_stride);
+ __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
+ __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
+ dst_tmp += dst_stride_4x;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
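+/* Wider blocks reuse the 16-wide kernel: 32/48/64 pass the width through,
+ * while 24 appends an extra 8-wide column handled by the 8-wide kernel. */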
+static void common_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 16);
+ common_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, filter,
+ height);
+}
+
+static void common_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
+}
+
+static void common_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 48);
+}
+
+static void common_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 64);
+}
+
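+/* 8-tap horizontal followed by 8-tap vertical (hv) filtering over 8-pixel
+ * wide strips; the horizontal results are kept as 16-bit intermediates and
+ * two output rows are produced per inner-loop iteration. */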
+static av_always_inline
+void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t width)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i out;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask1, mask2, mask3;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+ __m128i dst21_r, dst43_r, dst65_r, dst87_r;
+ __m128i dst21_l, dst43_l, dst65_l, dst87_l;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= (src_stride_3x + 3);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (cnt = width >> 3; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+
+ /* row 0 row 1 row 2 row 3 */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+ vec12, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+ dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+ dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+ dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+ src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+ src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+ dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+ dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+ dst1, dst10_r, dst32_r, dst54_r, dst21_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+ dst1, dst10_l, dst32_l, dst54_l, dst21_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ src8 = __lsx_vldx(src_tmp, src_stride);
+ src_tmp += src_stride_2x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+ filt2, dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ dst76_r = __lsx_vilvl_h(dst7, dst6);
+ dst76_l = __lsx_vilvh_h(dst7, dst6);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+ dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+ dst76_l, filt_h3, dst0_r, dst0_l);
+ DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
+
+ DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
+ src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
+ dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
+ filt2, dst8, dst8);
+ dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
+
+ dst87_r = __lsx_vilvl_h(dst8, dst7);
+ dst87_l = __lsx_vilvh_h(dst8, dst7);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
+ dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
+ dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
+ dst87_l, filt_h3, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l,
+ 6, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
+ dst0_l, dst0_r, dst1_l, dst1_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+ dst0, dst1);
+ out = __lsx_vpickev_b(dst1, dst0);
+ __lsx_vstelm_d(out, dst_tmp, 0, 0);
+ __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1);
+ dst_tmp += dst_stride_2x;
+
+ dst10_r = dst32_r;
+ dst32_r = dst54_r;
+ dst54_r = dst76_r;
+ dst10_l = dst32_l;
+ dst32_l = dst54_l;
+ dst54_l = dst76_l;
+ dst21_r = dst43_r;
+ dst43_r = dst65_r;
+ dst65_r = dst87_r;
+ dst21_l = dst43_l;
+ dst43_l = dst65_l;
+ dst65_l = dst87_l;
+ dst6 = dst8;
+ }
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 64);
+}
+
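+/* 4-tap vertical filter for 24-pixel wide blocks, split into a 16-wide part
+ * and an 8-wide part; the height argument is unused here, the loop being
+ * fixed at 8 iterations of 4 rows. */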
+static av_always_inline
+void common_vt_4t_24w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src11, filt0, filt1;
+ __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+ __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
+ __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
+ __m128i out1, out2, out3, out4;
+
+ src -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ _src = src + 16;
+
+ /* 16 width */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ /* 8 width */
+ src6 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+ src += src_stride_3x;
+ _src += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ /* 16 width */
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ /* 8 width */
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+
+ /* 16 width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
+ filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
+ out0_r, out0_l, out1_r, out1_l);
+
+ /* 8 width */
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+ out2_r, out3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
+ src109_r, filt1, out2_r, out3_r);
+
+ /* 16 + 8 width */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
+ out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4);
+ __lsx_vst(out1, dst, 0);
+ __lsx_vstelm_d(out2, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(out4, dst, 0);
+ __lsx_vstelm_d(out3, dst, 16, 0);
+ dst += dst_stride;
+
+ /* 16 width */
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+ /* 8 width */
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+
+ /* 16 width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l,
+ filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1,
+ out0_r, out0_l, out1_r, out1_l);
+
+ /* 8 width */
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+ out2_r, out3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r,
+ src87_r, filt1, out2_r, out3_r);
+
+ /* 16 + 8 width */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
+ out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);
+
+ __lsx_vst(out1, dst, 0);
+ __lsx_vstelm_d(out2, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(out3, dst, 0);
+ __lsx_vstelm_d(out4, dst, 16, 0);
+ dst += dst_stride;
+ }
+}
+
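+/* 4-tap vertical filter for 32-pixel wide blocks, processed as two
+ * independent 16-wide halves with two output rows per iteration. */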
+static av_always_inline
+void common_vt_4t_32w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+
+ __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src87_r, src109_r;
+ __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+ __m128i src10_l, src32_l, src76_l, src98_l;
+ __m128i src21_l, src43_l, src87_l, src109_l;
+ __m128i filt0, filt1;
+ __m128i out1, out2;
+
+ src -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ _src = src + 16;
+
+ /* 16 width */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ /* next 16 width */
+ src6 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+ src += src_stride_3x;
+ _src += src_stride_3x;
+
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ /* 16 width */
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ /* 16 width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
+ filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
+ out0_r, out0_l, out1_r, out1_l);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
+ out1, out2);
+ __lsx_vst(out1, dst, 0);
+ __lsx_vstx(out2, dst, dst_stride);
+
+ src10_r = src32_r;
+ src21_r = src43_r;
+ src10_l = src32_l;
+ src21_l = src43_l;
+ src2 = src4;
+
+ /* next 16 width */
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+ DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
+
+ /* next 16 width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
+ filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
+ filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
+ out2_r, out2_l, out3_r, out3_l);
+
+ /* next 16 width */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
+ out1, out2);
+ __lsx_vst(out1, dst, 16);
+ __lsx_vst(out2, dst + dst_stride, 16);
+
+ dst += dst_stride_2x;
+
+ src76_r = src98_r;
+ src87_r = src109_r;
+ src76_l = src98_l;
+ src87_l = src109_l;
+ src8 = src10;
+ }
+}
+
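+/* 4-tap horizontal + 4-tap vertical filtering of a single 8x2 block,
+ * fully unrolled (no loop). */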
+static av_always_inline
+void hevc_hv_4t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i out;
+ __m128i src0, src1, src2, src3, src4;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1, filter_vec;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+ __m128i out0_r, out1_r;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+ mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+ mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+ vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_r, dst21_r, dst32_r, dst43_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_l, dst21_l, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+ out0_r, out1_r);
+ out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
+ __lsx_vstelm_d(out, dst, 0, 0);
+ __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
+}
+
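+/* 4-tap hv filtering of width8mult 8-wide strips with a fixed height of
+ * four rows. */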
+static av_always_inline
+void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t width8mult)
+{
+ uint32_t cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += (8 - src_stride_4x);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+ dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
+ dst5, dst32_r, dst43_r, dst54_r, dst65_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
+ dst5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+ dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += 8;
+ }
+}
+
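+/* 4-tap hv filtering of a single 8x6 block, fully unrolled. */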
+static av_always_inline
+void hevc_hv_4t_8x6_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i out0, out1, out2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1, filter_vec;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+ __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+ __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+ __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+ __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src, 0);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+ src += src_stride_4x;
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+ mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+ mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
+ mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
+ mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
+ DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
+ filt0, dst4, dst5, dst6, dst7);
+ dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+ vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
+ vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
+ dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_r, dst21_r, dst32_r, dst43_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_l, dst21_l, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_r, dst65_r, dst76_r, dst87_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_l, dst65_l, dst76_l, dst87_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+ filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+ filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+ dst4_r, dst4_l, dst5_r, dst5_l);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
+ dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
+ DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
+ out4_r, out5_r);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
+ out0, out1);
+ out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(out2, dst, 0, 0);
+ __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+}
+
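+/* General 4-tap hv case: width is a multiple of 8 and height a multiple of 4,
+ * producing four output rows per inner-loop iteration. */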
+static av_always_inline
+void hevc_hv_4t_8multx4mult_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height,
+ int32_t width8mult)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1, filter_vec;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
+ __m128i out0_r, out1_r, out2_r, out3_r;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+ src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+ src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+ filt1, dst5, vec5, filt1, dst6, vec7, filt1,
+ dst3, dst4, dst5, dst6);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
+ dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
+ dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+ dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+ dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+ dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
+ out2_r, out3_r);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
+ 6, out0, out1);
+ __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+ __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+ dst_tmp += dst_stride_4x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dst2 = dst6;
+ }
+ src += 8;
+ dst += 8;
+ }
+}
+
+static
+void hevc_hv_4t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ if (2 == height) {
+ hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
+ } else if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, 1);
+ } else if (6 == height) {
+ hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
+ } else if (0 == (height & 0x03)) {
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 1);
+ }
+}
+
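+/* 4-tap hv filtering for 12-pixel wide blocks: an 8-wide strip is processed
+ * first, then the remaining 4-wide column using the 4-width shuffle masks;
+ * both loops are fixed at 16 rows. */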
+static av_always_inline
+void hevc_hv_4t_12w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ uint32_t loop_cnt;
+ uint8_t *src_tmp, *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
+ __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
+ __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+ __m128i dst76_r, dst98_r, dst87_r, dst109_r;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
+ dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
+ dsth0, dsth1);
+ dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+ src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+ src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dsth3, dsth4, dsth5, dsth6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
+ vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
+ dsth3, dsth4, dsth5, dsth6);
+
+ DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
+ dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
+ DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
+ dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
+ dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+
+ __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+ __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+ dst_tmp += dst_stride_4x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dsth2 = dsth6;
+ }
+
+ src += 8;
+ dst += 8;
+
+ mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
+ mask3 = __lsx_vaddi_bu(mask2, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src += src_stride_3x;
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
+ dst10, dst21);
+
+ dst10_r = __lsx_vilvl_h(dst21, dst10);
+ dst21_r = __lsx_vilvh_h(dst21, dst10);
+ dst22 = __lsx_vreplvei_d(dst21, 1);
+
+ for (loop_cnt = 2; loop_cnt--;) {
+ src3 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
+ src6 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8,
+ src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10,
+ src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst73, dst84, dst95, dst106);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
+ filt1, dst95, vec5, filt1, dst106, vec7, filt1,
+ dst73, dst84, dst95, dst106);
+
+ dst32_r = __lsx_vilvl_h(dst73, dst22);
+ DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
+ dst65_r = __lsx_vilvl_h(dst106, dst95);
+ dst109_r = __lsx_vilvh_h(dst106, dst95);
+ dst22 = __lsx_vreplvei_d(dst73, 1);
+ dst76_r = __lsx_vilvl_h(dst22, dst106);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+ filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
+ filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r,
+ filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1,
+ dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r,
+ filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1,
+ dst4, dst5, dst6, dst7);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4,
+ 6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ __lsx_vstelm_w(out1, dst, 0, 0);
+ __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+
+ dst10_r = dst98_r;
+ dst21_r = dst109_r;
+ dst22 = __lsx_vreplvei_d(dst106, 1);
+ }
+}
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, 2);
+ } else {
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 2);
+ }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 3);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 4);
+}
+
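+/* Generate the ff_hevc_put_hevc_uni_* entry points: the filter is selected
+ * from ff_hevc_qpel/epel_filters by mx or my and the call is forwarded to
+ * the matching common_* kernel. */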
+#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
+ \
+ common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
+ filter, height); \
+}
+
+UNI_MC(qpel, h, 64, 8, hz, mx);
+
+UNI_MC(qpel, v, 24, 8, vt, my);
+UNI_MC(qpel, v, 32, 8, vt, my);
+UNI_MC(qpel, v, 48, 8, vt, my);
+UNI_MC(qpel, v, 64, 8, vt, my);
+
+UNI_MC(epel, v, 24, 4, vt, my);
+UNI_MC(epel, v, 32, 4, vt, my);
+
+#undef UNI_MC
+
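+/* Generate the uni hv entry points, which need both the horizontal (mx) and
+ * vertical (my) filters. */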
+#define UNI_MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ \
+ hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
+ filter_x, filter_y, height); \
+}
+
+UNI_MC_HV(qpel, 8, 8);
+UNI_MC_HV(qpel, 16, 8);
+UNI_MC_HV(qpel, 24, 8);
+UNI_MC_HV(qpel, 32, 8);
+UNI_MC_HV(qpel, 48, 8);
+UNI_MC_HV(qpel, 64, 8);
+
+UNI_MC_HV(epel, 8, 4);
+UNI_MC_HV(epel, 12, 4);
+UNI_MC_HV(epel, 16, 4);
+UNI_MC_HV(epel, 24, 4);
+UNI_MC_HV(epel, 32, 4);
+
+#undef UNI_MC_HV
diff --git a/libavcodec/loongarch/hevc_mc_uniw_lsx.c b/libavcodec/loongarch/hevc_mc_uniw_lsx.c
new file mode 100644
index 0000000000..118f5b820e
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_uniw_lsx.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
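+/* Weighted variant of the 8-tap hv filter: the vertical sums are multiplied
+ * by weight, shifted by rnd_val with rounding, then offset is added before
+ * clipping to 8 bits. */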
+static av_always_inline
+void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val, int32_t width)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask1, mask2, mask3;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+ __m128i dst21_r, dst43_r, dst65_r, dst87_r;
+ __m128i dst21_l, dst43_l, dst65_l, dst87_l;
+ __m128i weight_vec, offset_vec, rnd_vec;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= (src_stride_3x + 3);
+ weight_vec = __lsx_vreplgr2vr_w(weight);
+ offset_vec = __lsx_vreplgr2vr_w(offset);
+ rnd_vec = __lsx_vreplgr2vr_w(rnd_val);
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (cnt = width >> 3; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+ vec12, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+ dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+ dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+ dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+ src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+ src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+ dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+ dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+ dst1, dst10_r, dst32_r, dst54_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+ dst1, dst10_l, dst32_l, dst54_l, dst21_l);
+ DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ src8 = __lsx_vldx(src_tmp, src_stride);
+ src_tmp += src_stride_2x;
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+ filt2, dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ dst76_r = __lsx_vilvl_h(dst7, dst6);
+ dst76_l = __lsx_vilvh_h(dst7, dst6);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+ dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+ dst76_l, filt_h3, dst0_r, dst0_l);
+ DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
+
+ /* row 8 */
+ DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
+ src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
+ dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
+ filt2, dst8, dst8);
+ dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
+
+ dst87_r = __lsx_vilvl_h(dst8, dst7);
+ dst87_l = __lsx_vilvh_h(dst8, dst7);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
+ dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
+ dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
+ dst87_l, filt_h3, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
+
+ DUP2_ARG2(__lsx_vmul_w, dst0_r, weight_vec, dst0_l, weight_vec,
+ dst0_r, dst0_l);
+ DUP2_ARG2(__lsx_vmul_w, dst1_r, weight_vec, dst1_l, weight_vec,
+ dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrar_w, dst0_r, rnd_vec, dst1_r, rnd_vec, dst0_l,
+ rnd_vec, dst1_l, rnd_vec, dst0_r, dst1_r, dst0_l, dst1_l);
+
+ DUP2_ARG2(__lsx_vadd_w, dst0_r, offset_vec, dst0_l, offset_vec,
+ dst0_r, dst0_l);
+ DUP2_ARG2(__lsx_vadd_w, dst1_r, offset_vec, dst1_l, offset_vec,
+ dst1_r, dst1_l);
+ DUP4_ARG1(__lsx_vclip255_w, dst0_r, dst1_r, dst0_l, dst1_l, dst0_r,
+ dst1_r, dst0_l, dst1_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+ dst0_r, dst1_r);
+ dst0_r = __lsx_vpickev_b(dst1_r, dst0_r);
+
+ __lsx_vstelm_d(dst0_r, dst_tmp, 0, 0);
+ __lsx_vstelm_d(dst0_r, dst_tmp + dst_stride, 0, 1);
+ dst_tmp += dst_stride_2x;
+
+ dst10_r = dst32_r;
+ dst32_r = dst54_r;
+ dst54_r = dst76_r;
+ dst10_l = dst32_l;
+ dst32_l = dst54_l;
+ dst54_l = dst76_l;
+ dst21_r = dst43_r;
+ dst43_r = dst65_r;
+ dst65_r = dst87_r;
+ dst21_l = dst43_l;
+ dst43_l = dst65_l;
+ dst65_l = dst87_l;
+ dst6 = dst8;
+ }
+
+ src += 8;
+ dst += 8;
+ }
+}
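
For readers unfamiliar with the LSX weighting sequence used above (__lsx_vsrai_w by 6, __lsx_vmul_w by weight_vec, __lsx_vsrar_w by rnd_vec, __lsx_vadd_w of offset_vec, __lsx_vclip255_w), here is a minimal scalar sketch of the same per-pixel computation. It is not part of the patch; "acc" stands for one 32-bit horizontal+vertical filter accumulator and "rnd" for the rnd_val/shift argument, and the rounding shift is assumed to mirror __lsx_vsrar_w.

#include <stdint.h>

/* Scalar sketch of the weighted uni-prediction output stage that the vector
 * code above applies to each 32-bit accumulator before packing and storing. */
static inline uint8_t hevc_uni_w_pixel(int32_t acc, int32_t weight,
                                       int32_t offset, int32_t rnd)
{
    int32_t val = (acc >> 6) * weight;            /* __lsx_vsrai_w(.., 6), __lsx_vmul_w  */
    val = (val + (1 << (rnd - 1))) >> rnd;        /* __lsx_vsrar_w: rounding shift right */
    val += offset;                                /* __lsx_vadd_w with offset_vec        */
    return val < 0 ? 0 : (val > 255 ? 255 : val); /* __lsx_vclip255_w                    */
}
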
+
+static
+void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 8);
+}
+
+static
+void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 16);
+}
+
+static
+void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 24);
+}
+
+static
+void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 32);
+}
+
+static
+void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 48);
+}
+
+static
+void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 64);
+}
+
+#define UNI_W_MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ int denom, \
+ int weight, \
+ int offset, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ int shift = denom + 14 - 8; \
+ \
+ hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, filter_x,\
+ filter_y, height, weight, offset, shift); \
+}
+
+UNI_W_MC_HV(qpel, 8, 8);
+UNI_W_MC_HV(qpel, 16, 8);
+UNI_W_MC_HV(qpel, 24, 8);
+UNI_W_MC_HV(qpel, 32, 8);
+UNI_W_MC_HV(qpel, 48, 8);
+UNI_W_MC_HV(qpel, 64, 8);
+
+#undef UNI_W_MC_HV
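
For reference, a sketch (not part of the patch) of roughly what the UNI_W_MC_HV(qpel, 8, 8) instantiation above expands to after preprocessing; the other widths differ only in the wrapper they call:

void ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                           uint8_t *src, ptrdiff_t src_stride,
                                           int height, int denom, int weight,
                                           int offset, intptr_t mx, intptr_t my,
                                           int width)
{
    const int8_t *filter_x = ff_hevc_qpel_filters[mx - 1];
    const int8_t *filter_y = ff_hevc_qpel_filters[my - 1];
    int shift = denom + 14 - 8;   /* 8-bit samples: weighting shift is denom + 6 */

    hevc_hv_8t_8w_lsx(src, src_stride, dst, dst_stride, filter_x,
                      filter_y, height, weight, offset, shift);
}
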
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index e05fe780c2..22739c6f5b 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -137,6 +137,36 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lsx;
+
+ c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_lsx;
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_lsx;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_lsx;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_lsx;
+
+ c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_lsx;
+ c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_lsx;
+ c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_lsx;
+ c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_lsx;
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_lsx;
+ c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_lsx;
+
+ c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_lsx;
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_lsx;
+
+ c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_lsx;
+ c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_lsx;
+ c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_lsx;
+ c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
+
+ c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
+ c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
+ c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
+ c->put_hevc_qpel_uni_w[7][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv32_8_lsx;
+ c->put_hevc_qpel_uni_w[8][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv48_8_lsx;
+ c->put_hevc_qpel_uni_w[9][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv64_8_lsx;
+
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
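
As an aid for reading the table indices in the hunk above, a small sketch (not part of the patch) of the block widths implied by the first index of the c->put_hevc_*_uni*[idx][v][h] arrays, taken only from the assignments shown here; the second and third indices are the vertical and horizontal filter flags, which matches the h, v and hv entries above.

/* Widths implied by the first index of the function-pointer tables in the
 * hunk above; indices not assigned there are left out. */
static const int lsx_uni_idx_to_width[10] = {
    [3] = 8, [4] = 12, [5] = 16, [6] = 24, [7] = 32, [8] = 48, [9] = 64,
};
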
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 6a834290ab..0c517af887 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -142,6 +142,65 @@ BI_MC(epel, hv, 32);
#undef BI_MC
+#define UNI_MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+UNI_MC(qpel, h, 64);
+
+UNI_MC(qpel, v, 24);
+UNI_MC(qpel, v, 32);
+UNI_MC(qpel, v, 48);
+UNI_MC(qpel, v, 64);
+
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
+UNI_MC(epel, v, 24);
+UNI_MC(epel, v, 32);
+
+UNI_MC(epel, hv, 8);
+UNI_MC(epel, hv, 12);
+UNI_MC(epel, hv, 16);
+UNI_MC(epel, hv, 24);
+UNI_MC(epel, hv, 32);
+
+#undef UNI_MC
+
+#define UNI_W_MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t \
+ dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t \
+ src_stride, \
+ int height, \
+ int denom, \
+ int weight, \
+ int offset, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+UNI_W_MC(qpel, hv, 8);
+UNI_W_MC(qpel, hv, 16);
+UNI_W_MC(qpel, hv, 24);
+UNI_W_MC(qpel, hv, 32);
+UNI_W_MC(qpel, hv, 48);
+UNI_W_MC(qpel, hv, 64);
+
+#undef UNI_W_MC
+
void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
int32_t beta, int32_t *tc,
uint8_t *p_is_pcm, uint8_t *q_is_pcm);
--
2.20.1
* Re: [FFmpeg-devel] Optimize HEVC decoding for loongarch
2022-02-14 8:25 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
` (4 preceding siblings ...)
2022-02-14 8:25 ` [FFmpeg-devel] [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w " Hao Chen
@ 2022-02-15 3:30 ` 殷时友
5 siblings, 0 replies; 7+ messages in thread
From: 殷时友 @ 2022-02-15 3:30 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> On 14 Feb 2022, at 4:25 PM, Hao Chen <chenhao@loongson.cn> wrote:
>
> ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
> before: 94fps
> after : 191fps
>
> [PATCH v1 1/5] avutil: [loongarch] Update loongson_intrinsics.h to
> [PATCH v1 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
> [PATCH v1 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
> [PATCH v1 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
> [PATCH v1 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
>
LGTM