* [FFmpeg-devel] Optimize HEVC decoding for loongarch
@ 2022-02-17 11:11 Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
` (5 more replies)
0 siblings, 6 replies; 15+ messages in thread
From: Hao Chen @ 2022-02-17 11:11 UTC (permalink / raw)
To: ffmpeg-devel
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 94fps
after : 191fps
V2: Retrigger the patchwork.ffmpeg test.
[PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
[PATCH v2 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
[PATCH v2 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
[PATCH v2 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
[PATCH v2 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
* [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
2022-02-17 11:11 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
@ 2022-02-17 11:11 ` Hao Chen
2022-02-21 11:11 ` Hao Chen
2022-02-21 15:00 ` Michael Niedermayer
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX Hao Chen
` (4 subsequent siblings)
5 siblings, 2 replies; 15+ messages in thread
From: Hao Chen @ 2022-02-17 11:11 UTC (permalink / raw)
To: ffmpeg-devel
The loongson_intrinsics.h file is updated from v1.0.3 to v1.1.0.
Some spelling mistakes in the comments are fixed and new functions are added.
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
1 file changed, 884 insertions(+), 813 deletions(-)
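For reference, a minimal usage sketch of one of the dot-product helpers this
header provides (illustrative only: the function name, buffer layout and
include path are assumptions, and it builds only with a LoongArch toolchain
that enables LSX):

    #include "libavutil/loongarch/loongson_intrinsics.h"

    #ifdef __loongarch_sx
    /* acc[i] += coef[2i]*src[2i] + coef[2i+1]*src[2i+1] for 8 halfword lanes */
    static void dotadd_row_lsx(int16_t *acc, int8_t *coef, int8_t *src)
    {
        __m128i vacc  = __lsx_vld(acc,  0);  /* 8 x int16 accumulators */
        __m128i vcoef = __lsx_vld(coef, 0);  /* 16 x int8 filter taps  */
        __m128i vsrc  = __lsx_vld(src,  0);  /* 16 x int8 samples      */

        vacc = __lsx_vdp2add_h_b(vacc, vcoef, vsrc);
        __lsx_vst(vacc, acc, 0);
    }
    #endif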
diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
index 6e0439f829..eb256863c8 100644
--- a/libavutil/loongarch/loongson_intrinsics.h
+++ b/libavutil/loongarch/loongson_intrinsics.h
@@ -33,7 +33,7 @@
* Xiwei Gu <guxiwei-hf@loongson.cn>
* Lu Wang <wanglu@loongson.cn>
*
- * This file is a header file for loongarch builtin extention.
+ * This file is a header file for loongarch builtin extension.
*
*/
@@ -42,50 +42,50 @@
/**
* MAJOR version: Macro usage changes.
- * MINOR version: Add new functions, or bug fix.
+ * MINOR version: Add new functions, or bug fixes.
* MICRO version: Comment changes or implementation changes.
*/
#define LSOM_VERSION_MAJOR 1
-#define LSOM_VERSION_MINOR 0
-#define LSOM_VERSION_MICRO 3
+#define LSOM_VERSION_MINOR 1
+#define LSOM_VERSION_MICRO 0
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
-{ \
- _OUT0 = _INS(_IN0); \
- _OUT1 = _INS(_IN1); \
-}
+ { \
+ _OUT0 = _INS(_IN0); \
+ _OUT1 = _INS(_IN1); \
+ }
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
-{ \
- _OUT0 = _INS(_IN0, _IN1); \
- _OUT1 = _INS(_IN2, _IN3); \
-}
+ { \
+ _OUT0 = _INS(_IN0, _IN1); \
+ _OUT1 = _INS(_IN2, _IN3); \
+ }
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
-{ \
- _OUT0 = _INS(_IN0, _IN1, _IN2); \
- _OUT1 = _INS(_IN3, _IN4, _IN5); \
-}
+ { \
+ _OUT0 = _INS(_IN0, _IN1, _IN2); \
+ _OUT1 = _INS(_IN3, _IN4, _IN5); \
+ }
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
- DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
- DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
-}
-
-#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
- _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
- DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
- DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
-}
-
-#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
- _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
-{ \
- DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
- DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
-}
+ { \
+ DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
+ DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+ _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
+ DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
+ }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+ _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
+ { \
+ DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
+ DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
+ }
#ifdef __loongarch_sx
#include <lsxintrin.h>
@@ -94,11 +94,11 @@
* Description : Dot product & addition of byte vector elements
* Arguments : Inputs - in_c, in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from in_h are multiplied by
* signed byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
- * Then the results plus to signed half word elements from in_c.
+ * Then the results plus to signed half-word elements from in_c.
* Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
* in_c : 1,2,3,4, 1,2,3,4
* in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
@@ -106,13 +106,13 @@
* out : 23,40,41,26, 23,40,41,26
* =============================================================================
*/
-static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
- out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
- out = __lsx_vmaddwod_h_b(out, in_h, in_l);
- return out;
+ out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
}
/*
@@ -120,35 +120,61 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l
* Description : Dot product & addition of byte vector elements
* Arguments : Inputs - in_c, in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Unsigned byte elements from in_h are multiplied by
* unsigned byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
- * The results plus to signed half word elements from in_c.
- * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ * The results plus to signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
* in_c : 1,2,3,4, 1,2,3,4
* in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
* in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
* out : 23,40,41,26, 23,40,41,26
* =============================================================================
*/
-static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
+
+ out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_c, in_h, in_l
+ * Outputs - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied by
+ * signed byte elements from in_l, and then added adjacent to
+ * each other to get results with the twice size of input.
+ * The results plus to signed half-word elements from in_c.
+ * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ * in_c : 1,1,1,1, 1,1,1,1
+ * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ * out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
- out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
- out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
- return out;
+ out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
}
/*
* =============================================================================
- * Description : Dot product & addition of half word vector elements
+ * Description : Dot product & addition of half-word vector elements
* Arguments : Inputs - in_c, in_h, in_l
* Outputs - out
- * Retrun Type - __m128i
- * Details : Signed half word elements from in_h are multiplied by
- * signed half word elements from in_l, and then added adjacent to
+ * Return Type - __m128i
+ * Details : Signed half-word elements from in_h are multiplied by
+ * signed half-word elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
* Then the results plus to signed word elements from in_c.
* Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
@@ -158,13 +184,13 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_
* out : 23,40,41,26
* =============================================================================
*/
-static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
+ __m128i in_l) {
+ __m128i out;
- out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
- out = __lsx_vmaddwod_w_h(out, in_h, in_l);
- return out;
+ out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
}
/*
@@ -172,7 +198,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
* Description : Dot product of byte vector elements
* Arguments : Inputs - in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from in_h are multiplied by
* signed byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
@@ -182,13 +208,12 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
* out : 22,38,38,22, 22,38,38,22
* =============================================================================
*/
-static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
- out = __lsx_vmulwev_h_b(in_h, in_l);
- out = __lsx_vmaddwod_h_b(out, in_h, in_l);
- return out;
+ out = __lsx_vmulwev_h_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+ return out;
}
/*
@@ -196,7 +221,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
* Description : Dot product of byte vector elements
* Arguments : Inputs - in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Unsigned byte elements from in_h are multiplied by
* unsigned byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
@@ -206,13 +231,12 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
* out : 22,38,38,22, 22,38,38,22
* =============================================================================
*/
-static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+ __m128i out;
- out = __lsx_vmulwev_h_bu(in_h, in_l);
- out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
- return out;
+ out = __lsx_vmulwev_h_bu(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+ return out;
}
/*
@@ -220,7 +244,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
* Description : Dot product of byte vector elements
* Arguments : Inputs - in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Unsigned byte elements from in_h are multiplied by
* signed byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
@@ -230,13 +254,12 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
* out : 22,38,38,22, 22,38,38,6
* =============================================================================
*/
-static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+ __m128i out;
- out = __lsx_vmulwev_h_bu_b(in_h, in_l);
- out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
- return out;
+ out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+ out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
}
/*
@@ -244,7 +267,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
* Description : Dot product of byte vector elements
* Arguments : Inputs - in_h, in_l
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from in_h are multiplied by
* signed byte elements from in_l, and then added adjacent to
* each other to get results with the twice size of input.
@@ -254,19 +277,19 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
* out : 22,38,38,22
* =============================================================================
*/
-static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
-{
- __m128i out;
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+ __m128i out;
- out = __lsx_vmulwev_w_h(in_h, in_l);
- out = __lsx_vmaddwod_w_h(out, in_h, in_l);
- return out;
+ out = __lsx_vmulwev_w_h(in_h, in_l);
+ out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+ return out;
}
/*
* =============================================================================
* Description : Clip all halfword elements of input vector between min & max
- * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
+ * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
+ * (_in))
* Arguments : Inputs - _in (input vector)
* - min (min threshold)
* - max (max threshold)
@@ -279,13 +302,12 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
* out : 1,2,9,9, 1,9,9,9
* =============================================================================
*/
-static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
-{
- __m128i out;
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+ __m128i out;
- out = __lsx_vmax_h(min, _in);
- out = __lsx_vmin_h(max, out);
- return out;
+ out = __lsx_vmax_h(min, _in);
+ out = __lsx_vmin_h(max, out);
+ return out;
}
/*
@@ -293,20 +315,19 @@ static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
* Description : Set each element of vector between 0 and 255
* Arguments : Inputs - _in
* Outputs - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from _in are clamped between 0 and 255.
* Example : out = __lsx_vclip255_h(_in)
* _in : -8,255,280,249, -8,255,280,249
* out : 0,255,255,249, 0,255,255,249
* =============================================================================
*/
-static inline __m128i __lsx_vclip255_h(__m128i _in)
-{
- __m128i out;
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+ __m128i out;
- out = __lsx_vmaxi_h(_in, 0);
- out = __lsx_vsat_hu(out, 7);
- return out;
+ out = __lsx_vmaxi_h(_in, 0);
+ out = __lsx_vsat_hu(out, 7);
+ return out;
}
/*
@@ -314,20 +335,19 @@ static inline __m128i __lsx_vclip255_h(__m128i _in)
* Description : Set each element of vector between 0 and 255
* Arguments : Inputs - _in
* Outputs - out
- * Retrun Type - word
+ * Return Type - word
* Details : Signed byte elements from _in are clamped between 0 and 255.
* Example : out = __lsx_vclip255_w(_in)
* _in : -8,255,280,249
* out : 0,255,255,249
* =============================================================================
*/
-static inline __m128i __lsx_vclip255_w(__m128i _in)
-{
- __m128i out;
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+ __m128i out;
- out = __lsx_vmaxi_w(_in, 0);
- out = __lsx_vsat_wu(out, 7);
- return out;
+ out = __lsx_vmaxi_w(_in, 0);
+ out = __lsx_vsat_wu(out, 7);
+ return out;
}
/*
@@ -343,12 +363,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* _in1(out) : 1,2,3,4
* =============================================================================
*/
-#define LSX_SWAP(_in0, _in1) \
-{ \
- _in0 = __lsx_vxor_v(_in0, _in1); \
- _in1 = __lsx_vxor_v(_in0, _in1); \
- _in0 = __lsx_vxor_v(_in0, _in1); \
-} \
+#define LSX_SWAP(_in0, _in1) \
+ { \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ _in1 = __lsx_vxor_v(_in0, _in1); \
+ _in0 = __lsx_vxor_v(_in0, _in1); \
+ }
/*
* =============================================================================
@@ -364,25 +384,27 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* =============================================================================
*/
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
__m128i _t0, _t1, _t2, _t3; \
\
- _t0 = __lsx_vilvl_w(_in1, _in0); \
- _t1 = __lsx_vilvh_w(_in1, _in0); \
- _t2 = __lsx_vilvl_w(_in3, _in2); \
- _t3 = __lsx_vilvh_w(_in3, _in2); \
+ _t0 = __lsx_vilvl_w(_in1, _in0); \
+ _t1 = __lsx_vilvh_w(_in1, _in0); \
+ _t2 = __lsx_vilvl_w(_in3, _in2); \
+ _t3 = __lsx_vilvh_w(_in3, _in2); \
_out0 = __lsx_vilvl_d(_t2, _t0); \
_out1 = __lsx_vilvh_d(_t2, _t0); \
_out2 = __lsx_vilvl_d(_t3, _t1); \
_out3 = __lsx_vilvh_d(_t3, _t1); \
-}
+ }
/*
* =============================================================================
* Description : Transpose 8x8 block with byte elements in vectors
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Details : The rows of the matrix become columns, and the columns
+ * become rows.
* Example : LSX_TRANSPOSE8x8_B
* _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
* _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
@@ -403,34 +425,35 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
* =============================================================================
*/
-#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- __m128i zero = {0}; \
- __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
- __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
- \
- _t0 = __lsx_vilvl_b(_in2, _in0); \
- _t1 = __lsx_vilvl_b(_in3, _in1); \
- _t2 = __lsx_vilvl_b(_in6, _in4); \
- _t3 = __lsx_vilvl_b(_in7, _in5); \
- _t4 = __lsx_vilvl_b(_t1, _t0); \
- _t5 = __lsx_vilvh_b(_t1, _t0); \
- _t6 = __lsx_vilvl_b(_t3, _t2); \
- _t7 = __lsx_vilvh_b(_t3, _t2); \
- _out0 = __lsx_vilvl_w(_t6, _t4); \
- _out2 = __lsx_vilvh_w(_t6, _t4); \
- _out4 = __lsx_vilvl_w(_t7, _t5); \
- _out6 = __lsx_vilvh_w(_t7, _t5); \
- _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
- _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
- _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
- _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
-}
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i zero = { 0 }; \
+ __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _t0 = __lsx_vilvl_b(_in2, _in0); \
+ _t1 = __lsx_vilvl_b(_in3, _in1); \
+ _t2 = __lsx_vilvl_b(_in6, _in4); \
+ _t3 = __lsx_vilvl_b(_in7, _in5); \
+ _t4 = __lsx_vilvl_b(_t1, _t0); \
+ _t5 = __lsx_vilvh_b(_t1, _t0); \
+ _t6 = __lsx_vilvl_b(_t3, _t2); \
+ _t7 = __lsx_vilvh_b(_t3, _t2); \
+ _out0 = __lsx_vilvl_w(_t6, _t4); \
+ _out2 = __lsx_vilvh_w(_t6, _t4); \
+ _out4 = __lsx_vilvl_w(_t7, _t5); \
+ _out6 = __lsx_vilvh_w(_t7, _t5); \
+ _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
+ _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
+ _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
+ _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
+ }
/*
* =============================================================================
- * Description : Transpose 8x8 block with half word elements in vectors
+ * Description : Transpose 8x8 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
* Details :
@@ -445,37 +468,38 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77
* =============================================================================
*/
-#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
- \
- _s0 = __lsx_vilvl_h(_in6, _in4); \
- _s1 = __lsx_vilvl_h(_in7, _in5); \
- _t0 = __lsx_vilvl_h(_s1, _s0); \
- _t1 = __lsx_vilvh_h(_s1, _s0); \
- _s0 = __lsx_vilvh_h(_in6, _in4); \
- _s1 = __lsx_vilvh_h(_in7, _in5); \
- _t2 = __lsx_vilvl_h(_s1, _s0); \
- _t3 = __lsx_vilvh_h(_s1, _s0); \
- _s0 = __lsx_vilvl_h(_in2, _in0); \
- _s1 = __lsx_vilvl_h(_in3, _in1); \
- _t4 = __lsx_vilvl_h(_s1, _s0); \
- _t5 = __lsx_vilvh_h(_s1, _s0); \
- _s0 = __lsx_vilvh_h(_in2, _in0); \
- _s1 = __lsx_vilvh_h(_in3, _in1); \
- _t6 = __lsx_vilvl_h(_s1, _s0); \
- _t7 = __lsx_vilvh_h(_s1, _s0); \
- \
- _out0 = __lsx_vpickev_d(_t0, _t4); \
- _out2 = __lsx_vpickev_d(_t1, _t5); \
- _out4 = __lsx_vpickev_d(_t2, _t6); \
- _out6 = __lsx_vpickev_d(_t3, _t7); \
- _out1 = __lsx_vpickod_d(_t0, _t4); \
- _out3 = __lsx_vpickod_d(_t1, _t5); \
- _out5 = __lsx_vpickod_d(_t2, _t6); \
- _out7 = __lsx_vpickod_d(_t3, _t7); \
-}
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _s0 = __lsx_vilvl_h(_in6, _in4); \
+ _s1 = __lsx_vilvl_h(_in7, _in5); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in6, _in4); \
+ _s1 = __lsx_vilvh_h(_in7, _in5); \
+ _t2 = __lsx_vilvl_h(_s1, _s0); \
+ _t3 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvl_h(_in2, _in0); \
+ _s1 = __lsx_vilvl_h(_in3, _in1); \
+ _t4 = __lsx_vilvl_h(_s1, _s0); \
+ _t5 = __lsx_vilvh_h(_s1, _s0); \
+ _s0 = __lsx_vilvh_h(_in2, _in0); \
+ _s1 = __lsx_vilvh_h(_in3, _in1); \
+ _t6 = __lsx_vilvl_h(_s1, _s0); \
+ _t7 = __lsx_vilvh_h(_s1, _s0); \
+ \
+ _out0 = __lsx_vpickev_d(_t0, _t4); \
+ _out2 = __lsx_vpickev_d(_t1, _t5); \
+ _out4 = __lsx_vpickev_d(_t2, _t6); \
+ _out6 = __lsx_vpickev_d(_t3, _t7); \
+ _out1 = __lsx_vpickod_d(_t0, _t4); \
+ _out3 = __lsx_vpickod_d(_t1, _t5); \
+ _out5 = __lsx_vpickod_d(_t2, _t6); \
+ _out7 = __lsx_vpickod_d(_t3, _t7); \
+ }
/*
* =============================================================================
@@ -483,7 +507,8 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block)
* Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
* Return Type - as per RTYPE
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : LSX_TRANSPOSE8x4_B
* _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
* _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
@@ -500,26 +525,26 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
* =============================================================================
*/
-#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3) \
-{ \
- __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- \
- _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
- _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
- _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
- _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
- _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
- \
- _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
- _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
- _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
- \
- _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
- _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
- _out1 = __lsx_vilvh_d(_out2, _out0); \
- _out3 = __lsx_vilvh_d(_out0, _out2); \
-}
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ \
+ _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
+ _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
+ _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
+ _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
+ \
+ _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
+ _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
+ \
+ _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
+ _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
+ _out1 = __lsx_vilvh_d(_out2, _out0); \
+ _out3 = __lsx_vilvh_d(_out0, _out2); \
+ }
/*
* =============================================================================
@@ -547,29 +572,30 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* 120,121,122,123,124,125,126,127
* =============================================================================
*/
-#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8, \
- _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
- _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
- __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
- DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
- _tmp0, _tmp1, _tmp2, _tmp3); \
- DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
- _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
- DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
- DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
- DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
- DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
- DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
- DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
- DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
- DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
- DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
- DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
- DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
- DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
-}
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
+ __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+ _tmp0, _tmp1, _tmp2, _tmp3); \
+ DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
+ _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
+ DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
+ DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
+ DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
+ DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
+ DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
+ DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
+ DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
+ DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
+ }
/*
* =============================================================================
@@ -585,33 +611,33 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* =============================================================================
*/
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
_out0 = __lsx_vadd_b(_in0, _in3); \
_out1 = __lsx_vadd_b(_in1, _in2); \
_out2 = __lsx_vsub_b(_in1, _in2); \
_out3 = __lsx_vsub_b(_in0, _in3); \
-}
+ }
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
_out0 = __lsx_vadd_h(_in0, _in3); \
_out1 = __lsx_vadd_h(_in1, _in2); \
_out2 = __lsx_vsub_h(_in1, _in2); \
_out3 = __lsx_vsub_h(_in0, _in3); \
-}
+ }
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
_out0 = __lsx_vadd_w(_in0, _in3); \
_out1 = __lsx_vadd_w(_in1, _in2); \
_out2 = __lsx_vsub_w(_in1, _in2); \
_out3 = __lsx_vsub_w(_in0, _in3); \
-}
+ }
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
+ { \
_out0 = __lsx_vadd_d(_in0, _in3); \
_out1 = __lsx_vadd_d(_in1, _in2); \
_out2 = __lsx_vsub_d(_in1, _in2); \
_out3 = __lsx_vsub_d(_in0, _in3); \
-}
+ }
/*
* =============================================================================
@@ -630,59 +656,63 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* _out7 = _in0 - _in7;
* =============================================================================
*/
-#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lsx_vadd_b(_in0, _in7); \
- _out1 = __lsx_vadd_b(_in1, _in6); \
- _out2 = __lsx_vadd_b(_in2, _in5); \
- _out3 = __lsx_vadd_b(_in3, _in4); \
- _out4 = __lsx_vsub_b(_in3, _in4); \
- _out5 = __lsx_vsub_b(_in2, _in5); \
- _out6 = __lsx_vsub_b(_in1, _in6); \
- _out7 = __lsx_vsub_b(_in0, _in7); \
-}
-
-#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lsx_vadd_h(_in0, _in7); \
- _out1 = __lsx_vadd_h(_in1, _in6); \
- _out2 = __lsx_vadd_h(_in2, _in5); \
- _out3 = __lsx_vadd_h(_in3, _in4); \
- _out4 = __lsx_vsub_h(_in3, _in4); \
- _out5 = __lsx_vsub_h(_in2, _in5); \
- _out6 = __lsx_vsub_h(_in1, _in6); \
- _out7 = __lsx_vsub_h(_in0, _in7); \
-}
-
-#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lsx_vadd_w(_in0, _in7); \
- _out1 = __lsx_vadd_w(_in1, _in6); \
- _out2 = __lsx_vadd_w(_in2, _in5); \
- _out3 = __lsx_vadd_w(_in3, _in4); \
- _out4 = __lsx_vsub_w(_in3, _in4); \
- _out5 = __lsx_vsub_w(_in2, _in5); \
- _out6 = __lsx_vsub_w(_in1, _in6); \
- _out7 = __lsx_vsub_w(_in0, _in7); \
-}
-
-#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lsx_vadd_d(_in0, _in7); \
- _out1 = __lsx_vadd_d(_in1, _in6); \
- _out2 = __lsx_vadd_d(_in2, _in5); \
- _out3 = __lsx_vadd_d(_in3, _in4); \
- _out4 = __lsx_vsub_d(_in3, _in4); \
- _out5 = __lsx_vsub_d(_in2, _in5); \
- _out6 = __lsx_vsub_d(_in1, _in6); \
- _out7 = __lsx_vsub_d(_in0, _in7); \
-}
-
-#endif //LSX
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_b(_in0, _in7); \
+ _out1 = __lsx_vadd_b(_in1, _in6); \
+ _out2 = __lsx_vadd_b(_in2, _in5); \
+ _out3 = __lsx_vadd_b(_in3, _in4); \
+ _out4 = __lsx_vsub_b(_in3, _in4); \
+ _out5 = __lsx_vsub_b(_in2, _in5); \
+ _out6 = __lsx_vsub_b(_in1, _in6); \
+ _out7 = __lsx_vsub_b(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_h(_in0, _in7); \
+ _out1 = __lsx_vadd_h(_in1, _in6); \
+ _out2 = __lsx_vadd_h(_in2, _in5); \
+ _out3 = __lsx_vadd_h(_in3, _in4); \
+ _out4 = __lsx_vsub_h(_in3, _in4); \
+ _out5 = __lsx_vsub_h(_in2, _in5); \
+ _out6 = __lsx_vsub_h(_in1, _in6); \
+ _out7 = __lsx_vsub_h(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_w(_in0, _in7); \
+ _out1 = __lsx_vadd_w(_in1, _in6); \
+ _out2 = __lsx_vadd_w(_in2, _in5); \
+ _out3 = __lsx_vadd_w(_in3, _in4); \
+ _out4 = __lsx_vsub_w(_in3, _in4); \
+ _out5 = __lsx_vsub_w(_in2, _in5); \
+ _out6 = __lsx_vsub_w(_in1, _in6); \
+ _out7 = __lsx_vsub_w(_in0, _in7); \
+ }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lsx_vadd_d(_in0, _in7); \
+ _out1 = __lsx_vadd_d(_in1, _in6); \
+ _out2 = __lsx_vadd_d(_in2, _in5); \
+ _out3 = __lsx_vadd_d(_in3, _in4); \
+ _out4 = __lsx_vsub_d(_in3, _in4); \
+ _out5 = __lsx_vsub_d(_in2, _in5); \
+ _out6 = __lsx_vsub_d(_in1, _in6); \
+ _out7 = __lsx_vsub_d(_in0, _in7); \
+ }
+
+#endif // LSX
#ifdef __loongarch_asx
#include <lasxintrin.h>
@@ -700,13 +730,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_h_bu(in_h, in_l);
- out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
}
/*
@@ -718,18 +747,17 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
* Details : Signed byte elements from in_h are multiplied with
* signed byte elements from in_l producing a result
* twice the size of input i.e. signed halfword.
- * Then this iniplication results of adjacent odd-even elements
+ * Then this multiplication results of adjacent odd-even elements
* are added to the out vector
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_h_b(in_h, in_l);
- out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_h_b(in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
}
/*
@@ -749,13 +777,12 @@ static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
* out : 22,38,38,22, 22,38,38,22
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_w_h(in_h, in_l);
- out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
}
/*
@@ -763,22 +790,21 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
* Description : Dot product of word vector elements
* Arguments : Inputs - in_h, in_l
* Output - out
- * Retrun Type - signed double
+ * Return Type - signed double
* Details : Signed word elements from in_h are multiplied with
* signed word elements from in_l producing a result
- * twice the size of input i.e. signed double word.
+ * twice the size of input i.e. signed double-word.
* Then this multiplied results of adjacent odd-even elements
* are added to the out vector.
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_d_w(in_h, in_l);
- out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_d_w(in_h, in_l);
+ out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+ return out;
}
/*
@@ -795,13 +821,12 @@ static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
- out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
- return out;
+ out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
}
/*
@@ -809,7 +834,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
* Description : Dot product & addition of byte vector elements
* Arguments : Inputs - in_h, in_l
* Output - out
- * Retrun Type - halfword
+ * Return Type - halfword
* Details : Signed byte elements from in_h are multiplied with
* signed byte elements from in_l producing a result
* twice the size of input i.e. signed halfword.
@@ -818,13 +843,59 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * unsigned byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then this multiplied results of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments : Inputs - in_h, in_l
+ * Output - out
+ * Return Type - halfword
+ * Details : Unsigned byte elements from in_h are multiplied with
+ * signed byte elements from in_l producing a result
+ * twice the size of input i.e. signed halfword.
+ * Then this multiplied results of adjacent odd-even elements
+ * are added to the in_c vector.
+ * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
- out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
- return out;
+ out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+ return out;
}
/*
@@ -845,13 +916,13 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_
* out : 23,40,41,26, 23,40,41,26
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
- out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
- return out;
+ out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ return out;
}
/*
@@ -868,13 +939,13 @@ static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
- out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
- return out;
+ out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+ return out;
}
/*
@@ -891,13 +962,13 @@ static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i i
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
- out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
- out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
- return out;
+ out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+ out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+ return out;
}
/*
@@ -915,14 +986,14 @@ static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i
* Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
-
- out = __lasx_xvmulwev_h_bu(in_h, in_l);
- out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
- out = __lasx_xvsub_h(in_c, out);
- return out;
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_h_bu(in_h, in_l);
+ out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+ out = __lasx_xvsub_h(in_c, out);
+ return out;
}
/*
@@ -944,14 +1015,14 @@ static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i i
* out : -7,-3,0,0, 0,-1,0,-1
* =============================================================================
*/
-static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i out;
-
- out = __lasx_xvmulwev_w_h(in_h, in_l);
- out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
- out = __lasx_xvsub_w(in_c, out);
- return out;
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvsub_w(in_c, out);
+ return out;
}
/*
@@ -960,10 +1031,10 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
* Arguments : Inputs - in_h, in_l
* Output - out
* Return Type - signed word
- * Details : Signed halfword elements from in_h are iniplied with
+ * Details : Signed halfword elements from in_h are multiplied with
* signed halfword elements from in_l producing a result
* four times the size of input i.e. signed doubleword.
- * Then this iniplication results of four adjacent elements
+ * Then this multiplication results of four adjacent elements
* are added together and stored to the out vector.
* Example : out = __lasx_xvdp4_d_h(in_h, in_l)
* in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
@@ -971,14 +1042,13 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
* out : -2,0,1,1
* =============================================================================
*/
-static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
-
- out = __lasx_xvmulwev_w_h(in_h, in_l);
- out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
- out = __lasx_xvhaddw_d_w(out, out);
- return out;
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
+
+ out = __lasx_xvmulwev_w_h(in_h, in_l);
+ out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+ out = __lasx_xvhaddw_d_w(out, out);
+ return out;
}
/*
@@ -993,13 +1063,12 @@ static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvh_b(in_h, in_l);
- out = __lasx_xvhaddw_h_b(out, out);
- return out;
+ out = __lasx_xvilvh_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
}
/*
@@ -1017,13 +1086,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
* out : 1,0,0,-1, 1,0,0, 2
* =============================================================================
*/
- static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvh_h(in_h, in_l);
- out = __lasx_xvhaddw_w_h(out, out);
- return out;
+ out = __lasx_xvilvh_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
}
/*
@@ -1038,13 +1106,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvl_b(in_h, in_l);
- out = __lasx_xvhaddw_h_b(out, out);
- return out;
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_h_b(out, out);
+ return out;
}
/*
@@ -1062,13 +1129,12 @@ static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
* out : 5,-1,4,2, 1,0,2,-1
* =============================================================================
*/
-static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvl_h(in_h, in_l);
- out = __lasx_xvhaddw_w_h(out, out);
- return out;
+ out = __lasx_xvilvl_h(in_h, in_l);
+ out = __lasx_xvhaddw_w_h(out, out);
+ return out;
}
/*
@@ -1083,13 +1149,12 @@ static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvilvl_b(in_h, in_l);
- out = __lasx_xvhaddw_hu_bu(out, out);
- return out;
+ out = __lasx_xvilvl_b(in_h, in_l);
+ out = __lasx_xvhaddw_hu_bu(out, out);
+ return out;
}
/*
@@ -1103,13 +1168,12 @@ static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
* Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvsllwil_hu_bu(in_l, 0);
- out = __lasx_xvadd_h(in_h, out);
- return out;
+ out = __lasx_xvsllwil_hu_bu(in_l, 0);
+ out = __lasx_xvadd_h(in_h, out);
+ return out;
}
/*
@@ -1126,13 +1190,12 @@ static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
* out : 2, 0,1,2, -1,0,1,1,
* =============================================================================
*/
-static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i out;
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+ __m256i out;
- out = __lasx_xvsllwil_w_h(in_l, 0);
- out = __lasx_xvadd_w(in_h, out);
- return out;
+ out = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvadd_w(in_h, out);
+ return out;
}
/*
@@ -1153,15 +1216,15 @@ static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
* out : 201, 602,1203,2004, -995, -1794,-2793,-3992
* =============================================================================
*/
-static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i tmp0, tmp1, out;
-
- tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
- tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
- tmp0 = __lasx_xvmul_w(tmp0, tmp1);
- out = __lasx_xvadd_w(tmp0, in_c);
- return out;
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
}
/*
@@ -1177,15 +1240,15 @@ static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in
* Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
* =============================================================================
*/
-static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
-{
- __m256i tmp0, tmp1, out;
-
- tmp0 = __lasx_xvilvh_h(in_h, in_h);
- tmp1 = __lasx_xvilvh_h(in_l, in_l);
- tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
- out = __lasx_xvadd_w(tmp0, in_c);
- return out;
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
+ __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ out = __lasx_xvadd_w(tmp0, in_c);
+ return out;
}
/*
@@ -1203,14 +1266,13 @@ static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in
* out : 6,1,3,0, 0,0,1,0
* =============================================================================
*/
-static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i tmp0, tmp1, out;
-
- tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
- tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
- out = __lasx_xvmul_w(tmp0, tmp1);
- return out;
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+ tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+ out = __lasx_xvmul_w(tmp0, tmp1);
+ return out;
}
/*
@@ -1228,39 +1290,39 @@ static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
* out : 0,0,0,0, 0,0,0,1
* =============================================================================
*/
-static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
-{
- __m256i tmp0, tmp1, out;
-
- tmp0 = __lasx_xvilvh_h(in_h, in_h);
- tmp1 = __lasx_xvilvh_h(in_l, in_l);
- out = __lasx_xvmulwev_w_h(tmp0, tmp1);
- return out;
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+ __m256i tmp0, tmp1, out;
+
+ tmp0 = __lasx_xvilvh_h(in_h, in_h);
+ tmp1 = __lasx_xvilvh_h(in_l, in_l);
+ out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+ return out;
}
/*
* =============================================================================
- * Description : The low half of the vector elements are expanded and
- * added saturately after being doubled.
+ * Description : The low half of the vector elements are added to the high half
+ * after being doubled, then saturated.
* Arguments : Inputs - in_h, in_l
* Output - out
- * Details : The in_h vector adds the in_l vector saturately after the lower
- * half of the two-fold zero extension (unsigned byte to unsigned
- * halfword) and the results are stored to the out vector.
+ * Details : The in_h vector adds the in_l vector after the lower half of
+ * the two-fold zero extension (unsigned byte to unsigned
+ * halfword) and then saturated. The results are stored to the out
+ * vector.
* Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
* in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
- * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
- * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ * 0,0,0,1
+ * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
* =============================================================================
*/
-static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
-{
- __m256i tmp1, out;
- __m256i zero = {0};
-
- tmp1 = __lasx_xvilvl_b(zero, in_l);
- out = __lasx_xvsadd_hu(in_h, tmp1);
- return out;
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+ __m256i tmp1, out;
+ __m256i zero = { 0 };
+
+ tmp1 = __lasx_xvilvl_b(zero, in_l);
+ out = __lasx_xvsadd_hu(in_h, tmp1);
+ return out;
}
/*
@@ -1279,13 +1341,12 @@ static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
* out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
* =============================================================================
*/
-static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
-{
- __m256i out;
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+ __m256i out;
- out = __lasx_xvmax_h(min, in);
- out = __lasx_xvmin_h(max, out);
- return out;
+ out = __lasx_xvmax_h(min, in);
+ out = __lasx_xvmin_h(max, out);
+ return out;
}
/*
@@ -1298,13 +1359,12 @@ static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
* Example : See out = __lasx_xvclip255_w(in)
* =============================================================================
*/
-static inline __m256i __lasx_xvclip255_h(__m256i in)
-{
- __m256i out;
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+ __m256i out;
- out = __lasx_xvmaxi_h(in, 0);
- out = __lasx_xvsat_hu(out, 7);
- return out;
+ out = __lasx_xvmaxi_h(in, 0);
+ out = __lasx_xvsat_hu(out, 7);
+ return out;
}
/*
@@ -1319,20 +1379,19 @@ static inline __m256i __lasx_xvclip255_h(__m256i in)
* out : 0,255,255,249, 0,255,255,249
* =============================================================================
*/
-static inline __m256i __lasx_xvclip255_w(__m256i in)
-{
- __m256i out;
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+ __m256i out;
- out = __lasx_xvmaxi_w(in, 0);
- out = __lasx_xvsat_wu(out, 7);
- return out;
+ out = __lasx_xvmaxi_w(in, 0);
+ out = __lasx_xvsat_wu(out, 7);
+ return out;
}
/*
* =============================================================================
* Description : Indexed halfword element values are replicated to all
- * elements in output vector. If 'indx < 8' use xvsplati_l_*,
- * if 'indx >= 8' use xvsplati_h_*.
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
* Arguments : Inputs - in, idx
* Output - out
* Details : Idx element value from in vector is replicated to all
@@ -1344,20 +1403,19 @@ static inline __m256i __lasx_xvclip255_w(__m256i in)
* out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
* =============================================================================
*/
-static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
-{
- __m256i out;
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+ __m256i out;
- out = __lasx_xvpermi_q(in, in, 0x02);
- out = __lasx_xvreplve_h(out, idx);
- return out;
+ out = __lasx_xvpermi_q(in, in, 0x02);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
}
/*
* =============================================================================
* Description : Indexed halfword element values are replicated to all
- * elements in output vector. If 'indx < 8' use xvsplati_l_*,
- * if 'indx >= 8' use xvsplati_h_*.
+ * elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ * if 'idx >= 8' use xvsplati_h_*.
* Arguments : Inputs - in, idx
* Output - out
* Details : Idx element value from in vector is replicated to all
@@ -1369,18 +1427,17 @@ static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
* out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
* =============================================================================
*/
-static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
-{
- __m256i out;
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+ __m256i out;
- out = __lasx_xvpermi_q(in, in, 0x13);
- out = __lasx_xvreplve_h(out, idx);
- return out;
+ out = __lasx_xvpermi_q(in, in, 0x13);
+ out = __lasx_xvreplve_h(out, idx);
+ return out;
}
/*
* =============================================================================
- * Description : Transpose 4x4 block with double word elements in vectors
+ * Description : Transpose 4x4 block with double-word elements in vectors
* Arguments : Inputs - _in0, _in1, _in2, _in3
* Outputs - _out0, _out1, _out2, _out3
* Example : LASX_TRANSPOSE4x4_D
@@ -1395,24 +1452,26 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out3 : 4,4,4,4
* =============================================================================
*/
-#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
- _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
- _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
- _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
- _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
- _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
- _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
- _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
- _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
-}
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
+ _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
+ _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
+ _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
+ _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
+ _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
+ }
/*
* =============================================================================
* Description : Transpose 8x8 block with word elements in vectors
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
* Example : LASX_TRANSPOSE8x8_W
* _in0 : 1,2,3,4,5,6,7,8
* _in1 : 2,2,3,4,5,6,7,8
@@ -1433,38 +1492,39 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out7 : 8,8,8,8,8,8,8,8
* =============================================================================
*/
-#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m256i _s0_m, _s1_m; \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- \
- _s0_m = __lasx_xvilvl_w(_in2, _in0); \
- _s1_m = __lasx_xvilvl_w(_in3, _in1); \
- _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvh_w(_in2, _in0); \
- _s1_m = __lasx_xvilvh_w(_in3, _in1); \
- _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvl_w(_in6, _in4); \
- _s1_m = __lasx_xvilvl_w(_in7, _in5); \
- _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvh_w(_in6, _in4); \
- _s1_m = __lasx_xvilvh_w(_in7, _in5); \
- _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
- _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
- _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
- _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
- _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
- _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
- _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
- _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
-}
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_w(_in3, _in1); \
+ _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_w(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvl_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_w(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_w(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_w(_in7, _in5); \
+ _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
+ _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
+ _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
+ _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
+ }
/*
* =============================================================================
@@ -1472,52 +1532,54 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
* _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
* (input 16x8 byte block)
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * (output 8x16 byte block)
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 byte block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : See LASX_TRANSPOSE16x8_H
* =============================================================================
*/
-#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- \
- _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
- _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
- _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
- _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
- _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
- _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
- _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
- _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
- _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
- _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
- _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
- _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
- _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
- _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
- _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
- _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
- _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
- _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
- _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
- _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
- _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
- _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
- _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
- _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
- _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
- _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
- _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
- _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
- _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
- _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
- _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
- _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
-}
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
+ _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
+ _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
+ _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
+ _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
+ _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
+ _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
+ _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
+ _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
+ _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
+ _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
+ _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
+ _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
+ _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
+ _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
+ _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
+ _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
+ }
/*
* =============================================================================
@@ -1525,9 +1587,10 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
* _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
* (input 16x8 byte block)
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * (output 8x16 byte block)
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x16 byte block)
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : LASX_TRANSPOSE16x8_H
* _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
* _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
@@ -1556,72 +1619,73 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
* =============================================================================
*/
-#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
- { \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
- \
- _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
- _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
- _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
- _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
- _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
- _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
- _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
- _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
- _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
- _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
- _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
- _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
- _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
- _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
- _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
- _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
- _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
- _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
- _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
- _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
- _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
- _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
- _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
- _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
- _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
- _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
- _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
- _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
- \
- _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
- _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
- _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
- _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
- _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
- _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
- _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
- _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
- _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
- _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
- _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
- _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
- _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
- _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
- _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
- _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
- _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
- _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
- _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
- _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
- _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
- _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
- _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
- _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
- _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
- _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
- _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
- _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
-}
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
+ _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+ _out6, _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
+ \
+ _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ \
+ _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
+ _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
+ _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
+ _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
+ _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
+ _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
+ _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
+ _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
+ _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
+ _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
+ _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
+ _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
+ _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
+ _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
+ _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
+ _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
+ _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
+ _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
+ _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
+ _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
+ _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
+ _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
+ _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
+ _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
+ }
/*
* =============================================================================
@@ -1629,61 +1693,65 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* Arguments : Inputs - _in0, _in1, _in2, _in3
* Outputs - _out0, _out1, _out2, _out3
* Return Type - signed halfword
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : See LASX_TRANSPOSE8x8_H
* =============================================================================
*/
-#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- __m256i _s0_m, _s1_m; \
- \
- _s0_m = __lasx_xvilvl_h(_in1, _in0); \
- _s1_m = __lasx_xvilvl_h(_in3, _in2); \
- _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
- _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
- _out1 = __lasx_xvilvh_d(_out0, _out0); \
- _out3 = __lasx_xvilvh_d(_out2, _out2); \
-}
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+ _out3) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in1, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in2); \
+ _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
+ _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
+ _out1 = __lasx_xvilvh_d(_out0, _out0); \
+ _out3 = __lasx_xvilvh_d(_out2, _out2); \
+ }
/*
* =============================================================================
* Description : Transpose input 8x8 byte block
* Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
* (input 8x8 byte block)
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- * (output 8x8 byte block)
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7 (output 8x8 byte block)
* Example : See LASX_TRANSPOSE8x8_H
* =============================================================================
*/
-#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
- _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
- _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
- _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
- _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
- _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
- _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
- _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
- _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
- _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
- _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
- _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
- _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
- _out1 = __lasx_xvbsrl_v(_out0, 8); \
- _out3 = __lasx_xvbsrl_v(_out2, 8); \
- _out5 = __lasx_xvbsrl_v(_out4, 8); \
- _out7 = __lasx_xvbsrl_v(_out6, 8); \
-}
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
+ _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
+ _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
+ _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
+ _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
+ _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
+ _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
+ _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
+ _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
+ _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
+ _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
+ _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
+ _out1 = __lasx_xvbsrl_v(_out0, 8); \
+ _out3 = __lasx_xvbsrl_v(_out2, 8); \
+ _out5 = __lasx_xvbsrl_v(_out4, 8); \
+ _out7 = __lasx_xvbsrl_v(_out6, 8); \
+ }
/*
* =============================================================================
* Description : Transpose 8x8 block with halfword elements in vectors.
* Arguments : Inputs - _in0, _in1, ~
* Outputs - _out0, _out1, ~
- * Details : The rows of the matrix become columns, and the columns become rows.
+ * Details : The rows of the matrix become columns, and the columns become
+ * rows.
* Example : LASX_TRANSPOSE8x8_H
* _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
* _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
@@ -1704,40 +1772,41 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
* =============================================================================
*/
-#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
- _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
-{ \
- __m256i _s0_m, _s1_m; \
- __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
- __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
- \
- _s0_m = __lasx_xvilvl_h(_in6, _in4); \
- _s1_m = __lasx_xvilvl_h(_in7, _in5); \
- _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
- _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvh_h(_in6, _in4); \
- _s1_m = __lasx_xvilvh_h(_in7, _in5); \
- _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
- _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
- \
- _s0_m = __lasx_xvilvl_h(_in2, _in0); \
- _s1_m = __lasx_xvilvl_h(_in3, _in1); \
- _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
- _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
- _s0_m = __lasx_xvilvh_h(_in2, _in0); \
- _s1_m = __lasx_xvilvh_h(_in3, _in1); \
- _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
- _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
- \
- _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
- _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
- _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
- _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
- _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
- _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
- _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
- _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
-}
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ __m256i _s0_m, _s1_m; \
+ __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
+ __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
+ \
+ _s0_m = __lasx_xvilvl_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvl_h(_in7, _in5); \
+ _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in6, _in4); \
+ _s1_m = __lasx_xvilvh_h(_in7, _in5); \
+ _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _s0_m = __lasx_xvilvl_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvl_h(_in3, _in1); \
+ _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ _s0_m = __lasx_xvilvh_h(_in2, _in0); \
+ _s1_m = __lasx_xvilvh_h(_in3, _in1); \
+ _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
+ _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
+ \
+ _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
+ _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
+ _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
+ _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
+ _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
+ _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
+ _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
+ _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
+ }
/*
* =============================================================================
@@ -1752,34 +1821,34 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out3 = _in0 - _in3;
* =============================================================================
*/
-#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- _out0 = __lasx_xvadd_b(_in0, _in3); \
- _out1 = __lasx_xvadd_b(_in1, _in2); \
- _out2 = __lasx_xvsub_b(_in1, _in2); \
- _out3 = __lasx_xvsub_b(_in0, _in3); \
-}
-#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- _out0 = __lasx_xvadd_h(_in0, _in3); \
- _out1 = __lasx_xvadd_h(_in1, _in2); \
- _out2 = __lasx_xvsub_h(_in1, _in2); \
- _out3 = __lasx_xvsub_h(_in0, _in3); \
-}
-#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- _out0 = __lasx_xvadd_w(_in0, _in3); \
- _out1 = __lasx_xvadd_w(_in1, _in2); \
- _out2 = __lasx_xvsub_w(_in1, _in2); \
- _out3 = __lasx_xvsub_w(_in0, _in3); \
-}
-#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
-{ \
- _out0 = __lasx_xvadd_d(_in0, _in3); \
- _out1 = __lasx_xvadd_d(_in1, _in2); \
- _out2 = __lasx_xvsub_d(_in1, _in2); \
- _out3 = __lasx_xvsub_d(_in0, _in3); \
-}
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in3); \
+ _out1 = __lasx_xvadd_b(_in1, _in2); \
+ _out2 = __lasx_xvsub_b(_in1, _in2); \
+ _out3 = __lasx_xvsub_b(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in3); \
+ _out1 = __lasx_xvadd_h(_in1, _in2); \
+ _out2 = __lasx_xvsub_h(_in1, _in2); \
+ _out3 = __lasx_xvsub_h(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in3); \
+ _out1 = __lasx_xvadd_w(_in1, _in2); \
+ _out2 = __lasx_xvsub_w(_in1, _in2); \
+ _out3 = __lasx_xvsub_w(_in0, _in3); \
+ }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in3); \
+ _out1 = __lasx_xvadd_d(_in1, _in2); \
+ _out2 = __lasx_xvsub_d(_in1, _in2); \
+ _out3 = __lasx_xvsub_d(_in0, _in3); \
+ }
/*
* =============================================================================
@@ -1798,59 +1867,63 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* _out7 = _in0 - _in7;
* =============================================================================
*/
-#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lasx_xvadd_b(_in0, _in7); \
- _out1 = __lasx_xvadd_b(_in1, _in6); \
- _out2 = __lasx_xvadd_b(_in2, _in5); \
- _out3 = __lasx_xvadd_b(_in3, _in4); \
- _out4 = __lasx_xvsub_b(_in3, _in4); \
- _out5 = __lasx_xvsub_b(_in2, _in5); \
- _out6 = __lasx_xvsub_b(_in1, _in6); \
- _out7 = __lasx_xvsub_b(_in0, _in7); \
-}
-
-#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lasx_xvadd_h(_in0, _in7); \
- _out1 = __lasx_xvadd_h(_in1, _in6); \
- _out2 = __lasx_xvadd_h(_in2, _in5); \
- _out3 = __lasx_xvadd_h(_in3, _in4); \
- _out4 = __lasx_xvsub_h(_in3, _in4); \
- _out5 = __lasx_xvsub_h(_in2, _in5); \
- _out6 = __lasx_xvsub_h(_in1, _in6); \
- _out7 = __lasx_xvsub_h(_in0, _in7); \
-}
-
-#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lasx_xvadd_w(_in0, _in7); \
- _out1 = __lasx_xvadd_w(_in1, _in6); \
- _out2 = __lasx_xvadd_w(_in2, _in5); \
- _out3 = __lasx_xvadd_w(_in3, _in4); \
- _out4 = __lasx_xvsub_w(_in3, _in4); \
- _out5 = __lasx_xvsub_w(_in2, _in5); \
- _out6 = __lasx_xvsub_w(_in1, _in6); \
- _out7 = __lasx_xvsub_w(_in0, _in7); \
-}
-
-#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
-{ \
- _out0 = __lasx_xvadd_d(_in0, _in7); \
- _out1 = __lasx_xvadd_d(_in1, _in6); \
- _out2 = __lasx_xvadd_d(_in2, _in5); \
- _out3 = __lasx_xvadd_d(_in3, _in4); \
- _out4 = __lasx_xvsub_d(_in3, _in4); \
- _out5 = __lasx_xvsub_d(_in2, _in5); \
- _out6 = __lasx_xvsub_d(_in1, _in6); \
- _out7 = __lasx_xvsub_d(_in0, _in7); \
-}
-
-#endif //LASX
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_b(_in0, _in7); \
+ _out1 = __lasx_xvadd_b(_in1, _in6); \
+ _out2 = __lasx_xvadd_b(_in2, _in5); \
+ _out3 = __lasx_xvadd_b(_in3, _in4); \
+ _out4 = __lasx_xvsub_b(_in3, _in4); \
+ _out5 = __lasx_xvsub_b(_in2, _in5); \
+ _out6 = __lasx_xvsub_b(_in1, _in6); \
+ _out7 = __lasx_xvsub_b(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_h(_in0, _in7); \
+ _out1 = __lasx_xvadd_h(_in1, _in6); \
+ _out2 = __lasx_xvadd_h(_in2, _in5); \
+ _out3 = __lasx_xvadd_h(_in3, _in4); \
+ _out4 = __lasx_xvsub_h(_in3, _in4); \
+ _out5 = __lasx_xvsub_h(_in2, _in5); \
+ _out6 = __lasx_xvsub_h(_in1, _in6); \
+ _out7 = __lasx_xvsub_h(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_w(_in0, _in7); \
+ _out1 = __lasx_xvadd_w(_in1, _in6); \
+ _out2 = __lasx_xvadd_w(_in2, _in5); \
+ _out3 = __lasx_xvadd_w(_in3, _in4); \
+ _out4 = __lasx_xvsub_w(_in3, _in4); \
+ _out5 = __lasx_xvsub_w(_in2, _in5); \
+ _out6 = __lasx_xvsub_w(_in1, _in6); \
+ _out7 = __lasx_xvsub_w(_in0, _in7); \
+ }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+ _out7) \
+ { \
+ _out0 = __lasx_xvadd_d(_in0, _in7); \
+ _out1 = __lasx_xvadd_d(_in1, _in6); \
+ _out2 = __lasx_xvadd_d(_in2, _in5); \
+ _out3 = __lasx_xvadd_d(_in3, _in4); \
+ _out4 = __lasx_xvsub_d(_in3, _in4); \
+ _out5 = __lasx_xvsub_d(_in2, _in5); \
+ _out6 = __lasx_xvsub_d(_in1, _in6); \
+ _out7 = __lasx_xvsub_d(_in0, _in7); \
+ }
+
+#endif // LASX
/*
* =============================================================================
@@ -1863,15 +1936,13 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
* VP:1,2,3,4,
* =============================================================================
*/
-#define VECT_PRINT(RTYPE, element_num, in0, enter) \
-{ \
- RTYPE _tmp0 = (RTYPE)in0; \
- int _i = 0; \
- if (enter) \
- printf("\nVP:"); \
- for(_i = 0; _i < element_num; _i++) \
- printf("%d,",_tmp0[_i]); \
-}
+#define VECT_PRINT(RTYPE, element_num, in0, enter) \
+ { \
+ RTYPE _tmp0 = (RTYPE)in0; \
+ int _i = 0; \
+ if (enter) printf("\nVP:"); \
+ for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
+ }
#endif /* LOONGSON_INTRINSICS_H */
#endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
--
2.20.1
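
For a quick usage illustration of the macros reformatted above (not part of the patch; the function name is hypothetical and this only builds on LoongArch with LASX enabled), here is a 4-point halfword butterfly whose results are clamped to [0, 255] with the clip helper from the header:

/* Illustrative sketch only -- not part of this patch. Requires a LoongArch
 * build with LASX support; the function name is made up for this example. */
#include "libavutil/loongarch/loongson_intrinsics.h"

/* One 4-point butterfly stage on signed halfword lanes, clamped to [0, 255]. */
static void butterfly4_clip255_h(const __m256i in[4], __m256i out[4])
{
    __m256i t0, t1, t2, t3;

    /* t0 = in0 + in3, t1 = in1 + in2, t2 = in1 - in2, t3 = in0 - in3 */
    LASX_BUTTERFLY_4_H(in[0], in[1], in[2], in[3], t0, t1, t2, t3);

    /* Clamp each result to the unsigned 8-bit range. */
    out[0] = __lasx_xvclip255_h(t0);
    out[1] = __lasx_xvclip255_h(t1);
    out[2] = __lasx_xvclip255_h(t2);
    out[3] = __lasx_xvclip255_h(t3);
}
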
* [FFmpeg-devel] [PATCH v2 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
2022-02-17 11:11 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
@ 2022-02-17 11:11 ` Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf " Hao Chen
` (3 subsequent siblings)
5 siblings, 0 replies; 15+ messages in thread
From: Hao Chen @ 2022-02-17 11:11 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lu Wang
From: Lu Wang <wanglu@loongson.cn>
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 94fps
after : 110fps
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
libavcodec/hevcdsp.c | 2 +
libavcodec/hevcdsp.h | 1 +
libavcodec/loongarch/Makefile | 2 +
libavcodec/loongarch/hevcdsp_init_loongarch.c | 90 +
libavcodec/loongarch/hevcdsp_lsx.c | 3299 +++++++++++++++++
libavcodec/loongarch/hevcdsp_lsx.h | 88 +
6 files changed, 3482 insertions(+)
create mode 100644 libavcodec/loongarch/hevcdsp_init_loongarch.c
create mode 100644 libavcodec/loongarch/hevcdsp_lsx.c
create mode 100644 libavcodec/loongarch/hevcdsp_lsx.h
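
For reference, a minimal scalar sketch of what the hevc_copy_*w_lsx routines below compute (dst = src << 6 into the 16-bit intermediate buffer). This snippet is not part of the patch, and the function name and signature are hypothetical:

/* Illustrative sketch only -- not part of this patch. Scalar reference of the
 * pel copy: each source byte is widened to 16 bits and shifted left by 6. */
#include <stdint.h>

static void hevc_copy_ref(const uint8_t *src, int src_stride,
                          int16_t *dst, int dst_stride,
                          int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (int16_t)(src[x] << 6);   /* dst = src << 6 */
        src += src_stride;
        dst += dst_stride;
    }
}

The vectorized hevc_copy_*w_lsx routines in the patch do the same work per row with vector loads, using __lsx_vsllwil_hu_bu to widen and shift a group of bytes per instruction.
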
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index fe272ac1ce..b6c3f00819 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -267,4 +267,6 @@ int i = 0;
ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
if (ARCH_MIPS)
ff_hevc_dsp_init_mips(hevcdsp, bit_depth);
+ if (ARCH_LOONGARCH)
+ ff_hevc_dsp_init_loongarch(hevcdsp, bit_depth);
}
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index 0e013a8328..d642ecf9c6 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -134,5 +134,6 @@ void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
+void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth);
#endif /* AVCODEC_HEVCDSP_H */
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 3c15c2edeb..9f416d0c6e 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -8,6 +8,7 @@ OBJS-$(CONFIG_VC1DSP) += loongarch/vc1dsp_init_loongarch.o
OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_init_loongarch.o
OBJS-$(CONFIG_IDCTDSP) += loongarch/idctdsp_init_loongarch.o
OBJS-$(CONFIG_VIDEODSP) += loongarch/videodsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
@@ -24,3 +25,4 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
loongarch/vp9_intra_lsx.o \
loongarch/vp9_lpf_lsx.o \
loongarch/vp9_idct_lsx.o
+LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
new file mode 100644
index 0000000000..fc0e8fb0df
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "hevcdsp_lsx.h"
+
+void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_lsx(cpu_flags)) {
+ if (bit_depth == 8) {
+ c->put_hevc_qpel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_lsx;
+ c->put_hevc_qpel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_lsx;
+ c->put_hevc_qpel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_lsx;
+ c->put_hevc_qpel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_lsx;
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_lsx;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_lsx;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_lsx;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_lsx;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_lsx;
+
+ c->put_hevc_epel[1][0][0] = ff_hevc_put_hevc_pel_pixels4_8_lsx;
+ c->put_hevc_epel[2][0][0] = ff_hevc_put_hevc_pel_pixels6_8_lsx;
+ c->put_hevc_epel[3][0][0] = ff_hevc_put_hevc_pel_pixels8_8_lsx;
+ c->put_hevc_epel[4][0][0] = ff_hevc_put_hevc_pel_pixels12_8_lsx;
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_8_lsx;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_8_lsx;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_lsx;
+
+ c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_lsx;
+ c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_lsx;
+ c->put_hevc_qpel[4][0][1] = ff_hevc_put_hevc_qpel_h12_8_lsx;
+ c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_lsx;
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_8_lsx;
+ c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_lsx;
+ c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_lsx;
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_lsx;
+
+ c->put_hevc_qpel[1][1][0] = ff_hevc_put_hevc_qpel_v4_8_lsx;
+ c->put_hevc_qpel[3][1][0] = ff_hevc_put_hevc_qpel_v8_8_lsx;
+ c->put_hevc_qpel[4][1][0] = ff_hevc_put_hevc_qpel_v12_8_lsx;
+ c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_8_lsx;
+ c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_8_lsx;
+ c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_lsx;
+ c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_lsx;
+ c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_lsx;
+
+ c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_lsx;
+ c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_lsx;
+ c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_lsx;
+ c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_lsx;
+ c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_lsx;
+ c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_lsx;
+ c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_lsx;
+ c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_lsx;
+
+ c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_lsx;
+
+ c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_lsx;
+ c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_lsx;
+ c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_lsx;
+
+ c->put_hevc_epel[3][1][1] = ff_hevc_put_hevc_epel_hv8_8_lsx;
+ c->put_hevc_epel[4][1][1] = ff_hevc_put_hevc_epel_hv12_8_lsx;
+ c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
+ c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
+ c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+ }
+ }
+}
diff --git a/libavcodec/loongarch/hevcdsp_lsx.c b/libavcodec/loongarch/hevcdsp_lsx.c
new file mode 100644
index 0000000000..a520f02bd1
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lsx.c
@@ -0,0 +1,3299 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+/* hevc_copy: dst = src << 6 */
+static void hevc_copy_4w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t loop_cnt = height >> 3;
+ int32_t res = height & 0x07;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in2, in3;
+ for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src5, src4, src7, src6,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0, in1, in2, in3);
+
+ __lsx_vstelm_d(in0, dst, 0, 0);
+ __lsx_vstelm_d(in0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(in1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(in1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(in2, dst, 0, 0);
+ __lsx_vstelm_d(in2, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(in3, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(in3, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ in0 = __lsx_vsllwil_hu_bu(src0, 6);
+ __lsx_vstelm_d(in0, dst, 0, 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_6w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t loop_cnt = (height >> 3);
+ int32_t res = height & 0x07;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in4, in5, in6, in7);
+
+ __lsx_vstelm_d(in0, dst, 0, 0);
+ __lsx_vstelm_w(in0, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 0, 0);
+ __lsx_vstelm_w(in1, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in2, dst, 0, 0);
+ __lsx_vstelm_w(in2, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in3, dst, 0, 0);
+ __lsx_vstelm_w(in3, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in4, dst, 0, 0);
+ __lsx_vstelm_w(in4, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in5, dst, 0, 0);
+ __lsx_vstelm_w(in5, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in6, dst, 0, 0);
+ __lsx_vstelm_w(in6, dst, 8, 2);
+ dst += dst_stride;
+ __lsx_vstelm_d(in7, dst, 0, 0);
+ __lsx_vstelm_w(in7, dst, 8, 2);
+ dst += dst_stride;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ in0 = __lsx_vsllwil_hu_bu(src0, 6);
+ src += src_stride;
+ __lsx_vstelm_d(in0, dst, 0, 0);
+ __lsx_vstelm_w(in0, dst, 8, 2);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_8w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride_x << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ int32_t loop_cnt = height >> 3;
+ int32_t res = height & 0x07;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in4, in5, in6, in7);
+ __lsx_vst(in0, dst, 0);
+ __lsx_vstx(in1, dst, dst_stride_x);
+ __lsx_vstx(in2, dst, dst_stride_2x);
+ __lsx_vstx(in3, dst, dst_stride_3x);
+ dst += dst_stride_2x;
+ __lsx_vst(in4, dst, 0);
+ __lsx_vstx(in5, dst, dst_stride_x);
+ __lsx_vstx(in6, dst, dst_stride_2x);
+ __lsx_vstx(in7, dst, dst_stride_3x);
+ dst += dst_stride_2x;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ in0 = __lsx_vsllwil_hu_bu(src0, 6);
+ __lsx_vst(in0, dst, 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_12w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ uint32_t res = height & 0x07;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride_x << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in0_r, in1_r, in2_r, in3_r;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ __lsx_vstelm_d(in0, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(in0, dst, 16, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 16, 1);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP2_ARG2(__lsx_vilvh_w, src5, src4, src7, src6, src0, src1);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, in0, in1);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ __lsx_vstelm_d(in0, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(in0, dst, 16, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(in1, dst, 16, 1);
+ dst += dst_stride;
+ }
+ for (;res--;) {
+ src0 = __lsx_vld(src, 0);
+ in0 = __lsx_vsllwil_hu_bu(src0, 6);
+ src1 = __lsx_vilvh_b(zero, src0);
+ in1 = __lsx_vslli_h(src1, 6);
+ __lsx_vst(in0, dst, 0);
+ __lsx_vstelm_d(in1, dst, 16, 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_16w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ __m128i zero = __lsx_vldi(0);
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ int32_t loop_cnt = height >> 3;
+ int32_t res = height & 0x07;
+ int16_t* dst1 = dst + 8;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ __lsx_vst(in0_l, dst1, 0);
+ __lsx_vstx(in1_l, dst1, dst_stride_x);
+ __lsx_vstx(in2_l, dst1, dst_stride_2x);
+ __lsx_vstx(in3_l, dst1, dst_stride_3x);
+ dst += dst_stride_2x;
+ dst1 += dst_stride_2x;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ __lsx_vst(in0_l, dst1, 0);
+ __lsx_vstx(in1_l, dst1, dst_stride_x);
+ __lsx_vstx(in2_l, dst1, dst_stride_2x);
+ __lsx_vstx(in3_l, dst1, dst_stride_3x);
+ dst += dst_stride_2x;
+ dst1 += dst_stride_2x;
+ }
+ if (res) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ dst += 8;
+ __lsx_vst(in0_l, dst, 0);
+ __lsx_vstx(in1_l, dst, dst_stride_x);
+ __lsx_vstx(in2_l, dst, dst_stride_2x);
+ __lsx_vstx(in3_l, dst, dst_stride_3x);
+ }
+}
+
+static void hevc_copy_24w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ uint8_t *_src = src + 16;
+ int16_t *dst1 = dst;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(_src, src_stride_3x);
+ _src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+ in0_l, in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vstx(in1_r, dst, dst_stride_x);
+ __lsx_vstx(in2_r, dst, dst_stride_2x);
+ __lsx_vstx(in3_r, dst, dst_stride_3x);
+ dst1 = dst + 8;
+ __lsx_vst(in0_l, dst1, 0);
+ __lsx_vstx(in1_l, dst1, dst_stride_x);
+ __lsx_vstx(in2_l, dst1, dst_stride_2x);
+ __lsx_vstx(in3_l, dst1, dst_stride_3x);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ dst1 = dst1 + 8;
+ __lsx_vst(in0_r, dst1, 0);
+ __lsx_vstx(in1_r, dst1, dst_stride_x);
+ __lsx_vstx(in2_r, dst1, dst_stride_2x);
+ __lsx_vstx(in3_r, dst1, dst_stride_3x);
+ dst += dst_stride_2x;
+ }
+}
+
+static void hevc_copy_32w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src = src + 16;
+ __m128i zero = {0};
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src2, src4);
+ src6 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src1 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+ src3, src5);
+ src7 = __lsx_vldx(_src, src_stride_3x);
+ _src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+ in0_l, in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(in2_r, dst, 0);
+ __lsx_vst(in2_l, dst, 16);
+ __lsx_vst(in3_r, dst, 32);
+ __lsx_vst(in3_l, dst, 48);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero, src7,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(in2_r, dst, 0);
+ __lsx_vst(in2_l, dst, 16);
+ __lsx_vst(in3_r, dst, 32);
+ __lsx_vst(in3_l, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_48w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i zero = {0};
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i src8, src9, src10, src11;
+ __m128i in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
+ __m128i in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src2 = __lsx_vld(src, 32);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src3, src4);
+ src5 = __lsx_vld(src, 32);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src6, src7);
+ src8 = __lsx_vld(src, 32);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src9, src10);
+ src11 = __lsx_vld(src, 32);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, in0_l, in1_l, in2_l, in3_l);
+ DUP2_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, in4_l, in5_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, in4_r, in5_r);
+ DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ __lsx_vst(in2_r, dst, 64);
+ __lsx_vst(in2_l, dst, 80);
+ dst += dst_stride;
+ __lsx_vst(in3_r, dst, 0);
+ __lsx_vst(in3_l, dst, 16);
+ __lsx_vst(in4_r, dst, 32);
+ __lsx_vst(in4_l, dst, 48);
+ __lsx_vst(in5_r, dst, 64);
+ __lsx_vst(in5_l, dst, 80);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src6, zero, src7, zero, src8, zero, src9,
+ in0_l, in1_l, in2_l, in3_l);
+ DUP2_ARG2(__lsx_vilvh_b, zero, src10, zero, src11, in4_l, in5_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src6, 6, src7, 6, src8, 6, src9, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src10, 6, src11, 6, in4_r, in5_r);
+ DUP2_ARG2(__lsx_vslli_h, in4_l, 6, in5_l, 6, in4_l, in5_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ __lsx_vst(in2_r, dst, 64);
+ __lsx_vst(in2_l, dst, 80);
+ dst += dst_stride;
+ __lsx_vst(in3_r, dst, 0);
+ __lsx_vst(in3_l, dst, 16);
+ __lsx_vst(in4_r, dst, 32);
+ __lsx_vst(in4_l, dst, 48);
+ __lsx_vst(in5_r, dst, 64);
+ __lsx_vst(in5_l, dst, 80);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_copy_64w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i zero = {0};
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ src0, src1, src2, src3);
+ src += src_stride;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ src4, src5, src6, src7);
+ src += src_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6,
+ in0_l, in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ __lsx_vst(in2_r, dst, 64);
+ __lsx_vst(in2_l, dst, 80);
+ __lsx_vst(in3_r, dst, 96);
+ __lsx_vst(in3_l, dst, 112);
+ dst += dst_stride;
+
+ DUP4_ARG2(__lsx_vilvh_b, zero, src4, zero, src5, zero, src6, zero,
+ src7, in0_l, in1_l, in2_l, in3_l);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src4, 6, src5, 6, src6, 6, src7, 6,
+ in0_r, in1_r, in2_r, in3_r);
+ DUP4_ARG2(__lsx_vslli_h, in0_l, 6, in1_l, 6, in2_l, 6, in3_l, 6, in0_l,
+ in1_l, in2_l, in3_l);
+ __lsx_vst(in0_r, dst, 0);
+ __lsx_vst(in0_l, dst, 16);
+ __lsx_vst(in1_r, dst, 32);
+ __lsx_vst(in1_l, dst, 48);
+ __lsx_vst(in2_r, dst, 64);
+ __lsx_vst(in2_l, dst, 80);
+ __lsx_vst(in3_r, dst, 96);
+ __lsx_vst(in3_l, dst, 112);
+ dst += dst_stride;
+ }
+}
+
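+/* Horizontal 8-tap filters. src is moved back by 3 columns so that the
+ * shuffle masks (mask0 from ff_hevc_mask_arr, the others offset by 2, 4
+ * and 6) gather the source pixels in adjacent pairs, letting the pairwise
+ * dot-product intrinsics (__lsx_vdp2_h_bu_b / __lsx_vdp2add_h_bu_b)
+ * accumulate the eight taps in four steps. Results are kept as 16-bit
+ * intermediates. */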
+static void hevc_hz_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt = height >> 3;
+ uint32_t res = (height & 0x7) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 16);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+    for (; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src7 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1,
+ src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask0, src3, src2, mask1, src3,
+ src2, mask2, src3, src2, mask3, vec0, vec1, vec2, vec3);
+ dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src4, mask0, src5, src4, mask1, src5,
+ src4, mask2, src5, src4, mask3, vec0, vec1, vec2, vec3);
+ dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src7, src6, mask0, src7, src6, mask1, src7,
+ src6, mask2, src7, src6, mask3, vec0, vec1, vec2, vec3);
+ dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(dst2, dst, 0, 0);
+ __lsx_vstelm_d(dst2, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst3, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst3, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ }
+    for (; res--;) {
+ src0 = __lsx_vld(src, 0);
+ src1 = __lsx_vldx(src, src_stride);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, src1,
+ src0, mask2, src1, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ src += src_stride_2x;
+ dst += dst_stride_2x;
+ }
+}
+
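+/* 8 pixels per row, four rows per iteration. dst_stride_x, dst_stride_2x
+ * and dst_stride_3x are byte offsets of one, two and three rows (dst holds
+ * int16_t) as required by __lsx_vstx; the trailing dst += dst_stride_2x
+ * advances four rows because pointer arithmetic is in elements. */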
+static void hevc_hz_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+ dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+ dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
+ dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride_x);
+ __lsx_vstx(dst2, dst, dst_stride_2x);
+ __lsx_vstx(dst3, dst, dst_stride_3x);
+ dst += dst_stride_2x;
+ }
+}
+
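+/* 12 pixels per row: the left 8 columns use the full-width masks, while
+ * the remaining 4 columns of two consecutive rows are packed into one
+ * vector and filtered with the narrow masks mask4..mask7. */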
+static void hevc_hz_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
+
+ src -= 3;
+ _src = src + 8;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+ mask4 = __lsx_vld(ff_hevc_mask_arr, 16);
+ DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask4, 6);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src4 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x,
+ src5, src6);
+ src7 = __lsx_vldx(_src, src_stride_3x);
+ src += src_stride_4x;
+ _src += src_stride_4x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask4, src7, src6, mask4,
+ vec4, vec5);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask5, src7, src6, mask5,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+ dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask6, src7, src6, mask6,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+ dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask7, src7, src6, mask7,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+ dst4, dst5);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstelm_d(dst4, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(dst1, dst, 0);
+ __lsx_vstelm_d(dst4, dst, 16, 1);
+ dst += dst_stride;
+ __lsx_vst(dst2, dst, 0);
+ __lsx_vstelm_d(dst5, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(dst3, dst, 0);
+ __lsx_vstelm_d(dst5, dst, 16, 1);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_hz_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i mask0;
+
+ src -= 3;
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ dst += dst_stride;
+ __lsx_vst(dst2, dst, 0);
+ __lsx_vst(dst3, dst, 16);
+ dst += dst_stride;
+ }
+}
+
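+/* 24 pixels per row: mask4..mask7 (mask0 + 8..14) select byte pairs that
+ * straddle the boundary between the two 16-byte source registers, so the
+ * middle 8 outputs are computed without an extra overlapping load. */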
+static void hevc_hz_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+ mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src += src_stride;
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src2, src3);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
+ src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src3, mask0,
+ vec4, vec5);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1,
+ src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask5, src3, src3, mask1,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+ dst4, dst5);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1,
+ src1, mask2, src2, src2, mask2, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask6, src3, src3, mask2,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+ dst4, dst5);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1,
+ src1, mask3, src2, src2, mask3, vec0, vec1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src2, mask7, src3, src3, mask3,
+ vec4, vec5);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+ dst4, dst5);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ dst += dst_stride;
+ __lsx_vst(dst3, dst, 0);
+ __lsx_vst(dst4, dst, 16);
+ __lsx_vst(dst5, dst, 32);
+ dst += dst_stride;
+ }
+}
+
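+/* 32 pixels per row, one row per iteration. The third load at offset 24
+ * overlaps the second by 8 bytes so the last 8 outputs have their complete
+ * 8-tap window available in a single register. */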
+static void hevc_hz_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8,
+ mask1, mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src2 = __lsx_vld(src, 24);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1,
+ src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3);
+ dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+                  src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+ dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+        DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+                  src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+ dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_hz_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+ mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src2 = __lsx_vld(src, 32);
+ src3 = __lsx_vld(src, 40);
+ src += src_stride;
+
+        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
+                  src1, mask0, src2, src1, mask4, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask5, src1,
+ src1, mask1, src2, src1, mask5, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src0, mask6, src1,
+ src1, mask2, src2, src1, mask6, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src0, mask7, src1,
+ src1, mask3, src2, src1, mask7, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec4, vec5);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec4, filt0, vec5, filt0, dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt1, dst5, vec5, filt1,
+ dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt2, dst5, vec5, filt2,
+ dst4, dst5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec4, filt3, dst5, vec5, filt3,
+ dst4, dst5);
+ __lsx_vst(dst4, dst, 64);
+ __lsx_vst(dst5, dst, 80);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2, src3, src4;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+ mask2, mask3, mask4);
+    DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ src0, src1, src2, src3);
+ src4 = __lsx_vld(src, 56);
+ src += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ __lsx_vst(dst0, dst, 0);
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask4, src1, src0, mask5, src1,
+ src0, mask6, src1, src0, mask7, vec0, vec1, vec2, vec3);
+ dst1 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec1, filt1, dst1, vec2, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec3, filt3);
+ __lsx_vst(dst1, dst, 16);
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec0, vec1, vec2, vec3);
+ dst2 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec1, filt1, dst2, vec2, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec3, filt3);
+ __lsx_vst(dst2, dst, 32);
+
+ DUP4_ARG3(__lsx_vshuf_b, src2, src1, mask4, src2, src1, mask5, src2,
+ src1, mask6, src2, src1, mask7, vec0, vec1, vec2, vec3);
+ dst3 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst3, vec2, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec3, filt3);
+ __lsx_vst(dst3, dst, 48);
+
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec0, vec1, vec2, vec3);
+ dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+ dst4, dst4);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+ __lsx_vst(dst4, dst, 64);
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, mask4, src3, src2, mask5, src3,
+ src2, mask6, src3, src2, mask7, vec0, vec1, vec2, vec3);
+ dst5 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec1, filt1, dst5, vec2, filt2,
+ dst5, dst5);
+ dst5 = __lsx_vdp2add_h_bu_b(dst5, vec3, filt3);
+ __lsx_vst(dst5, dst, 80);
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec0, vec1, vec2, vec3);
+ dst6 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec1, filt1, dst6, vec2, filt2,
+ dst6, dst6);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec3, filt3);
+ __lsx_vst(dst6, dst, 96);
+
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
+ dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ __lsx_vst(dst7, dst, 112);
+ dst += dst_stride;
+ }
+}
+
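+/* Vertical 8-tap filters. src starts three rows above the block; pairs of
+ * consecutive rows are interleaved byte-wise (__lsx_vilvl_b/__lsx_vilvh_b)
+ * so the same pairwise dot-product intrinsics accumulate two taps per
+ * step. The 4-wide variant additionally packs two such row pairs into one
+ * register. */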
+static void hevc_vt_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t res = (height & 0x07) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i src9, src10, src11, src12, src13, src14;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i src1110_r, src1211_r, src1312_r, src1413_r;
+ __m128i src2110, src4332, src6554, src8776, src10998;
+ __m128i src12111110, src14131312;
+ __m128i dst10, dst32, dst54, dst76;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r,
+ src2110, src4332);
+ src6554 = __lsx_vilvl_d(src65_r, src54_r);
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src11 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src12, src13);
+ src14 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vilvl_b, src11, src10, src12, src11, src13, src12, src14,
+ src13, src1110_r, src1211_r, src1312_r, src1413_r);
+ DUP4_ARG2(__lsx_vilvl_d, src87_r, src76_r, src109_r, src98_r, src1211_r,
+ src1110_r, src1413_r, src1312_r, src8776, src10998,
+ src12111110, src14131312);
+
+ dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
+ filt2, dst10, dst10);
+ dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
+ dst32 = __lsx_vdp2_h_bu_b(src4332, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst32, src6554, filt1, dst32, src8776,
+ filt2, dst32, dst32);
+ dst32 = __lsx_vdp2add_h_bu_b(dst32, src10998, filt3);
+ dst54 = __lsx_vdp2_h_bu_b(src6554, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst54, src8776, filt1,
+ dst54, src10998, filt2, dst54, dst54);
+ dst54 = __lsx_vdp2add_h_bu_b(dst54, src12111110, filt3);
+ dst76 = __lsx_vdp2_h_bu_b(src8776, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst76, src10998, filt1, dst76,
+ src12111110, filt2, dst76, dst76);
+ dst76 = __lsx_vdp2add_h_bu_b(dst76, src14131312, filt3);
+
+ __lsx_vstelm_d(dst10, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst10, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst32, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst32, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst54, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst54, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst76, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst76, dst, 0, 1);
+ dst += dst_stride;
+
+ src2110 = src10998;
+ src4332 = src12111110;
+ src6554 = src14131312;
+ src6 = src14;
+ }
+    for (; res--;) {
+ src7 = __lsx_vld(src, 0);
+ src8 = __lsx_vldx(src, src_stride);
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+ src += src_stride_2x;
+ src8776 = __lsx_vilvl_d(src87_r, src76_r);
+
+ dst10 = __lsx_vdp2_h_bu_b(src2110, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, src4332, filt1, dst10, src6554,
+ filt2, dst10, dst10);
+ dst10 = __lsx_vdp2add_h_bu_b(dst10, src8776, filt3);
+
+ __lsx_vstelm_d(dst10, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst10, dst, 0, 1);
+ dst += dst_stride;
+
+ src2110 = src4332;
+ src4332 = src6554;
+ src6554 = src8776;
+ src6 = src8;
+ }
+}
+
+static void hevc_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_r, src87_r, src98_r, src109_r);
+
+ dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+ src54_r, filt2, dst0_r, dst0_r);
+ dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+ dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+ src65_r, filt2, dst1_r, dst1_r);
+ dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+ dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+ src76_r, filt2, dst2_r, dst2_r);
+ dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+ dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+ src87_r, filt2, dst3_r, dst3_r);
+ dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride_x);
+ __lsx_vstx(dst2_r, dst, dst_stride_2x);
+ __lsx_vstx(dst3_r, dst, dst_stride_3x);
+ dst += dst_stride_2x;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
+static void hevc_vt_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
+ __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
+ __m128i src2110, src4332, src6554, src8776, src10998;
+ __m128i dst0_l, dst1_l;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_l, src32_l, src54_l, src21_l);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+ DUP2_ARG2(__lsx_vilvl_d, src21_l, src10_l, src43_l, src32_l,
+ src2110, src4332);
+ src6554 = __lsx_vilvl_d(src65_l, src54_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_l, src87_l, src98_l, src109_l);
+ DUP2_ARG2(__lsx_vilvl_d, src87_l, src76_l, src109_l, src98_l,
+ src8776, src10998);
+
+ dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+ src54_r, filt2, dst0_r, dst0_r);
+ dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+ dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+ src65_r, filt2, dst1_r, dst1_r);
+ dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+ dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+ src76_r, filt2, dst2_r, dst2_r);
+ dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+ dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+ src87_r, filt2, dst3_r, dst3_r);
+ dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+ dst0_l = __lsx_vdp2_h_bu_b(src2110, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src4332, filt1, dst0_l,
+ src6554, filt2, dst0_l, dst0_l);
+ dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src8776, filt3);
+ dst1_l = __lsx_vdp2_h_bu_b(src4332, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src6554, filt1, dst1_l,
+ src8776, filt2, dst1_l, dst1_l);
+ dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src10998, filt3);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstelm_d(dst0_l, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vstelm_d(dst0_l, dst, 16, 1);
+ dst += dst_stride;
+ __lsx_vst(dst2_r, dst, 0);
+ __lsx_vstelm_d(dst1_l, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(dst3_r, dst, 0);
+ __lsx_vstelm_d(dst1_l, dst, 16, 1);
+ dst += dst_stride;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
+
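+/* Generic vertical kernel: the block is processed in 16-column strips,
+ * four rows per inner iteration, sliding the window of interleaved row
+ * pairs between iterations. The fixed-width wrappers below reuse it. */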
+static void hevc_vt_8t_16multx4mult_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height,
+ int32_t width)
+{
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t loop_cnt, cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src54_l, src76_l, src98_l;
+ __m128i src21_l, src43_l, src65_l, src87_l, src109_l;
+ __m128i dst0_l, dst1_l, dst2_l, dst3_l;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ for (cnt = width >> 4; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_l, src32_l, src54_l, src21_l);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src8, src9);
+ src10 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
+ src10, src9, src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
+ src10, src9, src76_l, src87_l, src98_l, src109_l);
+
+ dst0_r = __lsx_vdp2_h_bu_b(src10_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_r,
+ src54_r, filt2, dst0_r, dst0_r);
+ dst0_r = __lsx_vdp2add_h_bu_b(dst0_r, src76_r, filt3);
+ dst1_r = __lsx_vdp2_h_bu_b(src21_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_r, src43_r, filt1, dst1_r,
+ src65_r, filt2, dst1_r, dst1_r);
+ dst1_r = __lsx_vdp2add_h_bu_b(dst1_r, src87_r, filt3);
+ dst2_r = __lsx_vdp2_h_bu_b(src32_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src54_r, filt1, dst2_r,
+ src76_r, filt2, dst2_r, dst2_r);
+ dst2_r = __lsx_vdp2add_h_bu_b(dst2_r, src98_r, filt3);
+ dst3_r = __lsx_vdp2_h_bu_b(src43_r, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst3_r,
+ src87_r, filt2, dst3_r, dst3_r);
+ dst3_r = __lsx_vdp2add_h_bu_b(dst3_r, src109_r, filt3);
+ dst0_l = __lsx_vdp2_h_bu_b(src10_l, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0_l, src32_l, filt1, dst0_l,
+ src54_l, filt2, dst0_l, dst0_l);
+ dst0_l = __lsx_vdp2add_h_bu_b(dst0_l, src76_l, filt3);
+ dst1_l = __lsx_vdp2_h_bu_b(src21_l, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1_l, src43_l, filt1, dst1_l,
+ src65_l, filt2, dst1_l, dst1_l);
+ dst1_l = __lsx_vdp2add_h_bu_b(dst1_l, src87_l, filt3);
+ dst2_l = __lsx_vdp2_h_bu_b(src32_l, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_l, src54_l, filt1, dst2_l,
+ src76_l, filt2, dst2_l, dst2_l);
+ dst2_l = __lsx_vdp2add_h_bu_b(dst2_l, src98_l, filt3);
+ dst3_l = __lsx_vdp2_h_bu_b(src43_l, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_l, src65_l, filt1, dst3_l,
+ src87_l, filt2, dst3_l, dst3_l);
+ dst3_l = __lsx_vdp2add_h_bu_b(dst3_l, src109_l, filt3);
+
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ __lsx_vst(dst0_l, dst_tmp, 16);
+ dst_tmp += dst_stride;
+ __lsx_vst(dst1_r, dst_tmp, 0);
+ __lsx_vst(dst1_l, dst_tmp, 16);
+ dst_tmp += dst_stride;
+ __lsx_vst(dst2_r, dst_tmp, 0);
+ __lsx_vst(dst2_l, dst_tmp, 16);
+ dst_tmp += dst_stride;
+ __lsx_vst(dst3_r, dst_tmp, 0);
+ __lsx_vst(dst3_l, dst_tmp, 16);
+ dst_tmp += dst_stride;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+ src += 16;
+ dst += 16;
+ }
+}
+
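+/* Fixed-width vertical wrappers. The 24-pixel case covers the extra 8
+ * columns with the standalone 8-wide vertical filter. */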
+static void hevc_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+}
+
+static void hevc_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+ hevc_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride,
+ filter, height);
+}
+
+static void hevc_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 32);
+}
+
+static void hevc_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 48);
+}
+
+static void hevc_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter, height, 64);
+}
+
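+/* 2-D (horizontal + vertical) 8-tap filters. The horizontal pass produces
+ * 16-bit intermediates with the shuffle/dot-product scheme above; the
+ * vertical pass accumulates them with 32-bit dot products
+ * (__lsx_vdp2_w_h / __lsx_vdp2add_w_h), shifts right by 6 and packs the
+ * results back to 16 bits. */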
+static void hevc_hv_8t_4w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask1, mask2, mask3;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+ __m128i dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+ __m128i mask0;
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 16);
+
+ src -= src_stride_3x + 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask0, src3, src0, mask1, src3, src0,
+ mask2, src3, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask0, src4, src1, mask1, src4, src1,
+ mask2, src4, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask0, src5, src2, mask1, src5, src2,
+ mask2, src5, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask0, src6, src3, mask1, src6, src3,
+ mask2, src6, src3, mask3, vec12, vec13, vec14, vec15);
+ dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
+ dst30, dst30);
+ dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
+ dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
+ dst41, dst41);
+ dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
+ dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
+ dst52, dst52);
+ dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
+ dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
+ dst63, dst63);
+ dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
+ dst32_r = __lsx_vilvl_h(dst63, dst52);
+ dst65_r = __lsx_vilvh_h(dst63, dst52);
+ dst66 = __lsx_vreplvei_d(dst63, 1);
+
+ for (loop_cnt = height >> 2; loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+        DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask0, src9, src7, mask1, src9,
+                  src7, mask2, src9, src7, mask3, vec0, vec1, vec2, vec3);
+        DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask0, src10, src8, mask1,
+                  src10, src8, mask2, src10, src8, mask3,
+                  vec4, vec5, vec6, vec7);
+
+ dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
+ dst97, dst97);
+ dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
+ dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
+ filt2, dst108, dst108);
+ dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
+ dst109_r = __lsx_vilvh_h(dst108, dst97);
+ dst66 = __lsx_vreplvei_d(dst97, 1);
+ dst98_r = __lsx_vilvl_h(dst66, dst108);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+ filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
+ filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
+ filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
+ filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
+ __lsx_vstelm_d(dst0_r, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst0_r, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2_r, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2_r, dst, 0, 1);
+ dst += dst_stride;
+
+ dst10_r = dst54_r;
+ dst32_r = dst76_r;
+ dst54_r = dst98_r;
+ dst21_r = dst65_r;
+ dst43_r = dst87_r;
+ dst65_r = dst109_r;
+ dst66 = __lsx_vreplvei_d(dst108, 1);
+ }
+}
+
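+/* Generic 2-D kernel for widths that are multiples of 8: each 8-column
+ * strip keeps a sliding window of eight horizontally filtered rows
+ * (dst0..dst7) and produces one output row per inner iteration. */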
+static void hevc_hv_8t_8multx1mult_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height,
+ int32_t width)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask1, mask2, mask3;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i dst0_r, dst0_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+ __m128i mask0 = {0x403030202010100, 0x807070606050504};
+
+ src -= src_stride_3x + 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (cnt = width >> 3; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+
+ /* row 0 row 1 row 2 row 3 */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
+ dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
+ dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
+
+ /* row 4 row 5 row 6 */
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+ src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+ src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+ dst4, dst4);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+ dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
+ dst5, dst5);
+ dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
+ dst6, dst6);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ src_tmp += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+ filt2, dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+ dst6, dst10_r, dst32_r, dst54_r, dst76_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+ dst6, dst10_l, dst32_l, dst54_l, dst76_l);
+
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+ dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+ dst76_l, filt_h3, dst0_r, dst0_l);
+ dst0_r = __lsx_vsrai_w(dst0_r, 6);
+ dst0_l = __lsx_vsrai_w(dst0_l, 6);
+
+ dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ dst_tmp += dst_stride;
+
+ dst0 = dst1;
+ dst1 = dst2;
+ dst2 = dst3;
+ dst3 = dst4;
+ dst4 = dst5;
+ dst5 = dst6;
+ dst6 = dst7;
+ }
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 8);
+}
+
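+/* Width 12: the left 8 columns follow the 8-column kernel above, then the
+ * remaining 4 columns are filtered with the narrow masks loaded from
+ * ff_hevc_mask_arr + 16, two rows packed per register. */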
+static void hevc_hv_8t_12w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i filter_vec;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
+ __m128i dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
+ __m128i dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
+
+ src -= src_stride_3x + 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+
+ /* row 0 row 1 row 2 row 3 */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0, src0,
+ mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1, src1,
+ mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2, src2,
+ mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3, src3,
+ mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ dst0 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst0, vec2, filt2,
+ dst0, dst0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt3);
+ dst1 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec5, filt1, dst1, vec6, filt2,
+ dst1, dst1);
+ dst1 = __lsx_vdp2add_h_bu_b(dst1, vec7, filt3);
+ dst2 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec9, filt1, dst2, vec10, filt2,
+ dst2, dst2);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec11, filt3);
+ dst3 = __lsx_vdp2_h_bu_b(vec12, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec13, filt1, dst3, vec14, filt2,
+ dst3, dst3);
+ dst3 = __lsx_vdp2add_h_bu_b(dst3, vec15, filt3);
+
+ /* row 4 row 5 row 6 */
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4, src4,
+ mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5, src5,
+ mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6, src6,
+ mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ dst4 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst4, vec2, filt2,
+ dst4, dst4);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec3, filt3);
+ dst5 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec5, filt1, dst5, vec6, filt2,
+ dst5, dst5);
+ dst5 = __lsx_vdp2add_h_bu_b(dst5, vec7, filt3);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst6, vec9, filt1, dst6, vec10, filt2,
+ dst6, dst6);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ src_tmp += src_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2, filt2,
+ dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+ dst10_r, dst32_r, dst54_r, dst76_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+ dst10_l, dst32_l, dst54_l, dst76_l);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst0_r, dst54_r, filt_h2, dst0_l, dst54_l, filt_h2,
+ dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l, dst76_l,
+ filt_h3, dst0_r, dst0_l);
+ dst0_r = __lsx_vsrai_w(dst0_r, 6);
+ dst0_l = __lsx_vsrai_w(dst0_l, 6);
+
+ dst0_r = __lsx_vpickev_h(dst0_l, dst0_r);
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ dst_tmp += dst_stride;
+
+ dst0 = dst1;
+ dst1 = dst2;
+ dst2 = dst3;
+ dst3 = dst4;
+ dst4 = dst5;
+ dst5 = dst6;
+ dst6 = dst7;
+ }
+ src += 8;
+ dst += 8;
+
+ mask4 = __lsx_vld(ff_hevc_mask_arr, 16);
+ DUP2_ARG2(__lsx_vaddi_bu, mask4, 2, mask4, 4, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask4, 6);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src0, mask4, src3, src0, mask5, src3, src0,
+ mask6, src3, src0, mask7, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src1, mask4, src4, src1, mask5, src4, src1,
+ mask6, src4, src1, mask7, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src2, mask4, src5, src2, mask5, src5, src2,
+ mask6, src5, src2, mask7, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src3, mask4, src6, src3, mask5, src6, src3,
+ mask6, src6, src3, mask7, vec12, vec13, vec14, vec15);
+ dst30 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst30, vec1, filt1, dst30, vec2, filt2,
+ dst30, dst30);
+ dst30 = __lsx_vdp2add_h_bu_b(dst30, vec3, filt3);
+ dst41 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst41, vec5, filt1, dst41, vec6, filt2,
+ dst41, dst41);
+ dst41 = __lsx_vdp2add_h_bu_b(dst41, vec7, filt3);
+ dst52 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst52, vec9, filt1, dst52, vec10, filt2,
+ dst52, dst52);
+ dst52 = __lsx_vdp2add_h_bu_b(dst52, vec11, filt3);
+ dst63 = __lsx_vdp2_h_bu_b(vec12, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst63, vec13, filt1, dst63, vec14, filt2,
+ dst63, dst63);
+ dst63 = __lsx_vdp2add_h_bu_b(dst63, vec15, filt3);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst41, dst30, dst52, dst41, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst41, dst30, dst52, dst41, dst43_r, dst54_r);
+ dst32_r = __lsx_vilvl_h(dst63, dst52);
+ dst65_r = __lsx_vilvh_h(dst63, dst52);
+
+ dst66 = __lsx_vreplvei_d(dst63, 1);
+
+ for (loop_cnt = height >> 2; loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src9, src7, mask4, src9, src7, mask5, src9,
+ src7, mask6, src9, src7, mask7, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src10, src8, mask4, src10, src8, mask5, src10,
+ src8, mask6, src10, src8, mask7, vec4, vec5, vec6, vec7);
+ dst97 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst97, vec1, filt1, dst97, vec2, filt2,
+ dst97, dst97);
+ dst97 = __lsx_vdp2add_h_bu_b(dst97, vec3, filt3);
+ dst108 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst108, vec5, filt1, dst108, vec6,
+ filt2, dst108, dst108);
+ dst108 = __lsx_vdp2add_h_bu_b(dst108, vec7, filt3);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst97, dst66, dst108, dst97, dst76_r, dst87_r);
+ dst109_r = __lsx_vilvh_h(dst108, dst97);
+ dst66 = __lsx_vreplvei_d(dst97, 1);
+ dst98_r = __lsx_vilvl_h(dst66, dst108);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+ filt_h0, dst43_r, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst1_r, dst43_r,
+ filt_h1, dst2_r, dst54_r, filt_h1, dst3_r, dst65_r, filt_h1,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst54_r, filt_h2, dst1_r, dst65_r,
+ filt_h2, dst2_r, dst76_r, filt_h2, dst3_r, dst87_r, filt_h2,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst1_r, dst87_r,
+ filt_h3, dst2_r, dst98_r, filt_h3, dst3_r, dst109_r, filt_h3,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r,
+ dst0_r, dst2_r);
+ __lsx_vstelm_d(dst0_r, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst0_r, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2_r, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(dst2_r, dst, 0, 1);
+ dst += dst_stride;
+
+ dst10_r = dst54_r;
+ dst32_r = dst76_r;
+ dst54_r = dst98_r;
+ dst21_r = dst65_r;
+ dst43_r = dst87_r;
+ dst65_r = dst109_r;
+ dst66 = __lsx_vreplvei_d(dst108, 1);
+ }
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 64);
+}
+
+static void hevc_hz_4t_32w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2;
+ __m128i filt0, filt1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, mask2, mask3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3;
+
+ src -= 1;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 10);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+ src2 = __lsx_vld(src, 24);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src2, src2, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask1, src2, src2, mask1,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_vt_4t_16w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src10_r, src32_r, src21_r, src43_r;
+ __m128i src10_l, src32_l, src21_l, src43_l;
+ __m128i dst0_r, dst1_r, dst0_l, dst1_l;
+ __m128i filt0, filt1;
+
+ src -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src, 0);
+ src4 = __lsx_vldx(src, src_stride);
+ src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+ src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ dst += dst_stride;
+
+ src5 = __lsx_vld(src, 0);
+ src2 = __lsx_vldx(src, src_stride);
+ src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+ src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_vt_4t_24w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10, src11;
+ __m128i src10_r, src32_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src21_l, src43_l;
+ __m128i dst0_l, dst1_l;
+ __m128i filt0, filt1;
+
+ src -= src_stride;
+ _src = src + 16;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ src6 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+ src += src_stride_3x;
+ _src += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+ src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+ dst2_r, dst3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
+ src109_r, filt1, dst2_r, dst3_r);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ __lsx_vst(dst2_r, dst, 32);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ __lsx_vst(dst3_r, dst, 32);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+ DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l, src10_l,
+ filt1, dst1_r, src21_r, filt1, dst1_l, src21_l, filt1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+ dst2_r, dst3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r, src87_r,
+ filt1, dst2_r, dst3_r);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ __lsx_vst(dst2_r, dst, 32);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ __lsx_vst(dst3_r, dst, 32);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_vt_4t_32w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter,
+ int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10, src11;
+ __m128i src10_r, src32_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src76_l, src98_l;
+ __m128i src21_l, src43_l, src87_l, src109_l;
+ __m128i dst0_l, dst1_l, dst2_l, dst3_l;
+ __m128i filt0, filt1;
+
+ src -= src_stride;
+ _src = src + 16;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ src6 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+ src += src_stride_3x;
+ _src += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+ DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+ src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
+ filt0, src87_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst2_l, src98_l,
+ filt1, dst3_r, src109_r, filt1, dst3_l, src109_l, filt1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ __lsx_vst(dst2_r, dst, 32);
+ __lsx_vst(dst2_l, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ __lsx_vst(dst3_r, dst, 32);
+ __lsx_vst(dst3_l, dst, 48);
+ dst += dst_stride;
+
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+ DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvh_b, src11, src10, src8, src11, src76_l, src87_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+ src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src98_l, filt0, src109_r,
+ filt0, src109_l, filt0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst2_l, src76_l,
+ filt1, dst3_r, src87_r, filt1, dst3_l, src87_l, filt1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst0_l, dst, 16);
+ __lsx_vst(dst2_r, dst, 32);
+ __lsx_vst(dst2_l, dst, 48);
+ dst += dst_stride;
+ __lsx_vst(dst1_r, dst, 0);
+ __lsx_vst(dst1_l, dst, 16);
+ __lsx_vst(dst3_r, dst, 32);
+ __lsx_vst(dst3_l, dst, 48);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_hv_4t_8x2_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i src0, src1, src2, src3, src4;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src4 = __lsx_vldx(src, src_stride_4x);
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vst(dst1_r, dst + dst_stride, 0);
+}
+
+static void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride,
+ int16_t *dst, int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y, int32_t width8mult)
+{
+ int32_t cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+ src += src_stride_4x;
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += (8 - src_stride_4x);
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+ dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+ dst0_r, dst1_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst2_l, dst2_r, dst3_l, dst3_r,
+ dst2_r, dst3_r);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride_x);
+ __lsx_vstx(dst2_r, dst, dst_stride_2x);
+ __lsx_vstx(dst3_r, dst, dst_stride_3x);
+ dst += 8;
+ }
+}
+
+static void hevc_hv_4t_8x6_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+ __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+ __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+ __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+ src += src_stride_4x;
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+ mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+ mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
+ mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
+ mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
+ DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1,
+ vec16, vec17);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst2, vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0,
+ vec14, filt0, dst4, dst5, dst6, dst7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
+ vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
+ dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
+ dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_r, dst21_r, dst32_r, dst43_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_l, dst21_l, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_r, dst65_r, dst76_r, dst87_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_l, dst65_l, dst76_l, dst87_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+ filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+ filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+ dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6, dst0_r,
+ dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6, dst2_r,
+ dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6, dst4_r,
+ dst4_l, dst5_r, dst5_l);
+
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+ dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
+
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride_2x);
+ dst += dst_stride_2x;
+ __lsx_vst(dst2_r, dst, 0);
+ __lsx_vstx(dst3_r, dst, dst_stride_2x);
+ dst += dst_stride_2x;
+ __lsx_vst(dst4_r, dst, 0);
+ __lsx_vstx(dst5_r, dst, dst_stride_2x);
+}
+
+static void hevc_hv_4t_8multx4mult_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height,
+ int32_t width8mult)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ for (loop_cnt = height >> 2; loop_cnt--;) {
+ src3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+ filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
+ dst4, dst5, dst6);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+ dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+ dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+ dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
+ __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
+ __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
+ dst_tmp += dst_stride_2x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dst2 = dst6;
+ }
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void hevc_hv_4t_8w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+
+ if (2 == height) {
+ hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y);
+ } else if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, 1);
+ } else if (6 == height) {
+ hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y);
+ } else if (0 == (height & 0x03)) {
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 1);
+ }
+}
+
+static void hevc_hv_4t_12w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+ uint32_t loop_cnt;
+ uint8_t *src_tmp;
+ int16_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_2x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride_x;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, dst0;
+ __m128i dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
+ __m128i dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+ filt1, dst5, vec5, filt1, dst6, vec7, filt1, dst3,
+ dst4, dst5, dst6);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+ dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ __lsx_vstx(dst1_r, dst_tmp, dst_stride_x);
+ __lsx_vstx(dst2_r, dst_tmp, dst_stride_2x);
+ __lsx_vstx(dst3_r, dst_tmp, dst_stride_3x);
+ dst_tmp += dst_stride_2x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dst2 = dst6;
+ }
+
+ src += 8;
+ dst += 8;
+
+ mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
+ mask3 = __lsx_vaddi_bu(mask2, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src += src_stride_3x;
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
+ dst10, dst21);
+ dst10_r = __lsx_vilvl_h(dst21, dst10);
+ dst21_r = __lsx_vilvh_h(dst21, dst10);
+ dst22 = __lsx_vreplvei_d(dst21, 1);
+
+ for (loop_cnt = 2; loop_cnt--;) {
+ src3 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
+ src6 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP2_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src8, src4, mask2, src8, src4, mask3,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src10, src6, mask2, src10, src6, mask3,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst73, dst84, dst95, dst106);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
+ filt1, dst95, vec5, filt1, dst106, vec7, filt1, dst73,
+ dst84, dst95, dst106);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst73, dst22, dst84, dst73, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
+ DUP2_ARG2(__lsx_vilvl_h, dst95, dst84, dst106, dst95, dst54_r, dst65_r);
+ dst109_r = __lsx_vilvh_h(dst106, dst95);
+ dst22 = __lsx_vreplvei_d(dst73, 1);
+ dst76_r = __lsx_vilvl_h(dst22, dst106);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+ filt_h0, dst43_r, filt_h0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
+ filt_h0, dst87_r, filt_h0, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vdp2add_w_h, tmp0, dst32_r, filt_h1, tmp1, dst43_r,
+ filt_h1, tmp2, dst54_r, filt_h1, tmp3, dst65_r, filt_h1,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vdp2add_w_h, tmp4, dst76_r, filt_h1, tmp5, dst87_r,
+ filt_h1, tmp6, dst98_r, filt_h1, tmp7, dst109_r, filt_h1,
+ tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vsrai_w, tmp0, 6, tmp1, 6, tmp2, 6, tmp3, 6,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsrai_w, tmp4, 6, tmp5, 6, tmp6, 6, tmp7, 6,
+ tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4,
+ tmp7, tmp6, tmp0, tmp1, tmp2, tmp3);
+
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp2, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp3, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_d(tmp3, dst, 0, 1);
+ dst += dst_stride;
+
+ dst10_r = dst98_r;
+ dst21_r = dst109_r;
+ dst22 = __lsx_vreplvei_d(dst106, 1);
+ }
+}
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+ if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, 2);
+ } else {
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 2);
+ }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 3);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src,
+ int32_t src_stride,
+ int16_t *dst,
+ int32_t dst_stride,
+ const int8_t *filter_x,
+ const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 4);
+}
+
+#define MC_COPY(WIDTH) \
+void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_lsx(int16_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ hevc_copy_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, height); \
+}
+
+MC_COPY(4);
+MC_COPY(6);
+MC_COPY(8);
+MC_COPY(12);
+MC_COPY(16);
+MC_COPY(24);
+MC_COPY(32);
+MC_COPY(48);
+MC_COPY(64);
+
+#undef MC_COPY
+
+#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
+ \
+ hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, \
+ MAX_PB_SIZE, filter, height); \
+}
+
+MC(qpel, h, 4, 8, hz, mx);
+MC(qpel, h, 8, 8, hz, mx);
+MC(qpel, h, 12, 8, hz, mx);
+MC(qpel, h, 16, 8, hz, mx);
+MC(qpel, h, 24, 8, hz, mx);
+MC(qpel, h, 32, 8, hz, mx);
+MC(qpel, h, 48, 8, hz, mx);
+MC(qpel, h, 64, 8, hz, mx);
+
+MC(qpel, v, 4, 8, vt, my);
+MC(qpel, v, 8, 8, vt, my);
+MC(qpel, v, 12, 8, vt, my);
+MC(qpel, v, 16, 8, vt, my);
+MC(qpel, v, 24, 8, vt, my);
+MC(qpel, v, 32, 8, vt, my);
+MC(qpel, v, 48, 8, vt, my);
+MC(qpel, v, 64, 8, vt, my);
+
+MC(epel, h, 32, 4, hz, mx);
+
+MC(epel, v, 16, 4, vt, my);
+MC(epel, v, 24, 4, vt, my);
+MC(epel, v, 32, 4, vt, my);
+
+#undef MC
+
+#define MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_lsx(int16_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ \
+ hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, MAX_PB_SIZE, \
+ filter_x, filter_y, height); \
+}
+
+MC_HV(qpel, 4, 8);
+MC_HV(qpel, 8, 8);
+MC_HV(qpel, 12, 8);
+MC_HV(qpel, 16, 8);
+MC_HV(qpel, 24, 8);
+MC_HV(qpel, 32, 8);
+MC_HV(qpel, 48, 8);
+MC_HV(qpel, 64, 8);
+
+MC_HV(epel, 8, 4);
+MC_HV(epel, 12, 4);
+MC_HV(epel, 16, 4);
+MC_HV(epel, 24, 4);
+MC_HV(epel, 32, 4);
+
+#undef MC_HV
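As a readability aid, here is a hand-expanded sketch of one instantiation: MC_HV(epel, 8, 4) above generates roughly the wrapper below (the real code is produced by the preprocessor; MAX_PB_SIZE is the intermediate-buffer stride defined in libavcodec/hevcdsp.h):

    /* 4-tap (epel) horizontal+vertical interpolation, 8-pixel-wide blocks */
    void ff_hevc_put_hevc_epel_hv8_8_lsx(int16_t *dst, uint8_t *src,
                                         ptrdiff_t src_stride, int height,
                                         intptr_t mx, intptr_t my, int width)
    {
        const int8_t *filter_x = ff_hevc_epel_filters[mx - 1];
        const int8_t *filter_y = ff_hevc_epel_filters[my - 1];

        hevc_hv_4t_8w_lsx(src, src_stride, dst, MAX_PB_SIZE,
                          filter_x, filter_y, height);
    }

Each expansion is the same thin adapter from the common ff_hevc_put_hevc_* signature to the width- and tap-specialized helpers defined earlier in this file.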
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
new file mode 100644
index 0000000000..3259e03f13
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LSX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_lsx(int16_t *dst, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+MC(pel, pixels, 4);
+MC(pel, pixels, 6);
+MC(pel, pixels, 8);
+MC(pel, pixels, 12);
+MC(pel, pixels, 16);
+MC(pel, pixels, 24);
+MC(pel, pixels, 32);
+MC(pel, pixels, 48);
+MC(pel, pixels, 64);
+
+MC(qpel, h, 4);
+MC(qpel, h, 8);
+MC(qpel, h, 12);
+MC(qpel, h, 16);
+MC(qpel, h, 24);
+MC(qpel, h, 32);
+MC(qpel, h, 48);
+MC(qpel, h, 64);
+
+MC(qpel, v, 4);
+MC(qpel, v, 8);
+MC(qpel, v, 12);
+MC(qpel, v, 16);
+MC(qpel, v, 24);
+MC(qpel, v, 32);
+MC(qpel, v, 48);
+MC(qpel, v, 64);
+
+MC(qpel, hv, 4);
+MC(qpel, hv, 8);
+MC(qpel, hv, 12);
+MC(qpel, hv, 16);
+MC(qpel, hv, 24);
+MC(qpel, hv, 32);
+MC(qpel, hv, 48);
+MC(qpel, hv, 64);
+
+MC(epel, h, 32);
+
+MC(epel, v, 16);
+MC(epel, v, 24);
+MC(epel, v, 32);
+
+MC(epel, hv, 8);
+MC(epel, hv, 12);
+MC(epel, hv, 16);
+MC(epel, hv, 24);
+MC(epel, hv, 32);
+
+#undef MC
+
+#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1
* [FFmpeg-devel] [PATCH v2 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
2022-02-17 11:11 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX Hao Chen
@ 2022-02-17 11:11 ` Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi " Hao Chen
` (2 subsequent siblings)
5 siblings, 0 replies; 15+ messages in thread
From: Hao Chen @ 2022-02-17 11:11 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lu Wang
From: Lu Wang <wanglu@loongson.cn>
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 110fps
after : 124fps
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
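A note on the transform kernels added below: HEVC_IDCT4x4_COL in hevc_idct_lsx.c vectorizes the standard HEVC 4-point inverse-transform butterfly. For orientation only, the per-column scalar arithmetic it implements is sketched here (lane layout aside); the shift is 7 for the first pass and 12 for the second at 8-bit depth, and the rounding and clamping correspond to the __lsx_vsrari_w/__lsx_vsat_w pair in the macro:

    /* Scalar sketch of one 4-point inverse-transform column.
     * av_clip_int16() is from libavutil/common.h. */
    static void idct4_col_ref(const int16_t x[4], int16_t y[4], int shift)
    {
        int rnd = 1 << (shift - 1);
        int e0 = 64 * x[0] + 64 * x[2];   /* even part */
        int e1 = 64 * x[0] - 64 * x[2];
        int o0 = 83 * x[1] + 36 * x[3];   /* odd part */
        int o1 = 36 * x[1] - 83 * x[3];

        y[0] = av_clip_int16((e0 + o0 + rnd) >> shift);
        y[1] = av_clip_int16((e1 + o1 + rnd) >> shift);
        y[2] = av_clip_int16((e1 - o1 + rnd) >> shift);
        y[3] = av_clip_int16((e0 - o0 + rnd) >> shift);
    }

The 8-, 16- and 32-point kernels follow the same even/odd decomposition, with their coefficient rows held in the gt8x8_cnst, gt16x16_cnst and gt32x32_cnst tables.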
libavcodec/loongarch/Makefile | 2 +
libavcodec/loongarch/hevc_idct_lsx.c | 842 ++++++
libavcodec/loongarch/hevc_lpf_sao_lsx.c | 2485 +++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 19 +
libavcodec/loongarch/hevcdsp_lsx.h | 26 +
5 files changed, 3374 insertions(+)
create mode 100644 libavcodec/loongarch/hevc_idct_lsx.c
create mode 100644 libavcodec/loongarch/hevc_lpf_sao_lsx.c
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 9f416d0c6e..cfc8e3aaff 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -26,3 +26,5 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
loongarch/vp9_lpf_lsx.o \
loongarch/vp9_idct_lsx.o
LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
+ loongarch/hevc_idct_lsx.o \
+ loongarch/hevc_lpf_sao_lsx.o
diff --git a/libavcodec/loongarch/hevc_idct_lsx.c b/libavcodec/loongarch/hevc_idct_lsx.c
new file mode 100644
index 0000000000..2193b27546
--- /dev/null
+++ b/libavcodec/loongarch/hevc_idct_lsx.c
@@ -0,0 +1,842 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
+ 64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
+};
+
+static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
+ 64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
+ 64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
+ 64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
+ 64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
+};
+
+static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
+ 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
+ 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
+ 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
+ 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
+ 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
+ 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
+ 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
+ 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
+ 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
+ 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
+ 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
+ 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
+ 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
+ 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
+ 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
+ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+};
+
+static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
+ 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
+ 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
+ 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
+ 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
+};
+
+static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
+ 89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
+};
+
+#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \
+ sum0, sum1, sum2, sum3, shift) \
+{ \
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5; \
+ __m128i cnst64 = __lsx_vldi(0x0840); \
+ __m128i cnst83 = __lsx_vldi(0x0853); \
+ __m128i cnst36 = __lsx_vldi(0x0824); \
+ \
+ vec0 = __lsx_vdp2_w_h(in_r0, cnst64); \
+ vec1 = __lsx_vdp2_w_h(in_l0, cnst83); \
+ vec2 = __lsx_vdp2_w_h(in_r1, cnst64); \
+ vec3 = __lsx_vdp2_w_h(in_l1, cnst36); \
+ vec4 = __lsx_vdp2_w_h(in_l0, cnst36); \
+ vec5 = __lsx_vdp2_w_h(in_l1, cnst83); \
+ \
+ sum0 = __lsx_vadd_w(vec0, vec2); \
+ sum1 = __lsx_vsub_w(vec0, vec2); \
+ vec1 = __lsx_vadd_w(vec1, vec3); \
+ vec4 = __lsx_vsub_w(vec4, vec5); \
+ sum2 = __lsx_vsub_w(sum1, vec4); \
+ sum3 = __lsx_vsub_w(sum0, vec1); \
+ sum0 = __lsx_vadd_w(sum0, vec1); \
+ sum1 = __lsx_vadd_w(sum1, vec4); \
+ \
+ sum0 = __lsx_vsrari_w(sum0, shift); \
+ sum1 = __lsx_vsrari_w(sum1, shift); \
+ sum2 = __lsx_vsrari_w(sum2, shift); \
+ sum3 = __lsx_vsrari_w(sum3, shift); \
+ sum0 = __lsx_vsat_w(sum0, 15); \
+ sum1 = __lsx_vsat_w(sum1, 15); \
+ sum2 = __lsx_vsat_w(sum2, 15); \
+ sum3 = __lsx_vsat_w(sum3, 15); \
+}
+
+#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \
+{ \
+ __m128i src0_r, src1_r, src2_r, src3_r; \
+ __m128i src0_l, src1_l, src2_l, src3_l; \
+ __m128i filter0, filter1, filter2, filter3; \
+ __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \
+ __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \
+ __m128i sum0_r, sum1_r, sum2_r, sum3_r; \
+ __m128i sum0_l, sum1_l, sum2_l, sum3_l; \
+ \
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7, \
+ src0_r, src1_r, src2_r, src3_r); \
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7, \
+ src0_l, src1_l, src2_l, src3_l); \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8, \
+ filter, 12, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
+ temp1_r, temp1_l); \
+ \
+ LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+ sum1_l, sum1_r); \
+ sum2_r = sum1_r; \
+ sum2_l = sum1_l; \
+ sum3_r = sum0_r; \
+ sum3_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
+ src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
+ temp3_r, temp3_l); \
+ temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
+ temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
+ sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
+ sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
+ sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
+ \
+ in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
+ in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
+ src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
+ temp5_r, temp5_l); \
+ temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
+ temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
+ sum1_r = __lsx_vadd_w(sum1_r, temp4_r); \
+ sum1_l = __lsx_vadd_w(sum1_l, temp4_l); \
+ sum2_r = __lsx_vsub_w(sum2_r, temp4_r); \
+ sum2_l = __lsx_vsub_w(sum2_l, temp4_l); \
+ \
+ in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
+ in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24, \
+ filter, 28, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
+ temp1_r, temp1_l); \
+ \
+ LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
+ sum1_l, sum1_r); \
+ sum2_r = sum1_r; \
+ sum2_l = sum1_l; \
+ sum3_r = sum0_r; \
+ sum3_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
+ src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
+ temp3_r, temp3_l); \
+ temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
+ temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
+ sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
+ sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
+ sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
+ \
+ in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
+ in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
+ \
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
+ src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
+ temp5_r, temp5_l); \
+ temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
+ temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp4_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp4_l); \
+ sum2_r = __lsx_vadd_w(sum2_r, temp4_r); \
+ sum2_l = __lsx_vadd_w(sum2_l, temp4_l); \
+ \
+ in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
+ in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
+}
+
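+/* One pass of the 16-point inverse DCT over eight interleaved column pairs.
+ * Uses 'buf_ptr', 'filter', 'j' and 'k' from the enclosing function and
+ * stores the output rows symmetrically around the centre of the block. */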
+#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \
+ src4_r, src5_r, src6_r, src7_r, \
+ src0_l, src1_l, src2_l, src3_l, \
+ src4_l, src5_l, src6_l, src7_l, shift) \
+{ \
+ int16_t *ptr0, *ptr1; \
+ __m128i dst0, dst1; \
+ __m128i filter0, filter1, filter2, filter3; \
+ __m128i temp0_r, temp1_r, temp0_l, temp1_l; \
+ __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \
+ __m128i sum3_l, res0_r, res1_r, res0_l, res1_l; \
+ \
+ ptr0 = (buf_ptr + 112); \
+ ptr1 = (buf_ptr + 128); \
+ k = -1; \
+ \
+ for (j = 0; j < 4; j++) \
+ { \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16, \
+ filter, 20, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
+ src4_r, filter2, src4_l, filter2, sum0_r, sum0_l, \
+ sum2_r, sum2_l); \
+ DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2, \
+ sum3_r, sum3_l); \
+ DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l, \
+ src1_l, filter1, sum2_r, src5_r, filter3, sum2_l, \
+ src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \
+ DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l, \
+ src6_l, filter3, sum3_r, sum3_l); \
+ \
+ sum1_r = sum0_r; \
+ sum1_l = sum0_l; \
+ \
+ DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24, \
+ filter, 28, filter0, filter1, filter2, filter3); \
+ filter += 16; \
+ DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0, \
+ temp0_r, temp0_l); \
+ DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l, \
+ src6_l, filter2, sum2_r, sum2_l); \
+ DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2, \
+ temp1_r, temp1_l); \
+ \
+ sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
+ sum3_r = __lsx_vsub_w(temp1_r, sum3_r); \
+ sum3_l = __lsx_vsub_w(temp1_l, sum3_l); \
+ \
+ DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1, \
+ temp0_r, temp0_l); \
+ DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l, \
+ src7_l, filter3, sum3_r, src4_r, filter3, sum3_l, \
+ src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \
+ \
+ sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
+ sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
+ sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
+ \
+ LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \
+ res1_l, res1_r); \
+ dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
+ dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
+ __lsx_vst(dst0, buf_ptr, 0); \
+ __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0); \
+ \
+ LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \
+ res1_l, res1_r); \
+ \
+ dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
+ dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
+ __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
+ __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
+ \
+ k *= -1; \
+ buf_ptr += 16; \
+ } \
+}
+
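+/* Butterfly step on the 32-bit scratch buffer: the running sum is combined
+ * with the vector at 'load_idx', the sum written back at 'load_idx' and the
+ * difference at 'store_idx'. */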
+#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \
+{ \
+ tmp0_r = __lsx_vld(input + load_idx * 8, 0); \
+ tmp0_l = __lsx_vld(input + load_idx * 8, 16); \
+ tmp1_r = sum0_r; \
+ tmp1_l = sum0_l; \
+ sum0_r = __lsx_vadd_w(sum0_r, tmp0_r); \
+ sum0_l = __lsx_vadd_w(sum0_l, tmp0_l); \
+ __lsx_vst(sum0_r, (input + load_idx * 8), 0); \
+ __lsx_vst(sum0_l, (input + load_idx * 8), 16); \
+ tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r); \
+ tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l); \
+ __lsx_vst(tmp1_r, (input + store_idx * 8), 0); \
+ __lsx_vst(tmp1_l, (input + store_idx * 8), 16); \
+}
+
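+/* One pass of the 4-point inverse DST (constants 29/55/74) used for 4x4
+ * intra luma blocks, rounded by 'shift' and saturated to 16 bits. */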
+#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \
+ res0, res1, res2, res3, shift) \
+{ \
+ __m128i vec0, vec1, vec2, vec3; \
+ __m128i cnst74 = __lsx_vldi(0x84a); \
+ __m128i cnst55 = __lsx_vldi(0x837); \
+ __m128i cnst29 = __lsx_vldi(0x81d); \
+ \
+ vec0 = __lsx_vadd_w(in_r0, in_r1); \
+ vec2 = __lsx_vsub_w(in_r0, in_l1); \
+ res0 = __lsx_vmul_w(vec0, cnst29); \
+ res1 = __lsx_vmul_w(vec2, cnst55); \
+ res2 = __lsx_vsub_w(in_r0, in_r1); \
+ vec1 = __lsx_vadd_w(in_r1, in_l1); \
+ res2 = __lsx_vadd_w(res2, in_l1); \
+ vec3 = __lsx_vmul_w(in_l0, cnst74); \
+ res3 = __lsx_vmul_w(vec0, cnst55); \
+ \
+ res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55)); \
+ res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29)); \
+ res2 = __lsx_vmul_w(res2, cnst74); \
+ res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29)); \
+ \
+ res0 = __lsx_vadd_w(res0, vec3); \
+ res1 = __lsx_vadd_w(res1, vec3); \
+ res3 = __lsx_vsub_w(res3, vec3); \
+ \
+ res0 = __lsx_vsrari_w(res0, shift); \
+ res1 = __lsx_vsrari_w(res1, shift); \
+ res2 = __lsx_vsrari_w(res2, shift); \
+ res3 = __lsx_vsrari_w(res3, shift); \
+ res0 = __lsx_vsat_w(res0, 15); \
+ res1 = __lsx_vsat_w(res1, 15); \
+ res2 = __lsx_vsat_w(res2, 15); \
+ res3 = __lsx_vsat_w(res3, 15); \
+}
+
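+/* 4x4 inverse transform: two HEVC_IDCT4x4_COL passes (shift 7, then 12 for
+ * 8-bit content) with a transpose in between, written back to coeffs. */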
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit)
+{
+ __m128i in0, in1;
+ __m128i in_r0, in_l0, in_r1, in_l1;
+ __m128i sum0, sum1, sum2, sum3;
+ __m128i zero = __lsx_vldi(0x00);
+
+ in0 = __lsx_vld(coeffs, 0);
+ in1 = __lsx_vld(coeffs, 16);
+ in_r0 = __lsx_vilvl_h(zero, in0);
+ in_l0 = __lsx_vilvh_h(zero, in0);
+ in_r1 = __lsx_vilvl_h(zero, in1);
+ in_l1 = __lsx_vilvh_h(zero, in1);
+
+ HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
+ LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
+ HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
+
+ /* Pack and transpose */
+ in0 = __lsx_vpickev_h(sum2, sum0);
+ in1 = __lsx_vpickev_h(sum3, sum1);
+ sum0 = __lsx_vilvl_h(in1, in0);
+ sum1 = __lsx_vilvh_h(in1, in0);
+ in0 = __lsx_vilvl_w(sum1, sum0);
+ in1 = __lsx_vilvh_w(sum1, sum0);
+
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 16);
+}
+
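+/* 8x8 inverse transform: two HEVC_IDCT8x8_COL passes separated by 8x8
+ * transposes, operating in place on coeffs. */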
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit)
+{
+ const int16_t *filter = &gt8x8_cnst[0];
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32,
+ coeffs, 48, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96,
+ coeffs, 112, in4, in5, in6, in7);
+ HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 16);
+ __lsx_vst(in2, coeffs, 32);
+ __lsx_vst(in3, coeffs, 48);
+ __lsx_vst(in4, coeffs, 64);
+ __lsx_vst(in5, coeffs, 80);
+ __lsx_vst(in6, coeffs, 96);
+ __lsx_vst(in7, coeffs, 112);
+}
+
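+/* 16x16 inverse transform: each pass handles the block as two 16x8 halves
+ * through HEVC_IDCT16x16_COL with a 256-entry temporary buffer; the final
+ * load/store sequence transposes the result back into coeffs. */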
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit)
+{
+ int16_t i, j, k;
+ int16_t buf[256];
+ int16_t *buf_ptr = &buf[0];
+ int16_t *src = coeffs;
+ const int16_t *filter = &gt16x16_cnst[0];
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+ __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+
+ for (i = 2; i--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ DUP4_ARG2(__lsx_vld, src, 256, src, 288, src, 320, src, 352,
+ in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, src, 384, src, 416, src, 448, src, 480,
+ in12, in13, in14, in15);
+
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_l, src1_l, src2_l, src3_l);
+ DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_l, src5_l, src6_l, src7_l);
+
+ HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+ src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+ src4_l, src5_l, src6_l, src7_l, 7);
+
+ src += 8;
+ buf_ptr = (&buf[0] + 8);
+ filter = &gt16x16_cnst[0];
+ }
+
+ src = &buf[0];
+ buf_ptr = coeffs;
+ filter = &gt16x16_cnst[0];
+
+ for (i = 2; i--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
+ in0, in8, in1, in9);
+ DUP4_ARG2(__lsx_vld, src, 64, src, 80, src, 96, src, 112,
+ in2, in10, in3, in11);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 144, src, 160, src, 176,
+ in4, in12, in5, in13);
+ DUP4_ARG2(__lsx_vld, src, 192, src, 208, src, 224, src, 240,
+ in6, in14, in7, in15);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
+ src0_l, src1_l, src2_l, src3_l);
+ DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
+ src4_l, src5_l, src6_l, src7_l);
+ HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
+ src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
+ src4_l, src5_l, src6_l, src7_l, 12);
+
+ src += 128;
+ buf_ptr = coeffs + 8;
+ filter = &gt16x16_cnst[0];
+ }
+
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ __lsx_vst(vec0, coeffs, 0);
+ __lsx_vst(vec1, coeffs, 32);
+ __lsx_vst(vec2, coeffs, 64);
+ __lsx_vst(vec3, coeffs, 96);
+ __lsx_vst(vec4, coeffs, 128);
+ __lsx_vst(vec5, coeffs, 160);
+ __lsx_vst(vec6, coeffs, 192);
+ __lsx_vst(vec7, coeffs, 224);
+
+ src = coeffs + 8;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ src = coeffs + 128;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in8, in9, in10, in11);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in12, in13, in14, in15);
+
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+ LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ src = coeffs + 8;
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+
+ src = coeffs + 136;
+ DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
+ in4, in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ __lsx_vst(vec0, src, 0);
+ __lsx_vst(vec1, src, 32);
+ __lsx_vst(vec2, src, 64);
+ __lsx_vst(vec3, src, 96);
+ __lsx_vst(vec4, src, 128);
+ __lsx_vst(vec5, src, 160);
+ __lsx_vst(vec6, src, 192);
+ __lsx_vst(vec7, src, 224);
+}
+
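+/* One 8-column slice of the 32-point inverse transform. The even part
+ * (coefficients 0/8/16/24 and 4/12/20/28) is accumulated into a 64-byte
+ * aligned 32-bit scratch buffer, the odd coefficients are folded in, and
+ * the result is rounded by 'round' and narrowed to 16 bits into coeffs. */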
+static void hevc_idct_8x32_column_lsx(int16_t *coeffs, int32_t buf_pitch,
+ uint8_t round)
+{
+ uint8_t i;
+ int32_t buf_pitch_2 = buf_pitch << 1;
+ int32_t buf_pitch_4 = buf_pitch << 2;
+ int32_t buf_pitch_8 = buf_pitch << 3;
+ int32_t buf_pitch_16 = buf_pitch << 4;
+
+ const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
+ const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
+ const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
+ const int16_t *filter_ptr3 = &gt8x8_cnst[0];
+ int16_t *src0 = (coeffs + buf_pitch);
+ int16_t *src1 = (coeffs + buf_pitch_2);
+ int16_t *src2 = (coeffs + buf_pitch_4);
+ int16_t *src3 = (coeffs);
+ int32_t tmp_buf[8 * 32 + 15];
+ int32_t *tmp_buf_ptr = tmp_buf + 15;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
+ __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
+ __m128i filter0, filter1, filter2, filter3;
+ __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
+
+ /* Align pointer to 64 byte boundary */
+ tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+ /* process coeff 4, 12, 20, 28 */
+ in0 = __lsx_vld(src2, 0);
+ in1 = __lsx_vld(src2 + buf_pitch_8, 0);
+ in2 = __lsx_vld(src2 + buf_pitch_16, 0);
+ in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0);
+ in4 = __lsx_vld(src3, 0);
+ in5 = __lsx_vld(src3 + buf_pitch_8, 0);
+ in6 = __lsx_vld(src3 + buf_pitch_16, 0);
+ in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0);
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5,
+ src0_l, src1_l, src2_l, src3_l);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 4);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 0);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 16);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 8);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 32);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 48);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 20);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 64);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 80);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr2, 24);
+ filter1 = __lsx_vldrepl_w(filter_ptr2, 28);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ __lsx_vst(sum0_r, tmp_buf_ptr, 96);
+ __lsx_vst(sum0_l, tmp_buf_ptr, 112);
+
+ /* process coeff 0, 8, 16, 24 */
+ filter0 = __lsx_vldrepl_w(filter_ptr3, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr3, 4);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+ src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+ sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+ sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7);
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4);
+
+ filter0 = __lsx_vldrepl_w(filter_ptr3, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr3, 20);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
+ src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
+ sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
+ sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6);
+ HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5);
+
+ /* process coeff 2 6 10 14 18 22 26 30 */
+ in0 = __lsx_vld(src1, 0);
+ in1 = __lsx_vld(src1 + buf_pitch_4, 0);
+ in2 = __lsx_vld(src1 + buf_pitch_8, 0);
+ in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0);
+ in4 = __lsx_vld(src1 + buf_pitch_16, 0);
+ in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0);
+ in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0);
+ in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0);
+
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_l, src1_l, src2_l, src3_l);
+
+ /* loop for all columns of constants */
+ for (i = 0; i < 8; i++) {
+ /* processing single column of constants */
+ filter0 = __lsx_vldrepl_w(filter_ptr1, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr1, 4);
+ filter2 = __lsx_vldrepl_w(filter_ptr1, 8);
+ filter3 = __lsx_vldrepl_w(filter_ptr1, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+
+ tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0);
+ tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16);
+ tmp1_r = tmp0_r;
+ tmp1_l = tmp0_l;
+ tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+ tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+ tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+ tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+ __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0);
+ __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16);
+ __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0);
+ __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16);
+
+ filter_ptr1 += 8;
+ }
+
+ /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
+ in0 = __lsx_vld(src0, 0);
+ in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+ in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+ in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+ in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+ in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+ in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+ in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+ src0 += 16 * buf_pitch;
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_r, src1_r, src2_r, src3_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src0_l, src1_l, src2_l, src3_l);
+ in0 = __lsx_vld(src0, 0);
+ in1 = __lsx_vld(src0 + buf_pitch_2, 0);
+ in2 = __lsx_vld(src0 + buf_pitch_4, 0);
+ in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
+ in4 = __lsx_vld(src0 + buf_pitch_8, 0);
+ in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
+ in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
+ in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
+
+ DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src4_r, src5_r, src6_r, src7_r);
+ DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
+ src4_l, src5_l, src6_l, src7_l);
+
+ /* loop for all columns of filter constants */
+ for (i = 0; i < 16; i++) {
+ /* processing single column of constants */
+ filter0 = __lsx_vldrepl_w(filter_ptr0, 0);
+ filter1 = __lsx_vldrepl_w(filter_ptr0, 4);
+ filter2 = __lsx_vldrepl_w(filter_ptr0, 8);
+ filter3 = __lsx_vldrepl_w(filter_ptr0, 12);
+ sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
+ tmp1_r = sum0_r;
+ tmp1_l = sum0_l;
+
+ filter0 = __lsx_vldrepl_w(filter_ptr0, 16);
+ filter1 = __lsx_vldrepl_w(filter_ptr0, 20);
+ filter2 = __lsx_vldrepl_w(filter_ptr0, 24);
+ filter3 = __lsx_vldrepl_w(filter_ptr0, 28);
+ sum0_r = __lsx_vdp2_w_h(src4_r, filter0);
+ sum0_l = __lsx_vdp2_w_h(src4_l, filter0);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2);
+ sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3);
+ sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3);
+ sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
+ sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
+
+ tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0);
+ tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16);
+ tmp1_r = tmp0_r;
+ tmp1_l = tmp0_l;
+ tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
+ tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
+ sum1_r = __lsx_vreplgr2vr_w(round);
+ tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r);
+ tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r);
+ in0 = __lsx_vpackev_d(tmp0_l, tmp0_r);
+ __lsx_vst(in0, (coeffs + i * buf_pitch), 0);
+ tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
+ tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
+ tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r);
+ tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r);
+ in0 = __lsx_vpackev_d(tmp1_l, tmp1_r);
+ __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0);
+
+ filter_ptr0 += 16;
+ }
+}
+
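+/* Transpose a 32x8 region of coeffs into a contiguous 8x32 temporary
+ * buffer, one 8x8 tile at a time. */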
+static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
+{
+ uint8_t i;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (i = 0; i < 4; i++) {
+ DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128,
+ coeffs, 192, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384,
+ coeffs, 448, in4, in5, in6, in7);
+ coeffs += 8;
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ __lsx_vst(in0, tmp_buf, 0);
+ __lsx_vst(in1, tmp_buf, 16);
+ __lsx_vst(in2, tmp_buf, 32);
+ __lsx_vst(in3, tmp_buf, 48);
+ __lsx_vst(in4, tmp_buf, 64);
+ __lsx_vst(in5, tmp_buf, 80);
+ __lsx_vst(in6, tmp_buf, 96);
+ __lsx_vst(in7, tmp_buf, 112);
+ tmp_buf += 64;
+ }
+}
+
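+/* Inverse of the transpose above: move the 8x32 temporary buffer back into
+ * a 32x8 region of coeffs. */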
+static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
+{
+ uint8_t i;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ for (i = 0; i < 4; i++) {
+ DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32,
+ tmp_buf, 48, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96,
+ tmp_buf, 112, in4, in5, in6, in7);
+ tmp_buf += 64;
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ __lsx_vst(in0, coeffs, 0);
+ __lsx_vst(in1, coeffs, 64);
+ __lsx_vst(in2, coeffs, 128);
+ __lsx_vst(in3, coeffs, 192);
+ __lsx_vst(in4, coeffs, 256);
+ __lsx_vst(in5, coeffs, 320);
+ __lsx_vst(in6, coeffs, 384);
+ __lsx_vst(in7, coeffs, 448);
+ coeffs += 8;
+ }
+}
+
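+/* 32x32 inverse transform: a column pass over four 8-column slices
+ * (round 7), then a row pass that transposes each 32x8 strip to 8x32,
+ * reuses the column routine with round 12 and transposes back. */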
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit)
+{
+ uint8_t row_cnt, col_cnt;
+ int16_t *src = coeffs;
+ int16_t tmp_buf[8 * 32 + 31];
+ int16_t *tmp_buf_ptr = tmp_buf + 31;
+ uint8_t round;
+ int32_t buf_pitch;
+
+ /* Align pointer to 64 byte boundary */
+ tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
+
+ /* column transform */
+ round = 7;
+ buf_pitch = 32;
+ for (col_cnt = 0; col_cnt < 4; col_cnt++) {
+ /* process 8x32 blocks */
+ hevc_idct_8x32_column_lsx((coeffs + col_cnt * 8), buf_pitch, round);
+ }
+
+ /* row transform */
+ round = 12;
+ buf_pitch = 8;
+ for (row_cnt = 0; row_cnt < 4; row_cnt++) {
+ /* process 32x8 blocks */
+ src = (coeffs + 32 * 8 * row_cnt);
+
+ hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
+ hevc_idct_8x32_column_lsx(tmp_buf_ptr, buf_pitch, round);
+ hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
+ }
+}
diff --git a/libavcodec/loongarch/hevc_lpf_sao_lsx.c b/libavcodec/loongarch/hevc_lpf_sao_lsx.c
new file mode 100644
index 0000000000..fc10e8eda8
--- /dev/null
+++ b/libavcodec/loongarch/hevc_lpf_sao_lsx.c
@@ -0,0 +1,2485 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
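+/* Deblock a horizontal luma edge of 8 pixels. The dp/dq decision metrics
+ * are taken at columns 0, 3, 4 and 7; each 4-pixel half then receives
+ * strong, weak or no filtering, and pcm/no-filter lanes are restored via
+ * bitselect masks. */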
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ uint8_t *p3 = src - stride_4x;
+ uint8_t *p2 = src - stride_3x;
+ uint8_t *p1 = src - stride_2x;
+ uint8_t *p0 = src - stride;
+ uint8_t *q0 = src;
+ uint8_t *q1 = src + stride;
+ uint8_t *q2 = src + stride_2x;
+ uint8_t *q3 = src + stride_3x;
+ uint8_t flag0, flag1;
+ int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
+ int32_t dp04, dq04, dp34, dq34, d04, d34;
+ int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+ int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i temp0, temp1;
+ __m128i temp2, tc_pos, tc_neg;
+ __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
+ __m128i zero = {0};
+ __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+ dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
+ dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
+ dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
+ dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
+ d00 = dp00 + dq00;
+ d30 = dp30 + dq30;
+ dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
+ dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
+ dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
+ dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
+ d04 = dp04 + dq04;
+ d34 = dp34 + dq34;
+
+ p_is_pcm0 = p_is_pcm[0];
+ p_is_pcm4 = p_is_pcm[1];
+ q_is_pcm0 = q_is_pcm[0];
+ q_is_pcm4 = q_is_pcm[1];
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+ DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
+ cmp3 = __lsx_vpackev_w(cmp1, cmp0);
+ cmp3 = __lsx_vseqi_w(cmp3, 0);
+
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
+ p3_src, p2_src, p1_src, p0_src);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
+ tc4 = tc[1];
+ tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
+ p0_src, p3_src, p2_src, p1_src, p0_src);
+ DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
+ q0_src, q1_src, q2_src, q3_src);
+ flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
+ abs(p0[0] - q0[0]) < tc250;
+ flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
+ abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
+ zero, q3_src, q0_src, q1_src, q2_src, q3_src);
+
+ flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
+ abs(p0[4] - q0[4]) < tc254;
+ flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
+ abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+ DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
+ cmp2 = __lsx_vpackev_w(cmp1, cmp0);
+ cmp2 = __lsx_vseqi_w(cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter */
+ tc_pos = __lsx_vslli_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
+ temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
+ p1_src, p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp0 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
+ temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
+ q1_src, q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+
+ /* pack results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst2 = __lsx_vpickev_b(dst5, dst4);
+
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, q1_src);
+
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
+
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
+ __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
+ /* strong filter ends */
+ } else if (flag0 == flag1) { /* weak only */
+ /* weak filter */
+ tc_neg = __lsx_vneg_h(tc_pos);
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src,
+ __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
+ q_is_pcm_vec));
+ DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
+ q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp0 = __lsx_vseqi_d(cmp0, 0);
+ p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp0 = __lsx_vseqi_d(cmp0, 0);
+ q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
+ tc_neg, tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
+ p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
+ q1_src, abs_delta0, dst1, dst2, dst3, dst4);
+ /* pack results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
+ dst2, dst3);
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
+ dst0, dst1);
+
+ p2 += stride;
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ /* weak filter ends */
+ } else { /* strong + weak */
+ /* strong filter */
+ tc_pos = __lsx_vslli_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
+ p1_src, p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
+ q1_src, q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+
+ /* pack strong results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
+ dst2 = __lsx_vpickev_b(dst5, dst4);
+ /* strong filter ends */
+
+ /* weak filter */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
+ DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
+ cmp0, cmp1);
+ cmp0 = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
+
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
+ q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
+ q0_src, abs_delta0, delta1, delta2, temp0, temp2);
+ /* weak filter ends */
+
+ /* pack weak results to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, delta2);
+
+ /* select between weak or strong */
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
+
+ /* pack src to 8 bit */
+ DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
+ dst3, dst4);
+ dst5 = __lsx_vpickev_b(q2_src, q1_src);
+
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
+ dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
+
+ __lsx_vstelm_d(dst0, p2, 0, 0);
+ __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
+ __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
+ __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
+ __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
+ }
+ }
+}
+
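+/* Vertical luma edge variant: 8x8 pixels around the edge are loaded and
+ * transposed so the horizontal-case arithmetic applies, then the filtered
+ * columns are transposed back before storing. */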
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ uint8_t *p3 = src;
+ uint8_t *p2 = src + stride_3x;
+ uint8_t *p1 = src + stride_4x;
+ uint8_t *p0 = src + stride_4x + stride_3x;
+ uint8_t flag0, flag1;
+ int32_t dp00, dq00, dp30, dq30, d00, d30;
+ int32_t d0030, d0434;
+ int32_t dp04, dq04, dp34, dq34, d04, d34;
+ int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
+ int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
+
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i cmp3;
+ __m128i temp0, temp1;
+ __m128i temp2;
+ __m128i tc_pos, tc_neg;
+ __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
+ __m128i zero = {0};
+ __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
+
+ dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
+ dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
+ dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
+ dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
+ d00 = dp00 + dq00;
+ d30 = dp30 + dq30;
+ p_is_pcm0 = p_is_pcm[0];
+ q_is_pcm0 = q_is_pcm[0];
+
+ dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
+ dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
+ dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
+ dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
+ d04 = dp04 + dq04;
+ d34 = dp34 + dq34;
+ p_is_pcm4 = p_is_pcm[1];
+ q_is_pcm4 = q_is_pcm[1];
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ d0030 = (d00 + d30) >= beta;
+ d0434 = (d04 + d34) >= beta;
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
+ cmp3 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp3 = __lsx_vseqi_d(cmp3, 0);
+
+ if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
+ (!d0030 || !d0434)) {
+ src -= 4;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
+ src += stride_4x;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
+ src -= stride_4x;
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ tc0 = tc[0];
+ beta30 = beta >> 3;
+ beta20 = beta >> 2;
+ tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
+ tc4 = tc[1];
+ tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
+ q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
+ q0_src, q1_src, q2_src, q3_src);
+
+ flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
+ abs(p3[-1] - p3[0]) < tc250;
+ flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
+ abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
+ (d30 << 1) < beta20);
+ cmp0 = __lsx_vreplgr2vr_d(flag0);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
+ p0_src, p3_src, p2_src, p1_src, p0_src);
+
+ flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
+ abs(p1[-1] - p1[0]) < tc254;
+ flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
+ abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
+ (d34 << 1) < beta20);
+ DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
+ q3_src, q0_src, q1_src, q2_src, q3_src);
+
+ cmp1 = __lsx_vreplgr2vr_d(flag1);
+ cmp2 = __lsx_vpackev_d(cmp1, cmp0);
+ cmp2 = __lsx_vseqi_d(cmp2, 0);
+
+ if (flag0 && flag1) { /* strong only */
+ /* strong filter */
+ tc_neg = __lsx_vneg_h(tc_pos);
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
+ p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
+ q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+ /* strong filter ends */
+ } else if (flag0 == flag1) { /* weak only */
+ /* weak filter */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ tmp = ((beta + (beta >> 1)) >> 3);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
+ !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
+ (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
+ p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
+ q1_src, abs_delta0, dst0, dst1, dst2, dst3);
+ /* weak filter ends */
+
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
+ cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
+ dst0, dst1, dst2, dst3);
+ DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
+
+ /* transpose */
+ dst4 = __lsx_vilvl_b(dst1, dst0);
+ dst5 = __lsx_vilvh_b(dst1, dst0);
+ dst0 = __lsx_vilvl_h(dst5, dst4);
+ dst1 = __lsx_vilvh_h(dst5, dst4);
+
+ src += 2;
+ __lsx_vstelm_w(dst0, src, 0, 0);
+ __lsx_vstelm_w(dst0, src + stride, 0, 1);
+ __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
+ __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
+ src += stride_4x;
+ __lsx_vstelm_w(dst1, src, 0, 0);
+ __lsx_vstelm_w(dst1, src + stride, 0, 1);
+ __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
+ __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
+ return;
+ } else { /* strong + weak */
+ /* strong filter */
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ /* p part */
+ DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
+ temp0, temp0);
+
+ temp1 = __lsx_vadd_h(p3_src, p2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst0 = __lsx_vadd_h(temp2, p2_src);
+
+ temp1 = __lsx_vadd_h(temp0, p2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, p1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst1 = __lsx_vadd_h(temp2, p1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, p0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst2 = __lsx_vadd_h(temp2, p0_src);
+
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
+ p_is_pcm_vec, dst0, dst1);
+ dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
+
+ /* q part */
+ DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
+ temp1 = __lsx_vadd_h(q3_src, q2_src);
+ temp1 = __lsx_vslli_h(temp1, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q2_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst5 = __lsx_vadd_h(temp2, q2_src);
+
+ temp1 = __lsx_vadd_h(temp0, q2_src);
+ temp1 = __lsx_vsrari_h(temp1, 2);
+ temp2 = __lsx_vsub_h(temp1, q1_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst4 = __lsx_vadd_h(temp2, q1_src);
+
+ temp1 = __lsx_vslli_h(temp0, 1);
+ DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
+ temp1 = __lsx_vsrari_h(temp1, 3);
+ temp2 = __lsx_vsub_h(temp1, q0_src);
+ temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
+ dst3 = __lsx_vadd_h(temp2, q0_src);
+
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
+ q_is_pcm_vec, dst3, dst4);
+ dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
+ /* strong filter ends */
+
+ /* weak filter */
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
+ diff0, diff1);
+ DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
+ __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
+ delta0 = __lsx_vsub_h(diff0, diff1);
+ delta0 = __lsx_vsrari_h(delta0, 4);
+
+ temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
+ __lsx_vslli_h(tc_pos, 1));
+ abs_delta0 = __lsx_vadda_h(delta0, zero);
+ abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
+ temp2 = __lsx_vadd_h(delta0, p0_src);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
+ temp2 = __lsx_vsub_h(q0_src, delta0);
+ temp2 = __lsx_vclip255_h(temp2);
+ temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
+
+ tmp = (beta + (beta >> 1)) >> 3;
+ DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
+ !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
+ (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+ tc_pos = __lsx_vsrai_h(tc_pos, 1);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
+ delta1, delta2);
+ DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
+ delta1, delta2);
+ delta1 = __lsx_vadd_h(delta1, delta0);
+ delta2 = __lsx_vsub_h(delta2, delta0);
+ DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
+ DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
+ tc_pos, delta1, delta2);
+ DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
+ delta1, delta2);
+ DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
+ DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
+ q1_src, q_is_pcm_vec, delta1, delta2);
+
+ abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
+ DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
+ q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
+ q0_src, abs_delta0, delta1, delta2, temp0, temp2);
+ /* weak filter ends*/
+
+ /* select between weak or strong */
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
+ cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
+ dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
+ dst4, dst5);
+ }
+
+ cmp3 = __lsx_vnor_v(cmp3, cmp3);
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
+ p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
+ dst4, dst5);
+
+ /* pack results to 8 bit */
+ DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
+ dst5, dst0, dst1, dst2, dst3);
+
+ /* transpose */
+ DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
+ DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
+
+ src += 1;
+ __lsx_vstelm_w(dst0, src, 0, 0);
+ __lsx_vstelm_h(dst2, src, 4, 0);
+ src += stride;
+ __lsx_vstelm_w(dst0, src, 0, 1);
+ __lsx_vstelm_h(dst2, src, 4, 2);
+ src += stride;
+
+ __lsx_vstelm_w(dst0, src, 0, 2);
+ __lsx_vstelm_h(dst2, src, 4, 4);
+ src += stride;
+ __lsx_vstelm_w(dst0, src, 0, 3);
+ __lsx_vstelm_h(dst2, src, 4, 6);
+ src += stride;
+
+ __lsx_vstelm_w(dst1, src, 0, 0);
+ __lsx_vstelm_h(dst3, src, 4, 0);
+ src += stride;
+ __lsx_vstelm_w(dst1, src, 0, 1);
+ __lsx_vstelm_h(dst3, src, 4, 2);
+ src += stride;
+
+ __lsx_vstelm_w(dst1, src, 0, 2);
+ __lsx_vstelm_h(dst3, src, 4, 4);
+ src += stride;
+ __lsx_vstelm_w(dst1, src, 0, 3);
+ __lsx_vstelm_h(dst3, src, 4, 6);
+ }
+}
+
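+/* Horizontal chroma deblocking, 8-bit. The vector code below corresponds
+ * to the scalar filter (sketch only, using the libavutil clip helpers):
+ *
+ *     delta = av_clip(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc);
+ *     p0'   = av_clip_uint8(p0 + delta);
+ *     q0'   = av_clip_uint8(q0 - delta);
+ *
+ * and is skipped where tc <= 0 or where the p/q side is marked as
+ * PCM/bypass (the bitsel operations fall back to the original pixels). */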
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm)
+{
+ uint8_t *p1_ptr = src - (stride << 1);
+ uint8_t *p0_ptr = src - stride;
+ uint8_t *q0_ptr = src;
+ uint8_t *q1_ptr = src + stride;
+ __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i p1, p0, q0, q1;
+ __m128i tc_pos, tc_neg;
+ __m128i zero = {0};
+ __m128i temp0, temp1, delta;
+
+    if (tc[0] > 0 || tc[1] > 0) {
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ tc_neg = __lsx_vneg_h(tc_pos);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
+ p1, p0, q0, q1);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
+ p1, p0, q0, q1);
+ DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
+ temp0 = __lsx_vslli_h(temp0, 2);
+ temp0 = __lsx_vadd_h(temp0, temp1);
+ delta = __lsx_vsrari_h(temp0, 3);
+ delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
+ temp0 = __lsx_vadd_h(p0, delta);
+ temp0 = __lsx_vclip255_h(temp0);
+ p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
+ temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);
+
+ temp1 = __lsx_vsub_h(q0, delta);
+ temp1 = __lsx_vclip255_h(temp1);
+ q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
+ temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);
+
+ tc_pos = __lsx_vslei_d(tc_pos, 0);
+ DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
+ temp0, temp1);
+ temp0 = __lsx_vpickev_b(temp1, temp0);
+ __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
+ __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
+ }
+}
+
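+/* Vertical chroma deblocking, 8-bit: the transposed variant of the
+ * horizontal filter above. Eight rows of four pixels around the edge are
+ * loaded and transposed into p1/p0/q0/q1 vectors, the same delta is
+ * applied, and the two modified pixels of each row are stored back with
+ * __lsx_vstelm_h. */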
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm)
+{
+ ptrdiff_t stride_2x = (stride << 1);
+ ptrdiff_t stride_4x = (stride << 2);
+ ptrdiff_t stride_3x = stride_2x + stride;
+ __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i p1, p0, q0, q1;
+ __m128i tc_pos, tc_neg;
+ __m128i zero = {0};
+ __m128i temp0, temp1, delta;
+
+    if (tc[0] > 0 || tc[1] > 0) {
+ DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
+ tc_pos = __lsx_vpackev_d(cmp1, cmp0);
+ tc_neg = __lsx_vneg_h(tc_pos);
+
+ DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
+ p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
+ DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
+ q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
+ q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
+
+ src -= 2;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, src0, src1, src2, src3);
+ src += stride_4x;
+ DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
+ src + stride_3x, 0, src4, src5, src6, src7);
+ src -= stride_4x;
+ LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
+ p1, p0, q0, q1);
+ DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
+ p1, p0, q0, q1);
+
+ DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
+ temp0 = __lsx_vslli_h(temp0, 2);
+ temp0 = __lsx_vadd_h(temp0, temp1);
+ delta = __lsx_vsrari_h(temp0, 3);
+ delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
+
+ temp0 = __lsx_vadd_h(p0, delta);
+ temp1 = __lsx_vsub_h(q0, delta);
+ DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
+ DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
+ q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
+ DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
+ q_is_pcm_vec, temp0, temp1);
+
+ tc_pos = __lsx_vslei_d(tc_pos, 0);
+ DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
+ temp0, temp1);
+ temp0 = __lsx_vpackev_b(temp1, temp0);
+
+ src += 1;
+ __lsx_vstelm_h(temp0, src, 0, 0);
+ __lsx_vstelm_h(temp0, src + stride, 0, 1);
+ __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
+ __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
+ src += stride_4x;
+ __lsx_vstelm_h(temp0, src, 0, 4);
+ __lsx_vstelm_h(temp0, src + stride, 0, 5);
+ __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
+ __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
+ src -= stride_4x;
+ }
+}
+
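+/* SAO edge-offset filtering, 8-bit, horizontal (0 degree) direction for
+ * 4-pixel wide blocks. For each pixel c and its two neighbours a (left)
+ * and b (right), the vector code below implements the usual edge
+ * classification (scalar sketch, non-normative):
+ *
+ *     sign(x) = x > 0 ? 1 : (x < 0 ? -1 : 0)
+ *     static const uint8_t edge_idx[5] = { 1, 2, 0, 3, 4 };
+ *     idx = 2 + sign(c - a) + sign(c - b);               // 0 .. 4
+ *     dst = av_clip_uint8(c + sao_offset_val[edge_idx[idx]]);
+ *
+ * Per-byte signs are built from __lsx_vseq_b/__lsx_vsle_bu compares and
+ * __lsx_vbitsel_v, both table lookups use __lsx_vshuf_b, and the final
+ * clip relies on the xori(128)/__lsx_vsadd_b signed-saturation trick.
+ * The SAO edge filters below all reuse this scheme; only the neighbour
+ * offsets and the processed width differ. */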
+static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
+ __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
+ __m128i const1 = __lsx_vldi(1);
+ __m128i zero = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src -= 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
+ src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
+ src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
+
+ DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+ offset = __lsx_vaddi_bu(offset, 2);
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
+ sao_offset, sao_offset, offset, offset, offset);
+ src0 = __lsx_vxori_b(src0, 128);
+ dst0 = __lsx_vsadd_b(src0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
+ src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
+ src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
+
+ DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
+ cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
+ cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+ offset = __lsx_vaddi_bu(offset, 2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
+ offset, offset, offset);
+ src0 = __lsx_vxori_b(src0, 128);
+ dst0 = __lsx_vsadd_b(src0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
+static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
+ __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src -= 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
+ src_minus11, shuf1, src0, src1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
+ src_minus11, shuf2, src_plus10, src_plus11);
+ DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
+ src_plus10, src_minus10, src_plus10);
+ src0 = __lsx_vpickev_d(src1, src0);
+
+ DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+ offset = __lsx_vaddi_bu(offset, 2);
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ src0 = __lsx_vxori_b(src0, 128);
+ dst0 = __lsx_vsadd_b(src0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
+ shuf1, src0, src1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_plus10, src_plus11);
+ DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
+ src_plus10, src_minus10, src_plus10);
+ src0 = __lsx_vpickev_d(src1, src0);
+
+ DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
+ cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
+ cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ offset = __lsx_vadd_b(diff_minus10, diff_minus11);
+ offset = __lsx_vaddi_bu(offset, 2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ src0 = __lsx_vxori_b(src0, 128);
+ dst0 = __lsx_vsadd_b(src0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
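+/* Horizontal SAO edge offset for widths that are a multiple of 16: four
+ * rows are processed per iteration, 16 pixels at a time, and the current
+ * 16-byte column is carried over in src_minus10..src_minus13 so the left
+ * neighbours of the next column are available without reloading. */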
+static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *dst_ptr, *src_minus1;
+ int32_t v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i sao_offset;
+ __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+ __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+ __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+ __m128i diff_plus13;
+ __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
+ __m128i src_minus10, src_minus11, src_minus12, src_minus13;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+ __m128i src_zero0, src_zero1, src_zero2, src_zero3;
+ __m128i src_plus10, src_plus11, src_plus12, src_plus13;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (; height; height -= 4) {
+ src_minus1 = src - 1;
+ src_minus10 = __lsx_vld(src_minus1, 0);
+ DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
+ src_stride_2x, src_minus11, src_minus12);
+ src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src_minus1 += 16;
+ dst_ptr = dst + v_cnt;
+ src10 = __lsx_vld(src_minus1, 0);
+ DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
+ src_stride_2x, src11, src12);
+ src13 = __lsx_vldx(src_minus1, src_stride_3x);
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
+ src_minus11, shuf1, src12, src_minus12, shuf1, src13,
+ src_minus13, shuf1, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
+ src_minus11, shuf2, src12, src_minus12, shuf2, src13,
+ src_minus13, shuf2, src_plus10, src_plus11,
+ src_plus12, src_plus13);
+ DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+ cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+ cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+ cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+ cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0, offset_mask0,
+ offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
+ src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+ offset_mask1, src_zero2, offset_mask2, src_zero3,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+ src_minus12 = src12;
+ src_minus13 = src13;
+
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr + dst_stride, 0);
+ __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
+ __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
+ }
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+}
+
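+/* Vertical (90 degree) SAO edge offset for 4-pixel wide blocks: the
+ * neighbours are the pixels directly above and below. Each centre row is
+ * interleaved with its neighbour rows (__lsx_vilvl_b), the per-byte signs
+ * are then summed pairwise with __lsx_vhaddw_hu_bu, and the offset lookup
+ * is the same as in the 0 degree case. */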
+static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i dst0;
+ __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src_minus11, src10, src11;
+ __m128i src_zero0, src_zero1;
+ __m128i offset;
+ __m128i offset_mask0, offset_mask1;
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ /* load in advance */
+ DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
+ src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
+static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i src_zero0, src_zero1, dst0;
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src_minus11, src10, src11;
+ __m128i offset_mask0, offset_mask1;
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src += src_stride_2x;
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
+ src11, src_minus11, src10, src10, src_minus10, src_zero0,
+ src_minus11, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
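+/* Vertical SAO edge offset for widths that are a multiple of 16: each
+ * 16-pixel column is walked over the full height, four rows per step; the
+ * last processed row and the row below it are kept in
+ * src_minus10/src_minus11 so the next group of four rows starts with its
+ * upper neighbour already loaded. */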
+static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *
+ sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig = src;
+ uint8_t *dst_orig = dst;
+ int32_t h_cnt, v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+ __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+ __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+ __m128i diff_plus13;
+ __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
+ __m128i src12, dst2, src13, dst3;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src = src_orig + v_cnt;
+ dst = dst_orig + v_cnt;
+
+ DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
+ src_minus10, src_minus11);
+
+ for (h_cnt = (height >> 2); h_cnt--;) {
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
+ src, src_stride_3x, src, src_stride_4x,
+ src10, src11, src12, src13);
+ DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
+ src10, src10, src_minus11, src10, src11, cmp_minus10,
+ cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
+ src12, src13, cmp_minus12, cmp_plus12,
+ cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
+ src10, src10, src_minus11, src10, src11, cmp_minus10,
+ cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
+ src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+                      sao_offset, sao_offset, offset_mask0,
+ offset_mask0, offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ src_minus10 = src12;
+ DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
+ src12, 128, src_minus11, src10, src11, src12);
+ DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
+ offset_mask1, src11, offset_mask2, src12,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+ src_minus11 = src13;
+
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ __lsx_vstx(dst2, dst, dst_stride_2x);
+ __lsx_vstx(dst3, dst, dst_stride_3x);
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+ }
+}
+
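+/* Diagonal SAO edge offset ("45 degree" variant) for 4-pixel wide blocks:
+ * the neighbours are taken at (-1, -1) and (+1, +1), i.e. up-left from
+ * the row above and down-right from the row below (src is pre-decremented
+ * and shuf1/shuf2 select the shifted pixels). The classification itself
+ * is identical to the other directions. */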
+static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus11, src10, src11;
+ __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+ src_plus0, src_plus1);
+
+ DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
+ src_minus11, src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
+ src_zero1, src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+ src_plus0, src_plus1);
+
+ DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+}
+
+static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src10, src_minus11, src11;
+ __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
+ src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+ src_plus10, src_plus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
+ src_minus11, src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+                  src10, src11);
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
+ src_plus10, src_plus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
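+/* Diagonal ("45 degree") SAO edge offset for widths that are a multiple
+ * of 16, four rows per iteration. The bottom row also needs its
+ * down-right neighbour from the row below the current group, which is
+ * why src_plus13 is loaded separately at a +1 byte offset. */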
+static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *
+ sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig = src;
+ uint8_t *dst_orig = dst;
+ int32_t v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
+ __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
+ __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
+ __m128i diff_plus13, src_minus14, src_plus13;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
+ __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
+ __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
+ __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
+ __m128i src_zero3, sao_offset, src_plus12;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (; height; height -= 4) {
+ src_orig = src - 1;
+ dst_orig = dst;
+ src_minus11 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src_minus12, src_minus13);
+ src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src_minus10 = __lsx_vld(src_orig - src_stride, 0);
+ src_orig += 16;
+ src10 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
+ src_stride_2x, src11, src12);
+ src13 = __lsx_vldx(src_orig, src_stride_3x);
+ src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);
+
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
+ src_minus12, shuf1, src12, src_minus13, shuf1,
+ src13, src_minus14, shuf1, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
+ src_minus13, shuf2, src_plus10, src_plus11);
+ src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);
+
+ DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10,
+ cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12,
+ cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0, offset_mask0,
+ offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
+ 128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
+ src_zero3);
+ DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+ offset_mask1, src_zero2, offset_mask2, src_zero3,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+
+ src_minus11 = src10;
+ src_minus12 = src11;
+ src_minus13 = src12;
+ src_minus14 = src13;
+
+ __lsx_vst(dst0, dst_orig, 0);
+ __lsx_vstx(dst1, dst_orig, dst_stride);
+ __lsx_vstx(dst2, dst_orig, dst_stride_2x);
+ __lsx_vstx(dst3, dst_orig, dst_stride_3x);
+ dst_orig += 16;
+ }
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+}
+
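+/* Diagonal SAO edge offset ("135 degree" variant) for 4-pixel wide
+ * blocks: the neighbours are taken at (-1, +1) and (+1, -1), i.e.
+ * up-right from the row above and down-left from the row below;
+ * otherwise the classification matches the other directions. */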
+static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i src_zero0, src_zero1, dst0;
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src10, src_minus11, src11;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
+ dst += dst_stride_2x;
+}
+
+static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t height)
+{
+ uint8_t *src_orig;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
+ __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
+ __m128i src_minus10, src10, src_minus11, src11;
+ __m128i src_zero0, src_zero1, dst0;
+ __m128i offset_mask0, offset_mask1;
+ __m128i zeros = {0};
+
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+ src_orig = src - 1;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ for (height -= 2; height; height -= 2) {
+ src_orig += src_stride_2x;
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
+ shuf1, src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
+ src_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
+ cmp_minus11, cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ src_minus10 = src10;
+ src_minus11 = src11;
+
+ /* load in advance */
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src10, src11);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+ dst += dst_stride_2x;
+ }
+
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
+ src_zero0, src_zero1);
+ DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
+ shuf2, src_minus10, src_minus11);
+
+ DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
+ src_minus10, src_minus11);
+ DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
+ src_zero0, src_zero1);
+ DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ diff_minus10, diff_minus11);
+ DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
+ cmp_minus10, cmp_minus11);
+ DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
+ const1, cmp_minus11, diff_minus10, diff_minus11);
+
+ DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
+ diff_minus11, offset_mask0, offset_mask1);
+ DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
+ offset_mask1);
+ DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
+ src_zero0, offset, dst0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
+ sao_offset, offset, offset, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+ dst0 = __lsx_vsadd_b(dst0, offset);
+ dst0 = __lsx_vxori_b(dst0, 128);
+
+ __lsx_vstelm_d(dst0, dst, 0, 0);
+ __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
+}
+
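+/* Diagonal ("135 degree") SAO edge offset for widths that are a multiple
+ * of 16, four rows per iteration: the up-right neighbour row is reloaded
+ * per column at a +2 byte offset and the down-left neighbour of the
+ * bottom row comes from the row below the group (src_plus13). */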
+static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
+ int32_t dst_stride,
+ uint8_t *src,
+ int32_t src_stride,
+ int16_t *sao_offset_val,
+ int32_t width,
+ int32_t height)
+{
+ uint8_t *src_orig, *dst_orig;
+ int32_t v_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
+ __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
+ __m128i edge_idx = {0x403000201, 0x0};
+ __m128i const1 = __lsx_vldi(1);
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
+ __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
+ __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
+ __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
+ __m128i src_plus10, src_plus11, src_plus12, src_plus13;
+ __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
+ __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
+
+ sao_offset = __lsx_vld(sao_offset_val, 0);
+ sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
+
+ for (; height; height -= 4) {
+ src_orig = src - 1;
+ dst_orig = dst;
+
+ src_minus11 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src_plus10, src_plus11);
+ src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
+
+ for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
+ src_minus10 = __lsx_vld(src_orig - src_stride, 2);
+ src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
+ src_orig += 16;
+ src10 = __lsx_vld(src_orig, 0);
+ DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
+ src11, src12);
+ src13 = __lsx_vldx(src_orig, src_stride_3x);
+
+ DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
+ src_plus10, shuf1, src12, src_plus11, shuf1, src13,
+ src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
+ src_zero3);
+ src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
+ DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
+ src_plus11, shuf2, src_minus12, src_minus13);
+
+ DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1,
+ src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3,
+ src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
+ src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
+ cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
+ DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
+ src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
+ cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
+ cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
+ cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
+ cmp_plus11);
+ DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
+ cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
+ cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
+ cmp_plus13);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
+ diff_plus10, const1, cmp_plus10, diff_minus11, const1,
+ cmp_minus11, diff_plus11, const1, cmp_plus11,
+ diff_minus10, diff_plus10, diff_minus11, diff_plus11);
+ DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
+ diff_plus12, const1, cmp_plus12, diff_minus13, const1,
+ cmp_minus13, diff_plus13, const1, cmp_plus13,
+ diff_minus12, diff_plus12, diff_minus13, diff_plus13);
+
+ DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
+ diff_plus11, diff_minus12, diff_plus12, diff_minus13,
+ diff_plus13, offset_mask0, offset_mask1, offset_mask2,
+ offset_mask3);
+ DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
+ offset_mask2, 2, offset_mask3, 2, offset_mask0,
+ offset_mask1, offset_mask2, offset_mask3);
+
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
+ sao_offset, sao_offset, offset_mask0, offset_mask0,
+ offset_mask0);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
+ sao_offset, sao_offset, offset_mask1, offset_mask1,
+ offset_mask1);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
+ sao_offset, sao_offset, offset_mask2, offset_mask2,
+ offset_mask2);
+ DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
+ sao_offset, sao_offset, offset_mask3, offset_mask3,
+ offset_mask3);
+
+ DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
+ src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
+ src_zero2, src_zero3);
+ DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
+ offset_mask1, src_zero2, offset_mask2, src_zero3,
+ offset_mask3, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
+ 128, dst0, dst1, dst2, dst3);
+
+ src_minus11 = src10;
+ src_plus10 = src11;
+ src_plus11 = src12;
+ src_plus12 = src13;
+
+ __lsx_vst(dst0, dst_orig, 0);
+ __lsx_vstx(dst1, dst_orig, dst_stride);
+ __lsx_vstx(dst2, dst_orig, dst_stride_2x);
+ __lsx_vstx(dst3, dst_orig, dst_stride_3x);
+ dst_orig += 16;
+ }
+
+ src += src_stride_4x;
+ dst += dst_stride_4x;
+ }
+}
+
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride_dst,
+ int16_t *sao_offset_val,
+ int eo, int width, int height)
+{
+ ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
+
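+ /* eo selects the SAO edge-offset class: 0 = horizontal (0 degree),
+ * 1 = vertical (90 degree), 2 = 45 degree, 3 = 135 degree. Each class
+ * is handled in 16-wide columns first, then an 8-wide column, then a
+ * 4-wide tail, using the helpers defined above. */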
+ switch (eo) {
+ case 0:
+ if (width >> 4) {
+ hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val,
+ width - (width & 0x0F),
+ height);
+ dst += width & 0xFFFFFFF0;
+ src += width & 0xFFFFFFF0;
+ width &= 0x0F;
+ }
+
+ if (width >> 3) {
+ hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ dst += 8;
+ src += 8;
+ width &= 0x07;
+ }
+
+ if (width) {
+ hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ }
+ break;
+
+ case 1:
+ if (width >> 4) {
+ hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val,
+ width - (width & 0x0F),
+ height);
+ dst += width & 0xFFFFFFF0;
+ src += width & 0xFFFFFFF0;
+ width &= 0x0F;
+ }
+
+ if (width >> 3) {
+ hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ dst += 8;
+ src += 8;
+ width &= 0x07;
+ }
+
+ if (width) {
+ hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ }
+ break;
+
+ case 2:
+ if (width >> 4) {
+ hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val,
+ width - (width & 0x0F),
+ height);
+ dst += width & 0xFFFFFFF0;
+ src += width & 0xFFFFFFF0;
+ width &= 0x0F;
+ }
+
+ if (width >> 3) {
+ hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ dst += 8;
+ src += 8;
+ width &= 0x07;
+ }
+
+ if (width) {
+ hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ }
+ break;
+
+ case 3:
+ if (width >> 4) {
+ hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val,
+ width - (width & 0x0F),
+ height);
+ dst += width & 0xFFFFFFF0;
+ src += width & 0xFFFFFFF0;
+ width &= 0x0F;
+ }
+
+ if (width >> 3) {
+ hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ dst += 8;
+ src += 8;
+ width &= 0x07;
+ }
+
+ if (width) {
+ hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
+ src, stride_src,
+ sao_offset_val, height);
+ }
+ break;
+ }
+}
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index fc0e8fb0df..f39674be64 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -85,6 +85,25 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_8_lsx;
c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
+
+ c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
+ c->hevc_v_loop_filter_luma = ff_hevc_loop_filter_luma_v_8_lsx;
+
+ c->hevc_h_loop_filter_luma_c = ff_hevc_loop_filter_luma_h_8_lsx;
+ c->hevc_v_loop_filter_luma_c = ff_hevc_loop_filter_luma_v_8_lsx;
+
+ c->hevc_h_loop_filter_chroma = ff_hevc_loop_filter_chroma_h_8_lsx;
+ c->hevc_v_loop_filter_chroma = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+ c->hevc_h_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_h_8_lsx;
+ c->hevc_v_loop_filter_chroma_c = ff_hevc_loop_filter_chroma_v_8_lsx;
+
+ c->idct[0] = ff_hevc_idct_4x4_lsx;
+ c->idct[1] = ff_hevc_idct_8x8_lsx;
+ c->idct[2] = ff_hevc_idct_16x16_lsx;
+ c->idct[3] = ff_hevc_idct_32x32_lsx;
}
}
}
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 3259e03f13..0e73fd1f8e 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -85,4 +85,30 @@ MC(epel, hv, 32);
#undef MC
+void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t beta, int32_t *tc,
+ uint8_t *p_is_pcm, uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm);
+
+void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int32_t *tc, uint8_t *p_is_pcm,
+ uint8_t *q_is_pcm);
+
+void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride_dst,
+ int16_t *sao_offset_val,
+ int eo, int width, int height);
+
+void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
+
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 15+ messages in thread
* [FFmpeg-devel] [PATCH v2 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
2022-02-17 11:11 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
` (2 preceding siblings ...)
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf " Hao Chen
@ 2022-02-17 11:11 ` Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w " Hao Chen
2022-02-17 12:08 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
5 siblings, 0 replies; 15+ messages in thread
From: Hao Chen @ 2022-02-17 11:11 UTC (permalink / raw)
To: ffmpeg-devel
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 124fps
after : 182fps
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
libavcodec/loongarch/Makefile | 3 +-
libavcodec/loongarch/hevc_mc_bi_lsx.c | 2289 +++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 51 +
libavcodec/loongarch/hevcdsp_lsx.h | 57 +
4 files changed, 2399 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/loongarch/hevc_mc_bi_lsx.c
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index cfc8e3aaff..620fba7192 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -27,4 +27,5 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
loongarch/vp9_idct_lsx.o
LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_idct_lsx.o \
- loongarch/hevc_lpf_sao_lsx.o
+ loongarch/hevc_lpf_sao_lsx.o \
+ loongarch/hevc_mc_bi_lsx.o
diff --git a/libavcodec/loongarch/hevc_mc_bi_lsx.c b/libavcodec/loongarch/hevc_mc_bi_lsx.c
new file mode 100644
index 0000000000..9092fdccb2
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_bi_lsx.c
@@ -0,0 +1,2289 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+static av_always_inline __m128i
+hevc_bi_rnd_clip(__m128i in0, __m128i vec0, __m128i in1, __m128i vec1)
+{
+ __m128i out;
+
+ vec0 = __lsx_vsadd_h(in0, vec0);
+ vec1 = __lsx_vsadd_h(in1, vec1);
+ out = __lsx_vssrarni_bu_h(vec1, vec0, 7);
+ return out;
+}
+
+/* hevc_bi_copy: dst = av_clip_uint8(((src0 << 6) + src1 + 64) >> 7) */
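+/*
+ * For reference, a scalar sketch of what the bi-copy kernels below compute
+ * (illustrative only, not part of the compiled code; "width" stands for the
+ * fixed width of each variant and 8-bit depth is assumed):
+ *
+ *     for (y = 0; y < height; y++) {
+ *         for (x = 0; x < width; x++)
+ *             dst[x] = av_clip_uint8(((src0_ptr[x] << 6) + src1_ptr[x] + 64) >> 7);
+ *         src0_ptr += src_stride;
+ *         src1_ptr += src2_stride;
+ *         dst += dst_stride;
+ *     }
+ */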
+static
+void hevc_bi_copy_4w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ int32_t loop_cnt = height >> 3;
+ int32_t res = (height & 0x07) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_2x = (src2_stride << 1);
+ int32_t src2_stride_4x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride;
+ __m128i src0, src1;
+ __m128i zero = __lsx_vldi(0);
+ __m128i in0, in1, in2, in3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (; loop_cnt--;) {
+ reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
+ src0_ptr += src_stride_4x;
+ DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ src0 = __lsx_vilvl_d(tmp1, tmp0);
+ reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ src1 = __lsx_vilvl_d(tmp1, tmp0);
+ src0_ptr += src_stride_4x;
+
+ tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
+ tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+ tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+ src1_ptr += src2_stride_4x;
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in0, in1);
+ tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
+ tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+ tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+ src1_ptr += src2_stride_4x;
+ DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in2, in3);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst0, dst2);
+ DUP2_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, dst1, dst3);
+ DUP2_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst1, dst3);
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(dst0, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(dst0, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ __lsx_vstelm_w(dst1, dst, 0, 0);
+ __lsx_vstelm_w(dst1, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(dst1, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(dst1, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ }
+ for (; res--;) {
+ reg0 = __lsx_vldrepl_w(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src1_ptr, 0);
+ reg3 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ src0 = __lsx_vilvl_w(reg1, reg0);
+ in0 = __lsx_vilvl_d(reg3, reg2);
+ dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+ dst0 = __lsx_vsadd_h(dst0, in0);
+ dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7);
+ __lsx_vstelm_w(dst0, dst, 0, 0);
+ __lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
+ src0_ptr += src_stride_2x;
+ src1_ptr += src2_stride_2x;
+ dst += dst_stride_2x;
+ }
+}
+
+static
+void hevc_bi_copy_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t res = (height & 0x07) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ __m128i out0, out1, out2, out3;
+ __m128i zero = __lsx_vldi(0);
+ __m128i src0, src1, src2, src3;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i reg0, reg1, reg2, reg3;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
+ src0_ptr += src_stride_4x;
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in5, in6);
+ in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ dst0, dst2, dst4, dst6);
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+ dst1, dst3, dst5, dst7);
+ DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, dst3,
+ dst5, dst7);
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+ out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ __lsx_vstelm_w(out0, dst + dst_stride, 0, 2);
+ __lsx_vstelm_h(out0, dst, 4, 2);
+ __lsx_vstelm_h(out0, dst + dst_stride, 4, 6);
+ __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 2);
+ __lsx_vstelm_h(out1, dst + dst_stride_2x, 4, 2);
+ __lsx_vstelm_h(out1, dst + dst_stride_3x, 4, 6);
+ dst += dst_stride_4x;
+ __lsx_vstelm_w(out2, dst, 0, 0);
+ __lsx_vstelm_w(out2, dst + dst_stride, 0, 2);
+ __lsx_vstelm_h(out2, dst, 4, 2);
+ __lsx_vstelm_h(out2, dst + dst_stride, 4, 6);
+ __lsx_vstelm_w(out3, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_w(out3, dst + dst_stride_3x, 0, 2);
+ __lsx_vstelm_h(out3, dst + dst_stride_2x, 4, 2);
+ __lsx_vstelm_h(out3, dst + dst_stride_3x, 4, 6);
+ dst += dst_stride_4x;
+ }
+ for (; res--;) {
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ src0 = __lsx_vilvl_d(reg1, reg0);
+ src0_ptr += src_stride_2x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ in1 = __lsx_vldx(src1_ptr, src2_stride_x);
+ src1_ptr += src2_stride_x;
+ dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+ dst1 = __lsx_vilvh_b(zero, src0);
+ dst1 = __lsx_vslli_h(dst1, 6);
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ __lsx_vstelm_h(out0, dst, 4, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(out0, dst, 0, 2);
+ __lsx_vstelm_h(out0, dst, 4, 6);
+ dst += dst_stride;
+ }
+}
+
+static
+void hevc_bi_copy_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ int32_t loop_cnt = height >> 3;
+ int32_t res = (height & 7) >> 1;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ __m128i out0, out1, out2, out3;
+ __m128i src0, src1, src2, src3;
+ __m128i zero = __lsx_vldi(0);
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i reg0, reg1, reg2, reg3;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src0, src1);
+ src0_ptr += src_stride_4x;
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
+ reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
+ DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
+ src0_ptr += src_stride_4x;
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ dst0, dst2, dst4, dst6);
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero,
+ src3, dst1, dst3, dst5, dst7);
+ DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1,
+ dst3, dst5, dst7);
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in5, in6);
+ in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+ out3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(out2, dst, 0, 0);
+ __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out3, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out3, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ }
+ for (; res--;) {
+ reg0 = __lsx_vldrepl_d(src0_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
+ src0 = __lsx_vilvl_d(reg1, reg0);
+ in0 = __lsx_vld(src1_ptr, 0);
+ in1 = __lsx_vldx(src1_ptr, src2_stride_x);
+ dst0 = __lsx_vsllwil_hu_bu(src0, 6);
+ dst1 = __lsx_vilvh_b(zero, src0);
+ dst1 = __lsx_vslli_h(dst1, 6);
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ src0_ptr += src_stride_2x;
+ src1_ptr += src2_stride_x;
+ dst += dst_stride_2x;
+ }
+}
+
+static
+void hevc_bi_copy_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ int16_t *_src1 = src1_ptr + 8;
+ __m128i out0, out1, out2;
+ __m128i src0, src1, src2, src3;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(_src1, 0);
+ DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+ in5, in6);
+ in7 = __lsx_vldx(_src1, src2_stride_3x);
+ _src1 += src2_stride_2x;
+
+ DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ dst0, dst1, dst2, dst3);
+ DUP2_ARG2(__lsx_vilvh_w, src1, src0, src3, src2, src0, src1);
+ DUP2_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, dst4, dst5);
+ out0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ out1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ out2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ __lsx_vstelm_w(out2, dst, 8, 0);
+ __lsx_vstelm_w(out2, dst + dst_stride, 8, 1);
+ __lsx_vstelm_w(out2, dst + dst_stride_2x, 8, 2);
+ __lsx_vstelm_w(out2, dst + dst_stride_3x, 8, 3);
+ dst += dst_stride_4x;
+ }
+}
+
+static
+void hevc_bi_copy_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ int16_t *_src1 = src1_ptr + 8;
+ __m128i out0, out1, out2, out3;
+ __m128i src0, src1, src2, src3;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
+ __m128i zero = {0};
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(_src1, 0);
+ DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+ in5, in6);
+ in7 = __lsx_vldx(_src1, src2_stride_3x);
+ _src1 += src2_stride_2x;
+ DUP4_ARG2(__lsx_vsllwil_hu_bu, src0, 6, src1, 6, src2, 6, src3, 6,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG2(__lsx_vilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
+ dst0_l, dst1_l, dst2_l, dst3_l);
+ DUP4_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
+ dst0_l, dst1_l, dst2_l, dst3_l);
+
+ out0 = hevc_bi_rnd_clip(in0, dst0_r, in4, dst0_l);
+ out1 = hevc_bi_rnd_clip(in1, dst1_r, in5, dst1_l);
+ out2 = hevc_bi_rnd_clip(in2, dst2_r, in6, dst2_l);
+ out3 = hevc_bi_rnd_clip(in3, dst3_r, in7, dst3_l);
+ __lsx_vst(out0, dst, 0);
+ __lsx_vstx(out1, dst, dst_stride);
+ __lsx_vstx(out2, dst, dst_stride_2x);
+ __lsx_vstx(out3, dst, dst_stride_3x);
+ dst += dst_stride_4x;
+ }
+}
+
+static
+void hevc_bi_copy_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, height);
+ hevc_bi_copy_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, height);
+ hevc_bi_copy_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ hevc_bi_copy_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, height);
+ hevc_bi_copy_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, height);
+}
+
+static
+void hevc_bi_copy_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height)
+{
+ hevc_bi_copy_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, height);
+ hevc_bi_copy_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
+ dst + 32, dst_stride, height);
+}
+
+static void hevc_hz_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ __m128i src0, src1, src2, src3;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i in0, in1, in2, in3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src0_ptr -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src0, src1);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 8, src2, src3);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
+ src1_ptr += src2_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
+ dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
+ dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
+
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ dst += dst_stride_2x;
+ }
+}
+
+static void hevc_hz_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, tmp0, tmp1;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2;
+ __m128i in0, in1, in2;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src0_ptr -= 3;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
+ mask2, mask3, mask4);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
+ mask7 = __lsx_vaddi_bu(mask0, 14);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+ in2 = __lsx_vld(src1_ptr, 32);
+ src1_ptr += src2_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask4, src1,
+ src1, mask0, src0, src0, mask1, vec0, vec1, vec2, vec3);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec2, filt0);
+ dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt1);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask5, src1, src1, mask1, src0,
+ src0, mask2, src1, src0, mask6, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec0, filt1, dst2, vec1, filt1,
+ dst0, vec2, filt2, dst1, vec3, filt2, dst1, dst2, dst0, dst1);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask2, src0, src0, mask3, src1, src0,
+ mask7, src1, src1, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec0, filt2, dst0, vec1, filt3,
+ dst1, vec2, filt3, dst2, vec3, filt3, dst2, dst0, dst1, dst2);
+
+ tmp0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst2 = __lsx_vsadd_h(dst2, in2);
+ tmp1 = __lsx_vssrarni_bu_h(dst2, dst2, 7);
+
+ __lsx_vst(tmp0, dst, 0);
+ __lsx_vstelm_d(tmp1, dst, 16, 0);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_hz_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height);
+ hevc_hz_8t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hz_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_hz_8t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height);
+ hevc_hz_8t_32w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hz_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_hz_8t_32w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height);
+ hevc_hz_8t_32w_lsx(src0_ptr + 32, src_stride, src1_ptr + 32, src2_stride,
+ dst + 32, dst_stride, filter, height);
+}
+
+static av_always_inline
+void hevc_vt_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+ int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10;
+ __m128i in0, in1, in2, in3;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src65_r, src87_r, src109_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src0_ptr -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src5, src6);
+ src0_ptr += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src8, src9);
+ src10 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
+ in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+ src76_r, src87_r, src98_r, src109_r);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+ filt0, src43_r, filt0, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
+ filt1, dst2_r, src54_r, filt1, dst3_r, src65_r, filt1,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, src65_r,
+ filt2, dst2_r, src76_r, filt2, dst3_r, src87_r, filt2,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, src87_r,
+ filt3, dst2_r, src98_r, filt3, dst3_r, src109_r, filt3,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
+ dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
+ __lsx_vstelm_d(dst0_r, dst, 0, 0);
+ __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+
+ src6 = src10;
+ }
+}
+
+static av_always_inline
+void hevc_vt_8t_16multx2mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height,
+ int32_t width)
+{
+ uint8_t *src0_ptr_tmp;
+ int16_t *src1_ptr_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt;
+ uint32_t cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i in0, in1, in2, in3;
+ __m128i src10_r, src32_r, src54_r, src76_r;
+ __m128i src21_r, src43_r, src65_r, src87_r;
+ __m128i dst0_r, dst1_r;
+ __m128i src10_l, src32_l, src54_l, src76_l;
+ __m128i src21_l, src43_l, src65_l, src87_l;
+ __m128i dst0_l, dst1_l;
+ __m128i filt0, filt1, filt2, filt3;
+
+ src0_ptr -= src_stride_3x;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src0_ptr_tmp = src0_ptr;
+ src1_ptr_tmp = src1_ptr;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+ src0_ptr_tmp += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src5, src6);
+ src0_ptr_tmp += src_stride_3x;
+
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_l, src32_l, src54_l, src21_l);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src7 = __lsx_vld(src0_ptr_tmp, 0);
+ src8 = __lsx_vldx(src0_ptr_tmp, src_stride);
+ src0_ptr_tmp += src_stride_2x;
+ DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in0, in2);
+ src1_ptr_tmp += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in1, in3);
+ src1_ptr_tmp += src2_stride;
+
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
+ filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
+ src43_r, filt1, dst0_l, src32_l, filt1, dst1_l, src43_l,
+ filt1, dst0_r, dst1_r, dst0_l, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r,
+ src65_r, filt2, dst0_l, src54_l, filt2, dst1_l, src65_l,
+ filt2, dst0_r, dst1_r, dst0_l, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r,
+ src87_r, filt3, dst0_l, src76_l, filt3, dst1_l, src87_l,
+ filt3, dst0_r, dst1_r, dst0_l, dst1_l);
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+
+ __lsx_vst(dst0_r, dst_tmp, 0);
+ __lsx_vstx(dst1_r, dst_tmp, dst_stride);
+ dst_tmp += dst_stride_2x;
+
+ src10_r = src32_r;
+ src32_r = src54_r;
+ src54_r = src76_r;
+ src21_r = src43_r;
+ src43_r = src65_r;
+ src65_r = src87_r;
+ src10_l = src32_l;
+ src32_l = src54_l;
+ src54_l = src76_l;
+ src21_l = src43_l;
+ src43_l = src65_l;
+ src65_l = src87_l;
+ src6 = src8;
+ }
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ dst += 16;
+ }
+}
+
+static void hevc_vt_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 16);
+}
+
+static void hevc_vt_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 16);
+ hevc_vt_8t_8w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_vt_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 32);
+}
+
+static void hevc_vt_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 48);
+}
+
+static void hevc_vt_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_8t_16multx2mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height, 64);
+}
+
+static av_always_inline
+void hevc_hv_8t_8multx1mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height, int32_t width)
+{
+ uint32_t loop_cnt;
+ uint32_t cnt;
+ uint8_t *src0_ptr_tmp;
+ int16_t *src1_ptr_tmp;
+ uint8_t *dst_tmp;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i out;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, tmp;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ __m128i dst0_r, dst0_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+
+ src0_ptr -= src_stride_3x + 3;
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, filter_x,
+ 6, filt0, filt1, filt2, filt3);
+ filt_h3 = __lsx_vld(filter_y, 0);
+ filt_h3 = __lsx_vsllwil_h_b(filt_h3, 0);
+
+ DUP4_ARG2(__lsx_vreplvei_w, filt_h3, 0, filt_h3, 1, filt_h3, 2, filt_h3, 3,
+ filt_h0, filt_h1, filt_h2, filt_h3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (cnt = width >> 3; cnt--;) {
+ src0_ptr_tmp = src0_ptr;
+ dst_tmp = dst;
+ src1_ptr_tmp = src1_ptr;
+
+ src0 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+ src0_ptr_tmp += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src5, src6);
+ src0_ptr_tmp += src_stride_3x;
+
+ /* row 0 row 1 row 2 row 3 */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+ vec12, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+ dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+ dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+ dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+ src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+ src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+ dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+ dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src7 = __lsx_vld(src0_ptr_tmp, 0);
+ src0_ptr_tmp += src_stride;
+
+ in0 = __lsx_vld(src1_ptr_tmp, 0);
+ src1_ptr_tmp += src2_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+ filt2, dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+ dst6, dst10_r, dst32_r, dst54_r, dst76_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
+ dst6, dst10_l, dst32_l, dst54_l, dst76_l);
+
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+ dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+ dst76_l, filt_h3, dst0_r, dst0_l);
+ dst0_r = __lsx_vsrli_w(dst0_r, 6);
+ dst0_l = __lsx_vsrli_w(dst0_l, 6);
+
+ tmp = __lsx_vpickev_h(dst0_l, dst0_r);
+ tmp = __lsx_vsadd_h(tmp, in0);
+ tmp = __lsx_vmaxi_h(tmp, 0);
+ out = __lsx_vssrlrni_bu_h(tmp, tmp, 7);
+ __lsx_vstelm_d(out, dst_tmp, 0, 0);
+ dst_tmp += dst_stride;
+
+ dst0 = dst1;
+ dst1 = dst2;
+ dst2 = dst3;
+ dst3 = dst4;
+ dst4 = dst5;
+ dst5 = dst6;
+ dst6 = dst7;
+ }
+
+ src0_ptr += 8;
+ dst += 8;
+ src1_ptr += 8;
+ }
+}
+
+static void hevc_hv_8t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_8t_8multx1mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 64);
+}
+
+static void hevc_hz_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int16_t *src1_ptr_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt;
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_x = src2_stride << 1;
+ int32_t src2_stride_2x = src2_stride << 2;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i filt0, filt1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, mask2, mask3;
+ __m128i vec0, vec1, vec2, vec3;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ src0_ptr -= 1;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 10);
+
+ dst_tmp = dst + 16;
+ src1_ptr_tmp = src1_ptr + 16;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src3);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src5);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src6, src7);
+ src0_ptr += src_stride;
+
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in4, in5);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in6, in7);
+ src1_ptr += src2_stride;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src2,
+ src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src2,
+ src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src4, mask2, src6,
+ src6, mask0, src7, src6, mask2, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst4, dst5, dst6, dst7);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src4, mask3, src6,
+ src6, mask1, src7, src6, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec0, filt1, dst5, vec1, filt1,
+ dst6, vec2, filt1, dst7, vec3, filt1, dst4, dst5, dst6, dst7);
+
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ dst2 = hevc_bi_rnd_clip(in4, dst4, in5, dst5);
+ dst3 = hevc_bi_rnd_clip(in6, dst6, in7, dst7);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vstx(dst1, dst, dst_stride);
+ __lsx_vstx(dst2, dst, dst_stride_2x);
+ __lsx_vstx(dst3, dst, dst_stride_3x);
+ dst += dst_stride_4x;
+
+ in0 = __lsx_vld(src1_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
+ src1_ptr_tmp += src2_stride_2x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src3, src3, mask0, src5,
+ src5, mask0, src7, src7, mask0, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask1, src3, src3, mask1, src5,
+ src5, mask1, src7, src7, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ __lsx_vstelm_d(dst0, dst_tmp, 0, 0);
+ __lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst1, dst_tmp + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1, dst_tmp + dst_stride_3x, 0, 1);
+ dst_tmp += dst_stride_4x;
+ }
+}
+
+static void hevc_hz_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ __m128i src0, src1, src2;
+ __m128i in0, in1, in2, in3;
+ __m128i filt0, filt1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, mask2, mask3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i vec0, vec1, vec2, vec3;
+
+ src0_ptr -= 1;
+
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 10);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src1);
+ src2 = __lsx_vld(src0_ptr, 24);
+ src0_ptr += src_stride;
+ DUP4_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, src1_ptr, 32,
+ src1_ptr, 48, in0, in1, in2, in3);
+ src1_ptr += src2_stride;
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src0, mask2, src1,
+ src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src0, mask3, src1,
+ src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
+ dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
+ dst0 = hevc_bi_rnd_clip(in0, dst0, in1, dst1);
+ dst1 = hevc_bi_rnd_clip(in2, dst2, in3, dst3);
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void hevc_vt_4t_12w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ int16_t *_src1 = src1_ptr + 8;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
+ __m128i src2110, src4332, src6554;
+ __m128i dst0_l, dst1_l, filt0, filt1;
+
+ src0_ptr -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src0_ptr += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+ src2110 = __lsx_vilvl_d(src21_l, src10_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(_src1, 0);
+ DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
+ in5, in6);
+ in7 = __lsx_vldx(_src1, src2_stride_3x);
+ _src1 += src2_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
+
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+ src4332 = __lsx_vilvl_d(src43_l, src32_l);
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src54_r, src65_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src54_l, src65_l);
+ src6554 = __lsx_vilvl_d(src65_l, src54_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src2110,
+ filt0, src32_r, filt0, dst0_r, dst1_r, dst0_l, dst2_r);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src43_r, filt0, src4332, filt0,
+ dst3_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
+ src43_r, filt1, dst0_l, src4332, filt1, dst2_r, src54_r,
+ filt1, dst0_r, dst1_r, dst0_l, dst2_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst1_l,
+ src6554, filt1, dst3_r, dst1_l);
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in1, dst1_r);
+ dst1_r = hevc_bi_rnd_clip(in2, dst2_r, in3, dst3_r);
+ dst0_l = hevc_bi_rnd_clip(in4, dst0_l, in5, dst1_l);
+ __lsx_vstelm_d(dst0_r, dst, 0, 0);
+ __lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
+ __lsx_vstelm_w(dst0_l, dst, 8, 0);
+ __lsx_vstelm_w(dst0_l, dst + dst_stride, 8, 1);
+ __lsx_vstelm_w(dst0_l, dst + dst_stride_2x, 8, 2);
+ __lsx_vstelm_w(dst0_l, dst + dst_stride_3x, 8, 3);
+ dst += dst_stride_4x;
+
+ src2 = src6;
+ src10_r = src54_r;
+ src21_r = src65_r;
+ src2110 = src6554;
+ }
+}
+
+static void hevc_vt_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i in0, in1, in2, in3;
+ __m128i src10_r, src32_r, src21_r, src43_r;
+ __m128i src10_l, src32_l, src21_l, src43_l;
+ __m128i dst0_r, dst1_r, dst0_l, dst1_l;
+ __m128i filt0, filt1;
+
+ src0_ptr -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src0_ptr += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src0_ptr, 0);
+ src4 = __lsx_vldx(src0_ptr, src_stride);
+ src0_ptr += src_stride_2x;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
+ filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
+ filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, filt1,
+ dst0_r, dst1_r, dst0_l, dst1_l);
+
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride);
+ dst += dst_stride_2x;
+
+ src5 = __lsx_vld(src0_ptr, 0);
+ src2 = __lsx_vldx(src0_ptr, src_stride);
+ src0_ptr += src_stride_2x;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+ src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride);
+ dst += dst_stride_2x;
+ }
+}
+
+static void hevc_vt_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t dst_stride_2x = dst_stride << 1;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i src6, src7, src8, src9, src10, src11;
+ __m128i in0, in1, in2, in3, in4, in5;
+ __m128i src10_r, src32_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src87_r, src109_r;
+ __m128i src10_l, src32_l, src21_l, src43_l;
+ __m128i dst0_r, dst1_r, dst2_r, dst3_r;
+ __m128i dst0_l, dst1_l;
+ __m128i filt0, filt1;
+
+ src0_ptr -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+
+ /* 16width */
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src6);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src1, src7);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+ /* 8width */
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ /* 16width */
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src3, src9);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src10);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
+ in4 = __lsx_vld(src1_ptr, 32);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
+ in5 = __lsx_vld(src1_ptr, 32);
+ src1_ptr += src2_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+ /* 8width */
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+ /* 16width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
+ src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, filt1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ /* 8width */
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+ dst2_r, dst3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
+ src109_r, filt1, dst2_r, dst3_r);
+ /* 16width */
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+ dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride);
+ __lsx_vstelm_d(dst2_r, dst, 16, 0);
+ __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
+ dst += dst_stride_2x;
+
+ /* 16width */
+ DUP4_ARG2(__lsx_vld, src0_ptr, 0, src1_ptr, 0, src1_ptr, 16, src1_ptr,
+ 32, src5, in0, in2, in4);
+ src1_ptr += src2_stride;
+ DUP4_ARG2(__lsx_vld, src0_ptr, 16, src1_ptr, 0, src1_ptr, 16, src1_ptr,
+ 32, src11, in1, in3, in5);
+ src1_ptr += src2_stride;
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
+ src0_ptr += src_stride;
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+ /* 8width */
+ DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+ /* 16width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
+ src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
+ filt1, dst0_r, dst0_l, dst1_r, dst1_l);
+
+ /* 8width */
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+ dst2_r, dst3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r,
+ src87_r, filt1, dst2_r, dst3_r);
+
+ dst0_r = hevc_bi_rnd_clip(in0, dst0_r, in2, dst0_l);
+ dst1_r = hevc_bi_rnd_clip(in1, dst1_r, in3, dst1_l);
+ dst2_r = hevc_bi_rnd_clip(in4, dst2_r, in5, dst3_r);
+ __lsx_vst(dst0_r, dst, 0);
+ __lsx_vstx(dst1_r, dst, dst_stride);
+ __lsx_vstelm_d(dst2_r, dst, 16, 0);
+ __lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
+ dst += dst_stride_2x;
+ }
+}
+
+static void hevc_vt_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ hevc_vt_4t_16w_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter, height);
+ hevc_vt_4t_16w_lsx(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
+ dst + 16, dst_stride, filter, height);
+}
+
+static void hevc_hv_4t_6w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_2x = (src2_stride << 1);
+ int32_t src2_stride_4x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride;
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, mask1;
+ __m128i filt0, filt1, filt_h0, filt_h1;
+ __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5;
+ __m128i dsth6, dsth7, dsth8, dsth9, dsth10;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst4_r, dst5_r, dst6_r, dst7_r;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src0_ptr -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filt_h1 = __lsx_vld(filter_y, 0);
+ filt_h1 = __lsx_vsllwil_h_b(filt_h1, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filt_h1, 0, filt_h1, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src0_ptr += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
+ dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
+ dsth0, dsth1);
+ dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, tmp1, tmp3);
+
+ src3 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dsth3, dsth4, dsth5, dsth6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4, vec3, filt1, dsth5,
+ vec5, filt1, dsth6, vec7, filt1, dsth3, dsth4, dsth5, dsth6);
+
+ src3 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src0_ptr, src_stride_3x);
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dsth7, dsth8, dsth9, dsth10);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth7, vec1, filt1, dsth8, vec3, filt1, dsth9,
+ vec5, filt1, dsth10, vec7, filt1, dsth7, dsth8, dsth9, dsth10);
+
+ DUP2_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, tmp4, tmp6);
+ DUP2_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, tmp5, tmp7);
+ DUP2_ARG2(__lsx_vilvl_h, dsth5, dsth4, dsth6, dsth5, dsth0, dsth2);
+ DUP2_ARG2(__lsx_vilvh_h, dsth5, dsth4, dsth6, dsth5, dsth1, dsth3);
+ DUP4_ARG2(__lsx_vdp2_w_h, tmp0, filt_h0, tmp2, filt_h0, tmp4, filt_h0,
+ tmp6, filt_h0, dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, tmp4, filt_h1, dst1_r, tmp6,
+ filt_h1, dst2_r, dsth0, filt_h1, dst3_r, dsth2, filt_h1,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP2_ARG2(__lsx_vpickev_d, tmp3, tmp1, tmp7, tmp5, tmp0, tmp8);
+ dst0_l = __lsx_vdp2_w_h(tmp0, filt_h0);
+ dst0_l = __lsx_vdp2add_w_h(dst0_l, tmp8, filt_h1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dsth7, dsth6, dsth8, dsth7, tmp0, tmp2);
+ DUP2_ARG2(__lsx_vilvh_h, dsth7, dsth6, dsth8, dsth7, tmp1, tmp3);
+ DUP2_ARG2(__lsx_vilvl_h, dsth9, dsth8, dsth10, dsth9, tmp4, tmp6);
+ DUP2_ARG2(__lsx_vilvh_h, dsth9, dsth8, dsth10, dsth9, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vdp2_w_h, dsth0, filt_h0, dsth2, filt_h0, tmp0, filt_h0,
+ tmp2, filt_h0, dst4_r, dst5_r, dst6_r, dst7_r);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, tmp0, filt_h1, dst5_r, tmp2,
+ filt_h1, dst6_r, tmp4, filt_h1, dst7_r, tmp6, filt_h1,
+ dst4_r, dst5_r, dst6_r, dst7_r);
+ DUP2_ARG2(__lsx_vpickev_d, dsth3, dsth1, tmp3, tmp1, tmp0, tmp1);
+ tmp2 = __lsx_vpickev_d(tmp7, tmp5);
+
+ DUP2_ARG2(__lsx_vdp2_w_h, tmp8, filt_h0, tmp0, filt_h0, dst1_l, dst2_l);
+ dst3_l = __lsx_vdp2_w_h(tmp1, filt_h0);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst1_l, tmp0, filt_h1, dst2_l, tmp1, filt_h1,
+ dst1_l, dst2_l);
+ dst3_l = __lsx_vdp2add_w_h(dst3_l, tmp2, filt_h1);
+
+ DUP4_ARG2(__lsx_vsrai_d, dst0_r, 6, dst1_r, 6, dst2_r, 6, dst3_r, 6,
+ dst0_r, dst1_r, dst2_r, dst3_r);
+ DUP4_ARG2(__lsx_vsrai_d, dst4_r, 6, dst5_r, 6, dst6_r, 6, dst7_r, 6,
+ dst4_r, dst5_r, dst6_r, dst7_r);
+ DUP4_ARG2(__lsx_vsrai_d, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
+ dst0_l, dst1_l, dst2_l, dst3_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vpickev_h, dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vpickev_h, dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
+
+ reg0 = __lsx_vldrepl_d(src1_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ dsth0 = __lsx_vilvl_d(reg1, reg0);
+ reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+ reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+ dsth1 = __lsx_vilvl_d(reg1, reg0);
+ src1_ptr += src2_stride_4x;
+ reg0 = __lsx_vldrepl_d(src1_ptr, 0);
+ reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
+ dsth2 = __lsx_vilvl_d(reg1, reg0);
+ reg0 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
+ reg1 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
+ dsth3 = __lsx_vilvl_d(reg1, reg0);
+
+ DUP4_ARG2(__lsx_vsadd_h, dsth0, tmp0, dsth1, tmp1, dsth2, tmp2, dsth3,
+ tmp3, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ __lsx_vstelm_w(out1, dst, 0, 0);
+ __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
+ dst -= dst_stride_4x;
+
+ src1_ptr -= src2_stride_4x;
+
+ reg0 = __lsx_vldrepl_w(src1_ptr, 8);
+ reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
+ reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
+ reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
+ DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ dsth4 = __lsx_vilvl_d(tmp1, tmp0);
+ src1_ptr += src2_stride_4x;
+
+ reg0 = __lsx_vldrepl_w(src1_ptr, 8);
+ reg1 = __lsx_vldrepl_w(src1_ptr + src2_stride, 8);
+ reg2 = __lsx_vldrepl_w(src1_ptr + src2_stride_2x, 8);
+ reg3 = __lsx_vldrepl_w(src1_ptr + src2_stride_3x, 8);
+ DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
+ dsth5 = __lsx_vilvl_d(tmp1, tmp0);
+ DUP2_ARG2(__lsx_vsadd_h, dsth4, tmp4, dsth5, tmp5, tmp4, tmp5);
+ DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 7, tmp4, tmp5);
+ out0 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
+
+ __lsx_vstelm_h(out0, dst, 4, 0);
+ __lsx_vstelm_h(out0, dst + dst_stride, 4, 1);
+ __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 2);
+ __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 3);
+ dst += dst_stride_4x;
+ __lsx_vstelm_h(out0, dst, 4, 4);
+ __lsx_vstelm_h(out0, dst + dst_stride, 4, 5);
+ __lsx_vstelm_h(out0, dst + dst_stride_2x, 4, 6);
+ __lsx_vstelm_h(out0, dst + dst_stride_3x, 4, 7);
+}
+
+static av_always_inline
+void hevc_hv_4t_8x2_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+ int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i out;
+ __m128i src0, src1, src2, src3, src4;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+ __m128i tmp0, tmp1;
+ __m128i in0, in1;
+
+ src0_ptr -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
+ src1, src2, src3, src4);
+
+ DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr + src2_stride, 0, in0, in1);
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+ vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp0, tmp1);
+ out = __lsx_vssrlrni_bu_h(tmp1, tmp0, 7);
+ __lsx_vstelm_d(out, dst, 0, 0);
+ __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
+}
+
+static av_always_inline
+void hevc_hv_4t_8multx4_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t width8mult)
+{
+ uint32_t cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+ __m128i in0, in1, in2, in3;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+ src0_ptr -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src5, src6);
+ src0_ptr += (8 - src_stride_4x);
+
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += 8;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+ dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+ dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += 8;
+ }
+}
+
+static av_always_inline
+void hevc_hv_4t_8x6_lsx(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr,
+ int32_t src2_stride, uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y)
+{
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src2_stride_x = (src2_stride << 1);
+ int32_t src2_stride_2x = (src2_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+
+ __m128i out0, out1, out2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i in0, in1, in2, in3, in4, in5;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+ __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+ __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+ __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+
+ src0_ptr -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src0_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src0_ptr, src_stride_3x);
+ src0_ptr += src_stride_4x;
+ src4 = __lsx_vld(src0_ptr, 0);
+ DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
+ src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
+ src5, src6, src7, src8);
+
+ in0 = __lsx_vld(src1_ptr, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
+ in1, in2);
+ in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
+ src1_ptr += src2_stride_2x;
+ in4 = __lsx_vld(src1_ptr, 0);
+ in5 = __lsx_vldx(src1_ptr, src2_stride_x);
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec10, vec11);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec12, vec13);
+ DUP2_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, vec14, vec15);
+ DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec10, filt0, vec12, filt0, vec14, filt0,
+ vec16, filt0, dst5, dst6, dst7, dst8);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+ vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec11, filt1, dst6, vec13, filt1,
+ dst7, vec15, filt1, dst8, vec17, filt1, dst5, dst6, dst7, dst8);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_r, dst21_r, dst32_r, dst43_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_l, dst21_l, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_r, dst65_r, dst76_r, dst87_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_l, dst65_l, dst76_l, dst87_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+ filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+ filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+ dst4_r, dst4_l, dst5_r, dst5_l);
+
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6,
+ dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
+ dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
+ DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vsadd_h, in4, tmp4, in5, tmp5, tmp4, tmp5);
+ DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
+ DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+ out2 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(out2, dst, 0, 0);
+ __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+}
+
+static av_always_inline
+void hevc_hv_4t_8multx4mult_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height, int32_t width)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src0_ptr_tmp;
+ int16_t *src1_ptr_tmp;
+ uint8_t *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src2_stride_x = (src2_stride << 1);
+ const int32_t src2_stride_2x = (src2_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ const int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i in0, in1, in2, in3;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1, filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
+
+ src0_ptr -= (src_stride + 1);
+
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width >> 3; cnt--;) {
+ src0_ptr_tmp = src0_ptr;
+ dst_tmp = dst;
+ src1_ptr_tmp = src1_ptr;
+
+ src0 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src1, src2);
+ src0_ptr_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ for (loop_cnt = height >> 2; loop_cnt--;) {
+ src3 = __lsx_vld(src0_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
+ src_stride_2x, src4, src5);
+ src6 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
+ src0_ptr_tmp += src_stride_4x;
+ in0 = __lsx_vld(src1_ptr_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
+ src2_stride_2x, in1, in2);
+ in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
+ src1_ptr_tmp += src2_stride_2x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+ src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+ src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+ filt1, dst5, vec5, filt1, dst6, vec7, filt1,
+ dst3, dst4, dst5, dst6);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+ dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+ dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
+ dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
+ tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, tmp0,
+ tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+ __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+ __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+ dst_tmp += dst_stride_4x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dst2 = dst6;
+ }
+
+ src0_ptr += 8;
+ dst += 8;
+ src1_ptr += 8;
+ }
+}
+
+static void hevc_hv_4t_8w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ if (2 == height) {
+ hevc_hv_4t_8x2_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y);
+ } else if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, 1);
+ } else if (6 == height) {
+ hevc_hv_4t_8x6_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y);
+ } else {
+ hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 8);
+ }
+}
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, 2);
+ } else {
+ hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 16);
+ }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src0_ptr, int32_t src_stride,
+ int16_t *src1_ptr, int32_t src2_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter_x, const int8_t *filter_y,
+ int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src0_ptr, src_stride, src1_ptr, src2_stride,
+ dst, dst_stride, filter_x, filter_y, height, 32);
+}
+
+#define BI_MC_COPY(WIDTH) \
+void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ hevc_bi_copy_##WIDTH##w_lsx(src, src_stride, src_16bit, MAX_PB_SIZE, \
+ dst, dst_stride, height); \
+}
+
+BI_MC_COPY(4);
+BI_MC_COPY(6);
+BI_MC_COPY(8);
+BI_MC_COPY(12);
+BI_MC_COPY(16);
+BI_MC_COPY(24);
+BI_MC_COPY(32);
+BI_MC_COPY(48);
+BI_MC_COPY(64);
+
+#undef BI_MC_COPY
+
+#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
+ \
+ hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit, \
+ MAX_PB_SIZE, dst, dst_stride, \
+ filter, height); \
+}
+
+BI_MC(qpel, h, 16, 8, hz, mx);
+BI_MC(qpel, h, 24, 8, hz, mx);
+BI_MC(qpel, h, 32, 8, hz, mx);
+BI_MC(qpel, h, 48, 8, hz, mx);
+BI_MC(qpel, h, 64, 8, hz, mx);
+
+BI_MC(qpel, v, 8, 8, vt, my);
+BI_MC(qpel, v, 16, 8, vt, my);
+BI_MC(qpel, v, 24, 8, vt, my);
+BI_MC(qpel, v, 32, 8, vt, my);
+BI_MC(qpel, v, 48, 8, vt, my);
+BI_MC(qpel, v, 64, 8, vt, my);
+
+BI_MC(epel, h, 24, 4, hz, mx);
+BI_MC(epel, h, 32, 4, hz, mx);
+
+BI_MC(epel, v, 12, 4, vt, my);
+BI_MC(epel, v, 16, 4, vt, my);
+BI_MC(epel, v, 24, 4, vt, my);
+BI_MC(epel, v, 32, 4, vt, my);
+
+#undef BI_MC
+
+#define BI_MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ \
+ hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit, \
+ MAX_PB_SIZE, dst, dst_stride, \
+ filter_x, filter_y, height); \
+}
+
+BI_MC_HV(qpel, 8, 8);
+BI_MC_HV(qpel, 16, 8);
+BI_MC_HV(qpel, 24, 8);
+BI_MC_HV(qpel, 32, 8);
+BI_MC_HV(qpel, 48, 8);
+BI_MC_HV(qpel, 64, 8);
+
+BI_MC_HV(epel, 8, 4);
+BI_MC_HV(epel, 6, 4);
+BI_MC_HV(epel, 16, 4);
+BI_MC_HV(epel, 24, 4);
+BI_MC_HV(epel, 32, 4);
+
+#undef BI_MC_HV
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index f39674be64..e05fe780c2 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -86,6 +86,57 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_8_lsx;
c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_lsx;
+ c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_lsx;
+ c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_lsx;
+ c->put_hevc_qpel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_lsx;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_lsx;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_lsx;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_lsx;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_lsx;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_lsx;
+
+ c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_hevc_bi_pel_pixels4_8_lsx;
+ c->put_hevc_epel_bi[2][0][0] = ff_hevc_put_hevc_bi_pel_pixels6_8_lsx;
+ c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_hevc_bi_pel_pixels8_8_lsx;
+ c->put_hevc_epel_bi[4][0][0] = ff_hevc_put_hevc_bi_pel_pixels12_8_lsx;
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_8_lsx;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_8_lsx;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_lsx;
+
+ c->put_hevc_qpel_bi[3][1][0] = ff_hevc_put_hevc_bi_qpel_v8_8_lsx;
+ c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_8_lsx;
+ c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_8_lsx;
+ c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_lsx;
+ c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_lsx;
+ c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_lsx;
+
+ c->put_hevc_qpel_bi[3][1][1] = ff_hevc_put_hevc_bi_qpel_hv8_8_lsx;
+ c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_8_lsx;
+ c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_8_lsx;
+ c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_8_lsx;
+ c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_8_lsx;
+ c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_8_lsx;
+
+ c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_8_lsx;
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_8_lsx;
+ c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_lsx;
+ c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_lsx;
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_lsx;
+
+ c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_8_lsx;
+ c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_lsx;
+
+ c->put_hevc_epel_bi[4][1][0] = ff_hevc_put_hevc_bi_epel_v12_8_lsx;
+ c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_8_lsx;
+ c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_8_lsx;
+ c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_lsx;
+
+ c->put_hevc_epel_bi[2][1][1] = ff_hevc_put_hevc_bi_epel_hv6_8_lsx;
+ c->put_hevc_epel_bi[3][1][1] = ff_hevc_put_hevc_bi_epel_hv8_8_lsx;
+ c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_8_lsx;
+ c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
+ c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
+
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 0e73fd1f8e..6a834290ab 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -85,6 +85,63 @@ MC(epel, hv, 32);
#undef MC
+#define BI_MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int16_t *src_16bit, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+BI_MC(pel, pixels, 4);
+BI_MC(pel, pixels, 6);
+BI_MC(pel, pixels, 8);
+BI_MC(pel, pixels, 12);
+BI_MC(pel, pixels, 16);
+BI_MC(pel, pixels, 24);
+BI_MC(pel, pixels, 32);
+BI_MC(pel, pixels, 48);
+BI_MC(pel, pixels, 64);
+
+BI_MC(qpel, h, 16);
+BI_MC(qpel, h, 24);
+BI_MC(qpel, h, 32);
+BI_MC(qpel, h, 48);
+BI_MC(qpel, h, 64);
+
+BI_MC(qpel, v, 8);
+BI_MC(qpel, v, 16);
+BI_MC(qpel, v, 24);
+BI_MC(qpel, v, 32);
+BI_MC(qpel, v, 48);
+BI_MC(qpel, v, 64);
+
+BI_MC(qpel, hv, 8);
+BI_MC(qpel, hv, 16);
+BI_MC(qpel, hv, 24);
+BI_MC(qpel, hv, 32);
+BI_MC(qpel, hv, 48);
+BI_MC(qpel, hv, 64);
+
+BI_MC(epel, h, 24);
+BI_MC(epel, h, 32);
+
+BI_MC(epel, v, 12);
+BI_MC(epel, v, 16);
+BI_MC(epel, v, 24);
+BI_MC(epel, v, 32);
+
+BI_MC(epel, hv, 6);
+BI_MC(epel, hv, 8);
+BI_MC(epel, hv, 16);
+BI_MC(epel, hv, 24);
+BI_MC(epel, hv, 32);
+
+#undef BI_MC
+
void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
int32_t beta, int32_t *tc,
uint8_t *p_is_pcm, uint8_t *q_is_pcm);
--
2.20.1
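
A scalar reference may help when reading the bi-prediction kernels in the patch above. The hevc_bi_rnd_clip() helper, and the equivalent inline __lsx_vsadd_h / __lsx_vmaxi_h / __lsx_vssrlrni_bu_h sequences, per pixel amount to roughly the sketch below: add the 16-bit reference block and the 16-bit filtered result, round by 64, shift right by 7 and clip to 8 bit. This is only an illustrative sketch, not code from the patch; the function and parameter names here are made up, and the intermediate 16-bit saturating add is ignored.

#include <stdint.h>
#include "libavutil/common.h"

/* Illustrative scalar form of the bi-prediction output stage for 8-bit:
 * ref is the 16-bit block from src1_ptr, filt is the 16-bit filtered
 * intermediate; rounding constant 64 and shift 7 correspond to the
 * __lsx_vssrlrni_bu_h(..., 7) narrowing in the LSX code. */
static void bi_round_clip_row(uint8_t *dst, const int16_t *ref,
                              const int16_t *filt, int width)
{
    int x;

    for (x = 0; x < width; x++)
        dst[x] = av_clip_uint8((ref[x] + filt[x] + 64) >> 7);
}
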
* [FFmpeg-devel] [PATCH v2 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
2022-02-17 11:11 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
` (3 preceding siblings ...)
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi " Hao Chen
@ 2022-02-17 11:11 ` Hao Chen
2022-02-17 12:08 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
5 siblings, 0 replies; 15+ messages in thread
From: Hao Chen @ 2022-02-17 11:11 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lu Wang
From: Lu Wang <wanglu@loongson.cn>
ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
before: 182fps
after : 191fps
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
libavcodec/loongarch/Makefile | 4 +-
libavcodec/loongarch/hevc_mc_uni_lsx.c | 1423 +++++++++++++++++
libavcodec/loongarch/hevc_mc_uniw_lsx.c | 298 ++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 30 +
libavcodec/loongarch/hevcdsp_lsx.h | 59 +
5 files changed, 1813 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/loongarch/hevc_mc_uni_lsx.c
create mode 100644 libavcodec/loongarch/hevc_mc_uniw_lsx.c
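
For the uni (unidirectional, no second 16-bit source) path added below, the per-pixel operation vectorized by kernels such as common_hz_8t_64w_lsx is: apply the 8-tap filter, round by 32, shift right by 6 and clip to 8 bit, which is what the __lsx_vssrarni_bu_h(..., 6) narrowing implements. The scalar sketch below is illustrative only, not code from the patch, and its names are made up here.

#include <stdint.h>
#include "libavutil/common.h"

/* Illustrative scalar form of the uni horizontal 8-tap filter for 8-bit.
 * src is assumed to be already adjusted by -3, as done by "src -= 3;"
 * in the LSX code, so taps k = 0..7 read src[x + k]. */
static void uni_hz_8tap_row(uint8_t *dst, const uint8_t *src,
                            const int8_t *filter, int width)
{
    int x, k;

    for (x = 0; x < width; x++) {
        int sum = 0;

        for (k = 0; k < 8; k++)
            sum += filter[k] * src[x + k];
        dst[x] = av_clip_uint8((sum + 32) >> 6);
    }
}
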
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 620fba7192..c1b5de5c44 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,4 +28,6 @@ LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_idct_lsx.o \
loongarch/hevc_lpf_sao_lsx.o \
- loongarch/hevc_mc_bi_lsx.o
+ loongarch/hevc_mc_bi_lsx.o \
+ loongarch/hevc_mc_uni_lsx.o \
+ loongarch/hevc_mc_uniw_lsx.o
diff --git a/libavcodec/loongarch/hevc_mc_uni_lsx.c b/libavcodec/loongarch/hevc_mc_uni_lsx.c
new file mode 100644
index 0000000000..a15c86268f
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_uni_lsx.c
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static av_always_inline
+void common_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ int32_t loop_cnt;
+ __m128i mask0, mask1, mask2, mask3, out1, out2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i res0, res1, res2, res3;
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ src -= 3;
+
+ /* rearranging filter */
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 8, src, 16, src, 24,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src, 32, src, 40, src, 48, src, 56,
+ src4, src5, src6, src7);
+ src += src_stride;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
+ res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
+ vec6, vec7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
+ res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
+ vec6, vec7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
+ res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
+ out1, out2);
+ __lsx_vst(out1, dst, 0);
+ __lsx_vst(out2, dst, 16);
+
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
+ vec2, vec3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
+ vec3, filt0, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
+ vec2, vec3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
+ res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
+ vec6, vec7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
+ res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
+ vec6, vec7);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
+ res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
+ out1, out2);
+ __lsx_vst(out1, dst, 32);
+ __lsx_vst(out2, dst, 48);
+ dst += dst_stride;
+ }
+}
+
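+/* 8-tap vertical filter, 8 columns wide: a sliding window of interleaved
+ * row pairs is reused across iterations, four output rows per pass. */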
+static av_always_inline
+void common_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_4x = (src_stride << 2);
+ int32_t dst_stride_4x = (dst_stride << 2);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ __m128i tmp0, tmp1;
+ __m128i out0_r, out1_r, out2_r, out3_r;
+
+ src -= src_stride_3x;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+ filt0, filt1, filt2, filt3);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+ filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
+ src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
+ filt1, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
+ src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
+ filt2, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
+ src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
+ filt3, out0_r, out1_r, out2_r, out3_r);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
+                  tmp0, tmp1);
+ __lsx_vstelm_d(tmp0, dst, 0, 0);
+ __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
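+/* 8-tap vertical filter for widths that are multiples of 16: the outer loop
+ * walks 16-column strips, the inner loop filters the low and high byte
+ * halves and writes four rows per pass. */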
+static av_always_inline
+void common_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter,
+ int32_t height, int32_t width)
+{
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= src_stride_3x;
+ DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, filt0,
+ filt1, filt2, filt3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+ DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_r, src32_r, src54_r, src21_r);
+ DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
+ DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+ src10_l, src32_l, src54_l, src21_l);
+ DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src8, src9);
+ src10 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_r, src87_r, src98_r, src109_r);
+ DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
+ src9, src76_l, src87_l, src98_l, src109_l);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
+ filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
+ src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
+ filt1, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
+ src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
+ filt2, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
+ src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
+ filt3, out0_r, out1_r, out2_r, out3_r);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
+ filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
+ src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
+ filt1, out0_l, out1_l, out2_l, out3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
+ src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
+ filt2, out0_l, out1_l, out2_l, out3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
+ src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
+ filt3, out0_l, out1_l, out2_l, out3_l);
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
+ 6, out2_l, out2_r, 6, out3_l, out3_r, 6,
+ tmp0, tmp1, tmp2, tmp3);
+ __lsx_vst(tmp0, dst_tmp, 0);
+ __lsx_vstx(tmp1, dst_tmp, dst_stride);
+ __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
+ __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
+ dst_tmp += dst_stride_4x;
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
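+/* The wider vertical cases are built on the 16-column routine above;
+ * the 24-wide case adds an extra 8-column pass. */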
+static void common_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 16);
+ common_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, filter,
+ height);
+}
+
+static void common_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
+}
+
+static void common_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 48);
+}
+
+static void common_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 64);
+}
+
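+/* 8-tap horizontal + vertical (hv) filter in 8-column strips: the horizontal
+ * pass keeps 16-bit intermediates, the vertical pass widens to 32 bits and
+ * writes two rows per iteration. */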
+static av_always_inline
+void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t width)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i out;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask1, mask2, mask3;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+ __m128i dst21_r, dst43_r, dst65_r, dst87_r;
+ __m128i dst21_l, dst43_l, dst65_l, dst87_l;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= (src_stride_3x + 3);
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (cnt = width >> 3; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+
+        /* rows 0 to 3 */
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+ vec12, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+ dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+ dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+ dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+ src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+ src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+ dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+ dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+ dst1, dst10_r, dst32_r, dst54_r, dst21_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+ dst1, dst10_l, dst32_l, dst54_l, dst21_l);
+ DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ src8 = __lsx_vldx(src_tmp, src_stride);
+ src_tmp += src_stride_2x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+ filt2, dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ dst76_r = __lsx_vilvl_h(dst7, dst6);
+ dst76_l = __lsx_vilvh_h(dst7, dst6);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+ dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+ dst76_l, filt_h3, dst0_r, dst0_l);
+ DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
+
+ DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
+ src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
+ dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
+ filt2, dst8, dst8);
+ dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
+
+ dst87_r = __lsx_vilvl_h(dst8, dst7);
+ dst87_l = __lsx_vilvh_h(dst8, dst7);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
+ dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
+ dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
+ dst87_l, filt_h3, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
+            DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l,
+ 6, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
+ dst0_l, dst0_r, dst1_l, dst1_r);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+ dst0, dst1);
+ out = __lsx_vpickev_b(dst1, dst0);
+ __lsx_vstelm_d(out, dst_tmp, 0, 0);
+ __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1);
+ dst_tmp += dst_stride_2x;
+
+ dst10_r = dst32_r;
+ dst32_r = dst54_r;
+ dst54_r = dst76_r;
+ dst10_l = dst32_l;
+ dst32_l = dst54_l;
+ dst54_l = dst76_l;
+ dst21_r = dst43_r;
+ dst43_r = dst65_r;
+ dst65_r = dst87_r;
+ dst21_l = dst43_l;
+ dst43_l = dst65_l;
+ dst65_l = dst87_l;
+ dst6 = dst8;
+ }
+ src += 8;
+ dst += 8;
+ }
+}
+
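+/* All hv 8-tap widths reuse hevc_hv_8t_8x2_lsx with the width passed through. */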
+static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 8);
+}
+
+static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 16);
+}
+
+static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 24);
+}
+
+static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 32);
+}
+
+static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 48);
+}
+
+static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 64);
+}
+
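+/* 4-tap vertical filter, 24 columns wide, split into a 16-column part and an
+ * 8-column part; four output rows are written per loop iteration. */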
+static av_always_inline
+void common_vt_4t_24w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i src11, filt0, filt1;
+ __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+ __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
+ __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
+ __m128i out1, out2, out3, out4;
+
+ src -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ _src = src + 16;
+
+ /* 16 width */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ /* 8 width */
+ src6 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+ src += src_stride_3x;
+ _src += src_stride_3x;
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ /* 16 width */
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ /* 8 width */
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+
+ /* 16 width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
+ filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
+ out0_r, out0_l, out1_r, out1_l);
+
+ /* 8 width */
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
+ out2_r, out3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
+ src109_r, filt1, out2_r, out3_r);
+
+ /* 16 + 8 width */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
+ out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4);
+ __lsx_vst(out1, dst, 0);
+ __lsx_vstelm_d(out2, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(out4, dst, 0);
+ __lsx_vstelm_d(out3, dst, 16, 0);
+ dst += dst_stride;
+
+ /* 16 width */
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
+ DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
+
+ /* 8 width */
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
+
+ /* 16 width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
+ filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l,
+ filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1,
+ out0_r, out0_l, out1_r, out1_l);
+
+ /* 8 width */
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
+ out2_r, out3_r);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r,
+ src87_r, filt1, out2_r, out3_r);
+
+ /* 16 + 8 width */
+ DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
+ out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);
+
+ __lsx_vst(out1, dst, 0);
+ __lsx_vstelm_d(out2, dst, 16, 0);
+ dst += dst_stride;
+ __lsx_vst(out3, dst, 0);
+ __lsx_vstelm_d(out4, dst, 16, 0);
+ dst += dst_stride;
+ }
+}
+
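+/* 4-tap vertical filter, 32 columns wide, handled as two 16-column halves
+ * with two output rows per iteration. */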
+static av_always_inline
+void common_vt_4t_32w_lsx(uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int8_t *filter, int32_t height)
+{
+ uint32_t loop_cnt;
+ int32_t src_stride_2x = (src_stride << 1);
+ int32_t dst_stride_2x = (dst_stride << 1);
+ int32_t src_stride_3x = src_stride_2x + src_stride;
+ uint8_t *_src;
+
+ __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
+ __m128i src10_r, src32_r, src76_r, src98_r;
+ __m128i src21_r, src43_r, src87_r, src109_r;
+ __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+ __m128i src10_l, src32_l, src76_l, src98_l;
+ __m128i src21_l, src43_l, src87_l, src109_l;
+ __m128i filt0, filt1;
+ __m128i out1, out2;
+
+ src -= src_stride;
+ DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+ _src = src + 16;
+
+ /* 16 width */
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+
+ DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+ DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+ /* next 16 width */
+ src6 = __lsx_vld(_src, 0);
+ DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
+ src += src_stride_3x;
+ _src += src_stride_3x;
+
+ DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
+ DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ /* 16 width */
+ DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
+ DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+ DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
+
+ /* 16 width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
+ filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
+ filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
+ out0_r, out0_l, out1_r, out1_l);
+
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
+ out1, out2);
+ __lsx_vst(out1, dst, 0);
+ __lsx_vstx(out2, dst, dst_stride);
+
+ src10_r = src32_r;
+ src21_r = src43_r;
+ src10_l = src32_l;
+ src21_l = src43_l;
+ src2 = src4;
+
+ /* next 16 width */
+ src += src_stride_2x;
+ _src += src_stride_2x;
+ DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
+ DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
+
+ /* next 16 width */
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
+ filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
+ filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
+ out2_r, out2_l, out3_r, out3_l);
+
+ /* next 16 width */
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
+ out1, out2);
+ __lsx_vst(out1, dst, 16);
+ __lsx_vst(out2, dst + dst_stride, 16);
+
+ dst += dst_stride_2x;
+
+ src76_r = src98_r;
+ src87_r = src109_r;
+ src76_l = src98_l;
+ src87_l = src109_l;
+ src8 = src10;
+ }
+}
+
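+/* 4-tap hv filter for a single 8x2 block, computed without a row loop. */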
+static av_always_inline
+void hevc_hv_4t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ __m128i out;
+ __m128i src0, src1, src2, src3, src4;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1, filter_vec;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i dst0, dst1, dst2, dst3, dst4;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+ __m128i out0_r, out1_r;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+ src0 = __lsx_vld(src, 0);
+ DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+ mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+ mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+ vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_r, dst21_r, dst32_r, dst43_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_l, dst21_l, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+ out0_r, out1_r);
+ out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
+ __lsx_vstelm_d(out, dst, 0, 0);
+ __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
+}
+
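+/* 4-tap hv filter for blocks of width 8 * width8mult and height 4. */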
+static av_always_inline
+void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t width8mult)
+{
+ uint32_t cnt;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src3 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src4 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
+ src += (8 - src_stride_4x);
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
+ vec4, vec5);
+ DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
+ vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
+ dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
+ dst5, dst32_r, dst43_r, dst54_r, dst65_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
+ dst5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+ dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += 8;
+ }
+}
+
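+/* 4-tap hv filter for a single 8x6 block, fully unrolled. */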
+static av_always_inline
+void hevc_hv_4t_8x6_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y)
+{
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i out0, out1, out2;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1, filter_vec;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst4_r, dst4_l, dst5_r, dst5_l;
+ __m128i dst10_r, dst32_r, dst10_l, dst32_l;
+ __m128i dst21_r, dst43_r, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l;
+ __m128i dst76_r, dst76_l, dst87_r, dst87_l;
+ __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src0 = __lsx_vld(src, 0);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
+ src += src_stride_4x;
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
+ src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
+ mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
+ mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
+ mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
+ mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
+ DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
+ filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
+ filt0, dst4, dst5, dst6, dst7);
+ dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
+ vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
+ vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
+ dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_r, dst21_r, dst32_r, dst43_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
+ dst10_l, dst21_l, dst32_l, dst43_l);
+ DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_r, dst65_r, dst76_r, dst87_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
+ dst54_l, dst65_l, dst76_l, dst87_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
+ filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
+ filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
+ dst4_r, dst4_l, dst5_r, dst5_l);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
+ dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
+ DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
+ out4_r, out5_r);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
+ out0, out1);
+ out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);
+
+ __lsx_vstelm_d(out0, dst, 0, 0);
+ __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
+ dst += dst_stride_4x;
+ __lsx_vstelm_d(out2, dst, 0, 0);
+ __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
+}
+
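+/* 4-tap hv filter for widths that are multiples of 8 and heights that are
+ * multiples of 4; loops over 8-column strips and 4-row groups. */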
+static av_always_inline
+void hevc_hv_4t_8multx4mult_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height,
+ int32_t width8mult)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6;
+ __m128i filt0, filt1;
+ __m128i filt_h0, filt_h1, filter_vec;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ __m128i mask1;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst10_r, dst32_r, dst21_r, dst43_r;
+ __m128i dst10_l, dst32_l, dst21_l, dst43_l;
+ __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
+ __m128i out0_r, out1_r, out2_r, out3_r;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ for (cnt = width8mult; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
+ vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
+ vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
+ vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
+ dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
+ dst0, dst1);
+ dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ src3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+ src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+ src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst3, dst4, dst5, dst6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
+ filt1, dst5, vec5, filt1, dst6, vec7, filt1,
+ dst3, dst4, dst5, dst6);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
+ dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
+ dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
+ dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
+ dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
+ dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
+ out2_r, out3_r);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
+ 6, out0, out1);
+ __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+ __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+ dst_tmp += dst_stride_4x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dst2 = dst6;
+ }
+ src += 8;
+ dst += 8;
+ }
+}
+
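+/* Pick the 8-wide 4-tap hv variant that matches the block height. */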
+static
+void hevc_hv_4t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ if (2 == height) {
+ hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
+ } else if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, 1);
+ } else if (6 == height) {
+ hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride, filter_x, filter_y);
+ } else if (0 == (height & 0x03)) {
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 1);
+ }
+}
+
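+/* 4-tap hv filter, 12 columns wide: an 8-column strip followed by a
+ * 4-column strip. */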
+static av_always_inline
+void hevc_hv_4t_12w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ uint32_t loop_cnt;
+ uint8_t *src_tmp, *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t dst_stride_4x = (dst_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+ const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m128i out0, out1;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i mask0, mask1, mask2, mask3;
+ __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
+ __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
+ __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106;
+ __m128i dst76_r, dst98_r, dst87_r, dst109_r;
+ __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ src -= (src_stride + 1);
+ DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
+
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
+
+ mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+ mask1 = __lsx_vaddi_bu(mask0, 2);
+
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src_tmp += src_stride_3x;
+
+ DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
+ dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
+ dsth0, dsth1);
+ dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
+
+ DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ src3 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src4, src5);
+ src6 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
+ src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
+ src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dsth3, dsth4, dsth5, dsth6);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
+ vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
+ dsth3, dsth4, dsth5, dsth6);
+
+ DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
+ dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
+ DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
+ dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
+ filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
+ filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
+ filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
+ dst0_r, dst0_l, dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
+ filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
+ dst2_r, dst2_l, dst3_r, dst3_l);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
+ dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+
+ __lsx_vstelm_d(out0, dst_tmp, 0, 0);
+ __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
+ __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
+ dst_tmp += dst_stride_4x;
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dsth2 = dsth6;
+ }
+
+ src += 8;
+ dst += 8;
+
+ mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
+ mask3 = __lsx_vaddi_bu(mask2, 2);
+
+ src0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
+ src += src_stride_3x;
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
+ DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
+
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
+ dst10, dst21);
+
+ dst10_r = __lsx_vilvl_h(dst21, dst10);
+ dst21_r = __lsx_vilvh_h(dst21, dst10);
+ dst22 = __lsx_vreplvei_d(dst21, 1);
+
+ for (loop_cnt = 2; loop_cnt--;) {
+ src3 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
+ src6 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ src7 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
+ src10 = __lsx_vldx(src, src_stride_3x);
+ src += src_stride_4x;
+ DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8,
+ src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10,
+ src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7);
+
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
+ vec6, filt0, dst73, dst84, dst95, dst106);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
+ filt1, dst95, vec5, filt1, dst106, vec7, filt1,
+ dst73, dst84, dst95, dst106);
+
+ dst32_r = __lsx_vilvl_h(dst73, dst22);
+ DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r);
+ DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
+ dst65_r = __lsx_vilvl_h(dst106, dst95);
+ dst109_r = __lsx_vilvh_h(dst106, dst95);
+ dst22 = __lsx_vreplvei_d(dst73, 1);
+ dst76_r = __lsx_vilvl_h(dst22, dst106);
+
+ DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
+ filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3);
+ DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
+ filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r,
+ filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1,
+ dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r,
+ filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1,
+ dst4, dst5, dst6, dst7);
+
+ DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4,
+ 6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3);
+ DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
+
+ __lsx_vstelm_w(out0, dst, 0, 0);
+ __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ __lsx_vstelm_w(out1, dst, 0, 0);
+ __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
+ __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
+ __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+
+ dst10_r = dst98_r;
+ dst21_r = dst109_r;
+ dst22 = __lsx_vreplvei_d(dst106, 1);
+ }
+}
+
+static void hevc_hv_4t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ if (4 == height) {
+ hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, 2);
+ } else {
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 2);
+ }
+}
+
+static void hevc_hv_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 3);
+}
+
+static void hevc_hv_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height)
+{
+ hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
+ filter_x, filter_y, height, 4);
+}
+
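+/* Generate the exported uni (non-weighted) put functions; DIR1 selects the
+ * horizontal or vertical helper and FILT_DIR the matching filter index. */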
+#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
+ \
+ common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
+ filter, height); \
+}
+
+UNI_MC(qpel, h, 64, 8, hz, mx);
+
+UNI_MC(qpel, v, 24, 8, vt, my);
+UNI_MC(qpel, v, 32, 8, vt, my);
+UNI_MC(qpel, v, 48, 8, vt, my);
+UNI_MC(qpel, v, 64, 8, vt, my);
+
+UNI_MC(epel, v, 24, 4, vt, my);
+UNI_MC(epel, v, 32, 4, vt, my);
+
+#undef UNI_MC
+
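+/* Generate the exported uni hv put functions, which take both an mx and an
+ * my filter. */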
+#define UNI_MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ \
+ hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
+ filter_x, filter_y, height); \
+}
+
+UNI_MC_HV(qpel, 8, 8);
+UNI_MC_HV(qpel, 16, 8);
+UNI_MC_HV(qpel, 24, 8);
+UNI_MC_HV(qpel, 32, 8);
+UNI_MC_HV(qpel, 48, 8);
+UNI_MC_HV(qpel, 64, 8);
+
+UNI_MC_HV(epel, 8, 4);
+UNI_MC_HV(epel, 12, 4);
+UNI_MC_HV(epel, 16, 4);
+UNI_MC_HV(epel, 24, 4);
+UNI_MC_HV(epel, 32, 4);
+
+#undef UNI_MC_HV
diff --git a/libavcodec/loongarch/hevc_mc_uniw_lsx.c b/libavcodec/loongarch/hevc_mc_uniw_lsx.c
new file mode 100644
index 0000000000..118f5b820e
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc_uniw_lsx.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ * Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "hevcdsp_lsx.h"
+
+static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
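+/* Weighted 8-tap hv filter in 8-column strips: the 32-bit vertical result is
+ * scaled by weight, rounded by rnd_val, offset and clipped to 8 bits. */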
+static av_always_inline
+void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val, int32_t width)
+{
+ uint32_t loop_cnt, cnt;
+ uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ const int32_t src_stride_2x = (src_stride << 1);
+ const int32_t dst_stride_2x = (dst_stride << 1);
+ const int32_t src_stride_4x = (src_stride << 2);
+ const int32_t src_stride_3x = src_stride_2x + src_stride;
+
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ __m128i filt0, filt1, filt2, filt3;
+ __m128i filt_h0, filt_h1, filt_h2, filt_h3;
+ __m128i mask1, mask2, mask3;
+ __m128i filter_vec;
+ __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+ __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+ __m128i dst0_r, dst0_l, dst1_r, dst1_l;
+ __m128i dst10_r, dst32_r, dst54_r, dst76_r;
+ __m128i dst10_l, dst32_l, dst54_l, dst76_l;
+ __m128i dst21_r, dst43_r, dst65_r, dst87_r;
+ __m128i dst21_l, dst43_l, dst65_l, dst87_l;
+ __m128i weight_vec, offset_vec, rnd_vec;
+ __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
+
+ src -= (src_stride_3x + 3);
+ weight_vec = __lsx_vreplgr2vr_w(weight);
+ offset_vec = __lsx_vreplgr2vr_w(offset);
+ rnd_vec = __lsx_vreplgr2vr_w(rnd_val);
+
+ DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
+ filter_x, 6, filt0, filt1, filt2, filt3);
+ filter_vec = __lsx_vld(filter_y, 0);
+ filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
+ DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
+ filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
+ DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+ mask3 = __lsx_vaddi_bu(mask0, 6);
+
+ for (cnt = width >> 3; cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ src0 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src1, src2);
+ src3 = __lsx_vldx(src_tmp, src_stride_3x);
+ src_tmp += src_stride_4x;
+ src4 = __lsx_vld(src_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
+ src5, src6);
+ src_tmp += src_stride_3x;
+
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
+ src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
+ src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
+ src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
+ src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
+ vec12, filt0, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
+ dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
+ dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
+ dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
+ DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
+ src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
+ src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
+ DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
+ src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
+ DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
+ dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
+ dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
+ DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
+ dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
+ dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
+
+ DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+ dst1, dst10_r, dst32_r, dst54_r, dst21_r);
+ DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
+ DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
+ dst1, dst10_l, dst32_l, dst54_l, dst21_l);
+ DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src7 = __lsx_vld(src_tmp, 0);
+ src8 = __lsx_vldx(src_tmp, src_stride);
+ src_tmp += src_stride_2x;
+ DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
+ src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
+ dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
+ filt2, dst7, dst7);
+ dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
+ dst76_r = __lsx_vilvl_h(dst7, dst6);
+ dst76_l = __lsx_vilvh_h(dst7, dst6);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
+ dst0_r, dst0_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
+ dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
+ dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
+ dst76_l, filt_h3, dst0_r, dst0_l);
+ DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
+
+ /* row 8 */
+ DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
+ src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
+ dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
+ DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
+ filt2, dst8, dst8);
+ dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
+
+ dst87_r = __lsx_vilvl_h(dst8, dst7);
+ dst87_l = __lsx_vilvh_h(dst8, dst7);
+ DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
+ dst1_r, dst1_l);
+ DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
+ dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
+ dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
+ DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
+ dst87_l, filt_h3, dst1_r, dst1_l);
+ DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
+
+ DUP2_ARG2(__lsx_vmul_w, dst0_r, weight_vec, dst0_l, weight_vec,
+ dst0_r, dst0_l);
+ DUP2_ARG2(__lsx_vmul_w, dst1_r, weight_vec, dst1_l, weight_vec,
+ dst1_r, dst1_l);
+ DUP4_ARG2(__lsx_vsrar_w, dst0_r, rnd_vec, dst1_r, rnd_vec, dst0_l,
+ rnd_vec, dst1_l, rnd_vec, dst0_r, dst1_r, dst0_l, dst1_l);
+
+ DUP2_ARG2(__lsx_vadd_w, dst0_r, offset_vec, dst0_l, offset_vec,
+ dst0_r, dst0_l);
+ DUP2_ARG2(__lsx_vadd_w, dst1_r, offset_vec, dst1_l, offset_vec,
+ dst1_r, dst1_l);
+ DUP4_ARG1(__lsx_vclip255_w, dst0_r, dst1_r, dst0_l, dst1_l, dst0_r,
+ dst1_r, dst0_l, dst1_l);
+ DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
+ dst0_r, dst1_r);
+ dst0_r = __lsx_vpickev_b(dst1_r, dst0_r);
+
+ __lsx_vstelm_d(dst0_r, dst_tmp, 0, 0);
+ __lsx_vstelm_d(dst0_r, dst_tmp + dst_stride, 0, 1);
+ dst_tmp += dst_stride_2x;
+
+ dst10_r = dst32_r;
+ dst32_r = dst54_r;
+ dst54_r = dst76_r;
+ dst10_l = dst32_l;
+ dst32_l = dst54_l;
+ dst54_l = dst76_l;
+ dst21_r = dst43_r;
+ dst43_r = dst65_r;
+ dst65_r = dst87_r;
+ dst21_l = dst43_l;
+ dst43_l = dst65_l;
+ dst65_l = dst87_l;
+ dst6 = dst8;
+ }
+
+ src += 8;
+ dst += 8;
+ }
+}
+
+static
+void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 8);
+}
+
+static
+void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 16);
+}
+
+static
+void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 24);
+}
+
+static
+void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 32);
+}
+
+static
+void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 48);
+}
+
+static
+void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
+ int32_t dst_stride, const int8_t *filter_x,
+ const int8_t *filter_y, int32_t height, int32_t weight,
+ int32_t offset, int32_t rnd_val)
+{
+ hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride, filter_x,
+ filter_y, height, weight, offset, rnd_val, 64);
+}
+
+
+#define UNI_W_MC_HV(PEL, WIDTH, TAP) \
+void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ int denom, \
+ int weight, \
+ int offset, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width) \
+{ \
+ const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
+ const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
+ int shift = denom + 14 - 8; \
+ \
+ hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, filter_x,\
+ filter_y, height, weight, offset, shift); \
+}
+
+UNI_W_MC_HV(qpel, 8, 8);
+UNI_W_MC_HV(qpel, 16, 8);
+UNI_W_MC_HV(qpel, 24, 8);
+UNI_W_MC_HV(qpel, 32, 8);
+UNI_W_MC_HV(qpel, 48, 8);
+UNI_W_MC_HV(qpel, 64, 8);
+
+#undef UNI_W_MC_HV
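
Not part of the patch, only a reading aid: the vmul/vsrar/vadd/vclip255
sequence in hevc_hv_8t_8x2_lsx() applies the uni weighted-prediction step to
each horizontally+vertically filtered sample (already shifted right by 6 in
the inner loop), with rnd_val carrying the shift = denom + 14 - 8 computed in
UNI_W_MC_HV. A scalar sketch of that per-pixel step:

#include <stdint.h>

/* Scalar sketch, not from the patch: one output pixel of the uni weighted
 * HV path. `val` is the filtered sample after the >> 6, rnd_val is the
 * denom + 6 shift passed down from UNI_W_MC_HV. */
static inline uint8_t uniw_pixel_ref(int32_t val, int32_t weight,
                                     int32_t offset, int32_t rnd_val)
{
    int64_t v = (int64_t)val * weight;          /* cf. __lsx_vmul_w     */
    v = (v + (1 << (rnd_val - 1))) >> rnd_val;  /* cf. __lsx_vsrar_w    */
    v += offset;                                /* cf. __lsx_vadd_w     */
    if (v < 0)   v = 0;                         /* cf. __lsx_vclip255_w */
    if (v > 255) v = 255;
    return (uint8_t)v;
}
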
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index e05fe780c2..22739c6f5b 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -137,6 +137,36 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_8_lsx;
c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_lsx;
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_lsx;
+
+ c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_8_lsx;
+ c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_lsx;
+ c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_lsx;
+ c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_lsx;
+
+ c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_lsx;
+ c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_lsx;
+ c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_lsx;
+ c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_lsx;
+ c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_lsx;
+ c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_lsx;
+
+ c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_lsx;
+ c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_lsx;
+
+ c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_lsx;
+ c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_lsx;
+ c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_lsx;
+ c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
+ c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
+
+ c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
+ c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
+ c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
+ c->put_hevc_qpel_uni_w[7][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv32_8_lsx;
+ c->put_hevc_qpel_uni_w[8][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv48_8_lsx;
+ c->put_hevc_qpel_uni_w[9][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv64_8_lsx;
+
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_8_lsx;
c->hevc_h_loop_filter_luma = ff_hevc_loop_filter_luma_h_8_lsx;
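
A note on the table assignments above (not part of the patch): the first
index of put_hevc_qpel_uni[_w] / put_hevc_epel_uni selects the block width
and the last two indices are the vertical/horizontal filter flags, so e.g.
put_hevc_qpel_uni_w[7][1][1] is the weighted uni qpel HV kernel for 32-wide
blocks. The mapping below is only a sketch inferred from the assignments in
this hunk (and, as far as I can tell, it matches ff_hevc_pel_weight in
hevcdsp.c):

/* Sketch only: block width -> first index of the put_hevc_* tables, as used
 * by the assignments in this hunk. Smaller widths exist but are omitted. */
static int hevc_width_to_idx(int width)
{
    switch (width) {
    case 8:  return 3;
    case 12: return 4;
    case 16: return 5;
    case 24: return 6;
    case 32: return 7;
    case 48: return 8;
    case 64: return 9;
    default: return -1; /* not covered by this sketch */
    }
}
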
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 6a834290ab..0c517af887 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -142,6 +142,65 @@ BI_MC(epel, hv, 32);
#undef BI_MC
+#define UNI_MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int height, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+UNI_MC(qpel, h, 64);
+
+UNI_MC(qpel, v, 24);
+UNI_MC(qpel, v, 32);
+UNI_MC(qpel, v, 48);
+UNI_MC(qpel, v, 64);
+
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
+UNI_MC(epel, v, 24);
+UNI_MC(epel, v, 32);
+
+UNI_MC(epel, hv, 8);
+UNI_MC(epel, hv, 12);
+UNI_MC(epel, hv, 16);
+UNI_MC(epel, hv, 24);
+UNI_MC(epel, hv, 32);
+
+#undef UNI_MC
+
+#define UNI_W_MC(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t \
+ dst_stride, \
+ uint8_t *src, \
+ ptrdiff_t \
+ src_stride, \
+ int height, \
+ int denom, \
+ int weight, \
+ int offset, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+UNI_W_MC(qpel, hv, 8);
+UNI_W_MC(qpel, hv, 16);
+UNI_W_MC(qpel, hv, 24);
+UNI_W_MC(qpel, hv, 32);
+UNI_W_MC(qpel, hv, 48);
+UNI_W_MC(qpel, hv, 64);
+
+#undef UNI_W_MC
+
void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
int32_t beta, int32_t *tc,
uint8_t *p_is_pcm, uint8_t *q_is_pcm);
--
2.20.1
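
As a reading aid for the hevcdsp_lsx.h diff (hand-expanded, so treat it as a
sketch): UNI_W_MC(qpel, hv, 8) declares the prototype below, which is exactly
the function the UNI_W_MC_HV wrapper earlier in this patch defines. The types
come from <stdint.h>/<stddef.h>.

void ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                           uint8_t *src, ptrdiff_t src_stride,
                                           int height, int denom, int weight,
                                           int offset, intptr_t mx, intptr_t my,
                                           int width);
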
* Re: [FFmpeg-devel] Optimize HEVC decoding for loongarch
2022-02-17 11:11 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
` (4 preceding siblings ...)
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w " Hao Chen
@ 2022-02-17 12:08 ` 殷时友
2022-03-01 22:53 ` Michael Niedermayer
5 siblings, 1 reply; 15+ messages in thread
From: 殷时友 @ 2022-02-17 12:08 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> On Feb 17, 2022, at 7:11 PM, Hao Chen <chenhao@loongson.cn> wrote:
>
> ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
> before: 94fps
> after : 191fps
>
> V2: Retrigger the patchwork.ffmpeg test.
>
> [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to
> [PATCH v2 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
> [PATCH v2 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
> [PATCH v2 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
> [PATCH v2 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
>
LGTM
* Re: [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
@ 2022-02-21 11:11 ` Hao Chen
2022-02-21 11:42 ` 殷时友
2022-02-21 15:00 ` Michael Niedermayer
1 sibling, 1 reply; 15+ messages in thread
From: Hao Chen @ 2022-02-21 11:11 UTC (permalink / raw)
To: ffmpeg-devel
Ping.
Any more opinions on this?
On 2022/2/17 at 7:11 PM, Hao Chen wrote:
> The loongson_intrinsics.h file is updated from v1.0.3 version
> to v1.1.0. Some spelling mistakes are fixed and new functions are added.
>
> Signed-off-by: Hao Chen <chenhao@loongson.cn>
> ---
> libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
> 1 file changed, 884 insertions(+), 813 deletions(-)
>
> diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
> index 6e0439f829..eb256863c8 100644
> --- a/libavutil/loongarch/loongson_intrinsics.h
> +++ b/libavutil/loongarch/loongson_intrinsics.h
> @@ -33,7 +33,7 @@
> * Xiwei Gu <guxiwei-hf@loongson.cn>
> * Lu Wang <wanglu@loongson.cn>
> *
> - * This file is a header file for loongarch builtin extention.
> + * This file is a header file for loongarch builtin extension.
> *
> */
>
> @@ -42,50 +42,50 @@
>
> /**
> * MAJOR version: Macro usage changes.
> - * MINOR version: Add new functions, or bug fix.
> + * MINOR version: Add new functions, or bug fixes.
> * MICRO version: Comment changes or implementation changes.
> */
> #define LSOM_VERSION_MAJOR 1
> -#define LSOM_VERSION_MINOR 0
> -#define LSOM_VERSION_MICRO 3
> +#define LSOM_VERSION_MINOR 1
> +#define LSOM_VERSION_MICRO 0
>
> #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
> -{ \
> - _OUT0 = _INS(_IN0); \
> - _OUT1 = _INS(_IN1); \
> -}
> + { \
> + _OUT0 = _INS(_IN0); \
> + _OUT1 = _INS(_IN1); \
> + }
>
> #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
> -{ \
> - _OUT0 = _INS(_IN0, _IN1); \
> - _OUT1 = _INS(_IN2, _IN3); \
> -}
> + { \
> + _OUT0 = _INS(_IN0, _IN1); \
> + _OUT1 = _INS(_IN2, _IN3); \
> + }
>
> #define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
> -{ \
> - _OUT0 = _INS(_IN0, _IN1, _IN2); \
> - _OUT1 = _INS(_IN3, _IN4, _IN5); \
> -}
> + { \
> + _OUT0 = _INS(_IN0, _IN1, _IN2); \
> + _OUT1 = _INS(_IN3, _IN4, _IN5); \
> + }
>
> #define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
> -{ \
> - DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
> - DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
> -}
> -
> -#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
> - _OUT0, _OUT1, _OUT2, _OUT3) \
> -{ \
> - DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
> - DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
> -}
> -
> -#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
> - _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
> -{ \
> - DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
> - DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
> -}
> + { \
> + DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
> + DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
> + }
> +
> +#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
> + _OUT1, _OUT2, _OUT3) \
> + { \
> + DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
> + DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
> + }
> +
> +#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
> + _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
> + { \
> + DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
> + DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
> + }
>
> #ifdef __loongarch_sx
> #include <lsxintrin.h>
> @@ -94,11 +94,11 @@
> * Description : Dot product & addition of byte vector elements
> * Arguments : Inputs - in_c, in_h, in_l
> * Outputs - out
> - * Retrun Type - halfword
> + * Return Type - halfword
> * Details : Signed byte elements from in_h are multiplied by
> * signed byte elements from in_l, and then added adjacent to
> * each other to get results with the twice size of input.
> - * Then the results plus to signed half word elements from in_c.
> + * Then the results plus to signed half-word elements from in_c.
> * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
> * in_c : 1,2,3,4, 1,2,3,4
> * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
> @@ -106,13 +106,13 @@
> * out : 23,40,41,26, 23,40,41,26
> * =============================================================================
> */
> -static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
> + __m128i in_l) {
> + __m128i out;
>
> - out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
> - out = __lsx_vmaddwod_h_b(out, in_h, in_l);
> - return out;
> + out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
> + out = __lsx_vmaddwod_h_b(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -120,35 +120,61 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l
> * Description : Dot product & addition of byte vector elements
> * Arguments : Inputs - in_c, in_h, in_l
> * Outputs - out
> - * Retrun Type - halfword
> + * Return Type - halfword
> * Details : Unsigned byte elements from in_h are multiplied by
> * unsigned byte elements from in_l, and then added adjacent to
> * each other to get results with the twice size of input.
> - * The results plus to signed half word elements from in_c.
> - * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
> + * The results plus to signed half-word elements from in_c.
> + * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
> * in_c : 1,2,3,4, 1,2,3,4
> * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
> * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
> * out : 23,40,41,26, 23,40,41,26
> * =============================================================================
> */
> -static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
> + __m128i in_l) {
> + __m128i out;
> +
> + out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
> + out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
> + return out;
> +}
> +
> +/*
> + * =============================================================================
> + * Description : Dot product & addition of byte vector elements
> + * Arguments : Inputs - in_c, in_h, in_l
> + * Outputs - out
> + * Return Type - halfword
> + * Details : Unsigned byte elements from in_h are multiplied by
> + * signed byte elements from in_l, and then added adjacent to
> + * each other to get results with the twice size of input.
> + * The results plus to signed half-word elements from in_c.
> + * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
> + * in_c : 1,1,1,1, 1,1,1,1
> + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
> + * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
> + * out : -4,-24,-60,-112, 6,26,62,114
> + * =============================================================================
> + */
> +static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
> + __m128i in_l) {
> + __m128i out;
>
> - out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
> - out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
> - return out;
> + out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
> + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
> + return out;
> }
>
> /*
> * =============================================================================
> - * Description : Dot product & addition of half word vector elements
> + * Description : Dot product & addition of half-word vector elements
> * Arguments : Inputs - in_c, in_h, in_l
> * Outputs - out
> - * Retrun Type - __m128i
> - * Details : Signed half word elements from in_h are multiplied by
> - * signed half word elements from in_l, and then added adjacent to
> + * Return Type - __m128i
> + * Details : Signed half-word elements from in_h are multiplied by
> + * signed half-word elements from in_l, and then added adjacent to
> * each other to get results with the twice size of input.
> * Then the results plus to signed word elements from in_c.
> * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
> @@ -158,13 +184,13 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_
> * out : 23,40,41,26
> * =============================================================================
> */
> -static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
> + __m128i in_l) {
> + __m128i out;
>
> - out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
> - out = __lsx_vmaddwod_w_h(out, in_h, in_l);
> - return out;
> + out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
> + out = __lsx_vmaddwod_w_h(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -172,7 +198,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
> * Description : Dot product of byte vector elements
> * Arguments : Inputs - in_h, in_l
> * Outputs - out
> - * Retrun Type - halfword
> + * Return Type - halfword
> * Details : Signed byte elements from in_h are multiplied by
> * signed byte elements from in_l, and then added adjacent to
> * each other to get results with the twice size of input.
> @@ -182,13 +208,12 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
> * out : 22,38,38,22, 22,38,38,22
> * =============================================================================
> */
> -static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
> + __m128i out;
>
> - out = __lsx_vmulwev_h_b(in_h, in_l);
> - out = __lsx_vmaddwod_h_b(out, in_h, in_l);
> - return out;
> + out = __lsx_vmulwev_h_b(in_h, in_l);
> + out = __lsx_vmaddwod_h_b(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -196,7 +221,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
> * Description : Dot product of byte vector elements
> * Arguments : Inputs - in_h, in_l
> * Outputs - out
> - * Retrun Type - halfword
> + * Return Type - halfword
> * Details : Unsigned byte elements from in_h are multiplied by
> * unsigned byte elements from in_l, and then added adjacent to
> * each other to get results with the twice size of input.
> @@ -206,13 +231,12 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
> * out : 22,38,38,22, 22,38,38,22
> * =============================================================================
> */
> -static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
> + __m128i out;
>
> - out = __lsx_vmulwev_h_bu(in_h, in_l);
> - out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
> - return out;
> + out = __lsx_vmulwev_h_bu(in_h, in_l);
> + out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -220,7 +244,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
> * Description : Dot product of byte vector elements
> * Arguments : Inputs - in_h, in_l
> * Outputs - out
> - * Retrun Type - halfword
> + * Return Type - halfword
> * Details : Unsigned byte elements from in_h are multiplied by
> * signed byte elements from in_l, and then added adjacent to
> * each other to get results with the twice size of input.
> @@ -230,13 +254,12 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
> * out : 22,38,38,22, 22,38,38,6
> * =============================================================================
> */
> -static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
> + __m128i out;
>
> - out = __lsx_vmulwev_h_bu_b(in_h, in_l);
> - out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
> - return out;
> + out = __lsx_vmulwev_h_bu_b(in_h, in_l);
> + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -244,7 +267,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
> * Description : Dot product of byte vector elements
> * Arguments : Inputs - in_h, in_l
> * Outputs - out
> - * Retrun Type - halfword
> + * Return Type - halfword
> * Details : Signed byte elements from in_h are multiplied by
> * signed byte elements from in_l, and then added adjacent to
> * each other to get results with the twice size of input.
> @@ -254,19 +277,19 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
> * out : 22,38,38,22
> * =============================================================================
> */
> -static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
> + __m128i out;
>
> - out = __lsx_vmulwev_w_h(in_h, in_l);
> - out = __lsx_vmaddwod_w_h(out, in_h, in_l);
> - return out;
> + out = __lsx_vmulwev_w_h(in_h, in_l);
> + out = __lsx_vmaddwod_w_h(out, in_h, in_l);
> + return out;
> }
>
> /*
> * =============================================================================
> * Description : Clip all halfword elements of input vector between min & max
> - * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
> + * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
> + * (_in))
> * Arguments : Inputs - _in (input vector)
> * - min (min threshold)
> * - max (max threshold)
> @@ -279,13 +302,12 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
> * out : 1,2,9,9, 1,9,9,9
> * =============================================================================
> */
> -static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
> + __m128i out;
>
> - out = __lsx_vmax_h(min, _in);
> - out = __lsx_vmin_h(max, out);
> - return out;
> + out = __lsx_vmax_h(min, _in);
> + out = __lsx_vmin_h(max, out);
> + return out;
> }
>
> /*
> @@ -293,20 +315,19 @@ static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
> * Description : Set each element of vector between 0 and 255
> * Arguments : Inputs - _in
> * Outputs - out
> - * Retrun Type - halfword
> + * Return Type - halfword
> * Details : Signed byte elements from _in are clamped between 0 and 255.
> * Example : out = __lsx_vclip255_h(_in)
> * _in : -8,255,280,249, -8,255,280,249
> * out : 0,255,255,249, 0,255,255,249
> * =============================================================================
> */
> -static inline __m128i __lsx_vclip255_h(__m128i _in)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vclip255_h(__m128i _in) {
> + __m128i out;
>
> - out = __lsx_vmaxi_h(_in, 0);
> - out = __lsx_vsat_hu(out, 7);
> - return out;
> + out = __lsx_vmaxi_h(_in, 0);
> + out = __lsx_vsat_hu(out, 7);
> + return out;
> }
>
> /*
> @@ -314,20 +335,19 @@ static inline __m128i __lsx_vclip255_h(__m128i _in)
> * Description : Set each element of vector between 0 and 255
> * Arguments : Inputs - _in
> * Outputs - out
> - * Retrun Type - word
> + * Return Type - word
> * Details : Signed byte elements from _in are clamped between 0 and 255.
> * Example : out = __lsx_vclip255_w(_in)
> * _in : -8,255,280,249
> * out : 0,255,255,249
> * =============================================================================
> */
> -static inline __m128i __lsx_vclip255_w(__m128i _in)
> -{
> - __m128i out;
> +static inline __m128i __lsx_vclip255_w(__m128i _in) {
> + __m128i out;
>
> - out = __lsx_vmaxi_w(_in, 0);
> - out = __lsx_vsat_wu(out, 7);
> - return out;
> + out = __lsx_vmaxi_w(_in, 0);
> + out = __lsx_vsat_wu(out, 7);
> + return out;
> }
>
> /*
> @@ -343,12 +363,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * _in1(out) : 1,2,3,4
> * =============================================================================
> */
> -#define LSX_SWAP(_in0, _in1) \
> -{ \
> - _in0 = __lsx_vxor_v(_in0, _in1); \
> - _in1 = __lsx_vxor_v(_in0, _in1); \
> - _in0 = __lsx_vxor_v(_in0, _in1); \
> -} \
> +#define LSX_SWAP(_in0, _in1) \
> + { \
> + _in0 = __lsx_vxor_v(_in0, _in1); \
> + _in1 = __lsx_vxor_v(_in0, _in1); \
> + _in0 = __lsx_vxor_v(_in0, _in1); \
> + }
>
> /*
> * =============================================================================
> @@ -364,25 +384,27 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * =============================================================================
> */
> #define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> + { \
> __m128i _t0, _t1, _t2, _t3; \
> \
> - _t0 = __lsx_vilvl_w(_in1, _in0); \
> - _t1 = __lsx_vilvh_w(_in1, _in0); \
> - _t2 = __lsx_vilvl_w(_in3, _in2); \
> - _t3 = __lsx_vilvh_w(_in3, _in2); \
> + _t0 = __lsx_vilvl_w(_in1, _in0); \
> + _t1 = __lsx_vilvh_w(_in1, _in0); \
> + _t2 = __lsx_vilvl_w(_in3, _in2); \
> + _t3 = __lsx_vilvh_w(_in3, _in2); \
> _out0 = __lsx_vilvl_d(_t2, _t0); \
> _out1 = __lsx_vilvh_d(_t2, _t0); \
> _out2 = __lsx_vilvl_d(_t3, _t1); \
> _out3 = __lsx_vilvh_d(_t3, _t1); \
> -}
> + }
>
> /*
> * =============================================================================
> * Description : Transpose 8x8 block with byte elements in vectors
> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
> - * Details : The rows of the matrix become columns, and the columns become rows.
> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
> + * _out7
> + * Details : The rows of the matrix become columns, and the columns
> + * become rows.
> * Example : LSX_TRANSPOSE8x8_B
> * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
> * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
> @@ -403,34 +425,35 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
> * =============================================================================
> */
> -#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - __m128i zero = {0}; \
> - __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
> - __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
> - \
> - _t0 = __lsx_vilvl_b(_in2, _in0); \
> - _t1 = __lsx_vilvl_b(_in3, _in1); \
> - _t2 = __lsx_vilvl_b(_in6, _in4); \
> - _t3 = __lsx_vilvl_b(_in7, _in5); \
> - _t4 = __lsx_vilvl_b(_t1, _t0); \
> - _t5 = __lsx_vilvh_b(_t1, _t0); \
> - _t6 = __lsx_vilvl_b(_t3, _t2); \
> - _t7 = __lsx_vilvh_b(_t3, _t2); \
> - _out0 = __lsx_vilvl_w(_t6, _t4); \
> - _out2 = __lsx_vilvh_w(_t6, _t4); \
> - _out4 = __lsx_vilvl_w(_t7, _t5); \
> - _out6 = __lsx_vilvh_w(_t7, _t5); \
> - _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
> - _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
> - _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
> - _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
> -}
> +#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + __m128i zero = { 0 }; \
> + __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
> + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
> + \
> + _t0 = __lsx_vilvl_b(_in2, _in0); \
> + _t1 = __lsx_vilvl_b(_in3, _in1); \
> + _t2 = __lsx_vilvl_b(_in6, _in4); \
> + _t3 = __lsx_vilvl_b(_in7, _in5); \
> + _t4 = __lsx_vilvl_b(_t1, _t0); \
> + _t5 = __lsx_vilvh_b(_t1, _t0); \
> + _t6 = __lsx_vilvl_b(_t3, _t2); \
> + _t7 = __lsx_vilvh_b(_t3, _t2); \
> + _out0 = __lsx_vilvl_w(_t6, _t4); \
> + _out2 = __lsx_vilvh_w(_t6, _t4); \
> + _out4 = __lsx_vilvl_w(_t7, _t5); \
> + _out6 = __lsx_vilvh_w(_t7, _t5); \
> + _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
> + _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
> + _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
> + _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
> + }
>
> /*
> * =============================================================================
> - * Description : Transpose 8x8 block with half word elements in vectors
> + * Description : Transpose 8x8 block with half-word elements in vectors
> * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
> * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
> * Details :
> @@ -445,37 +468,38 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77
> * =============================================================================
> */
> -#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
> - \
> - _s0 = __lsx_vilvl_h(_in6, _in4); \
> - _s1 = __lsx_vilvl_h(_in7, _in5); \
> - _t0 = __lsx_vilvl_h(_s1, _s0); \
> - _t1 = __lsx_vilvh_h(_s1, _s0); \
> - _s0 = __lsx_vilvh_h(_in6, _in4); \
> - _s1 = __lsx_vilvh_h(_in7, _in5); \
> - _t2 = __lsx_vilvl_h(_s1, _s0); \
> - _t3 = __lsx_vilvh_h(_s1, _s0); \
> - _s0 = __lsx_vilvl_h(_in2, _in0); \
> - _s1 = __lsx_vilvl_h(_in3, _in1); \
> - _t4 = __lsx_vilvl_h(_s1, _s0); \
> - _t5 = __lsx_vilvh_h(_s1, _s0); \
> - _s0 = __lsx_vilvh_h(_in2, _in0); \
> - _s1 = __lsx_vilvh_h(_in3, _in1); \
> - _t6 = __lsx_vilvl_h(_s1, _s0); \
> - _t7 = __lsx_vilvh_h(_s1, _s0); \
> - \
> - _out0 = __lsx_vpickev_d(_t0, _t4); \
> - _out2 = __lsx_vpickev_d(_t1, _t5); \
> - _out4 = __lsx_vpickev_d(_t2, _t6); \
> - _out6 = __lsx_vpickev_d(_t3, _t7); \
> - _out1 = __lsx_vpickod_d(_t0, _t4); \
> - _out3 = __lsx_vpickod_d(_t1, _t5); \
> - _out5 = __lsx_vpickod_d(_t2, _t6); \
> - _out7 = __lsx_vpickod_d(_t3, _t7); \
> -}
> +#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
> + \
> + _s0 = __lsx_vilvl_h(_in6, _in4); \
> + _s1 = __lsx_vilvl_h(_in7, _in5); \
> + _t0 = __lsx_vilvl_h(_s1, _s0); \
> + _t1 = __lsx_vilvh_h(_s1, _s0); \
> + _s0 = __lsx_vilvh_h(_in6, _in4); \
> + _s1 = __lsx_vilvh_h(_in7, _in5); \
> + _t2 = __lsx_vilvl_h(_s1, _s0); \
> + _t3 = __lsx_vilvh_h(_s1, _s0); \
> + _s0 = __lsx_vilvl_h(_in2, _in0); \
> + _s1 = __lsx_vilvl_h(_in3, _in1); \
> + _t4 = __lsx_vilvl_h(_s1, _s0); \
> + _t5 = __lsx_vilvh_h(_s1, _s0); \
> + _s0 = __lsx_vilvh_h(_in2, _in0); \
> + _s1 = __lsx_vilvh_h(_in3, _in1); \
> + _t6 = __lsx_vilvl_h(_s1, _s0); \
> + _t7 = __lsx_vilvh_h(_s1, _s0); \
> + \
> + _out0 = __lsx_vpickev_d(_t0, _t4); \
> + _out2 = __lsx_vpickev_d(_t1, _t5); \
> + _out4 = __lsx_vpickev_d(_t2, _t6); \
> + _out6 = __lsx_vpickev_d(_t3, _t7); \
> + _out1 = __lsx_vpickod_d(_t0, _t4); \
> + _out3 = __lsx_vpickod_d(_t1, _t5); \
> + _out5 = __lsx_vpickod_d(_t2, _t6); \
> + _out7 = __lsx_vpickod_d(_t3, _t7); \
> + }
>
> /*
> * =============================================================================
> @@ -483,7 +507,8 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block)
> * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
> * Return Type - as per RTYPE
> - * Details : The rows of the matrix become columns, and the columns become rows.
> + * Details : The rows of the matrix become columns, and the columns become
> + * rows.
> * Example : LSX_TRANSPOSE8x4_B
> * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
> * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
> @@ -500,26 +525,26 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
> * =============================================================================
> */
> -#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3) \
> -{ \
> - __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> - \
> - _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
> - _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
> - _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
> - _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
> - _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
> - \
> - _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
> - _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
> - _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
> - \
> - _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
> - _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
> - _out1 = __lsx_vilvh_d(_out2, _out0); \
> - _out3 = __lsx_vilvh_d(_out0, _out2); \
> -}
> +#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3) \
> + { \
> + __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> + \
> + _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
> + _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
> + _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
> + _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
> + _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
> + \
> + _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
> + _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
> + _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
> + \
> + _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
> + _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
> + _out1 = __lsx_vilvh_d(_out2, _out0); \
> + _out3 = __lsx_vilvh_d(_out0, _out2); \
> + }
>
> /*
> * =============================================================================
> @@ -547,29 +572,30 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * 120,121,122,123,124,125,126,127
> * =============================================================================
> */
> -#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8, \
> - _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
> - _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
> -{ \
> - __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
> - __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
> - DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
> - _tmp0, _tmp1, _tmp2, _tmp3); \
> - DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
> - _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
> - DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
> - DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
> - DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
> - DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
> - DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
> - DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
> - DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
> - DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
> - DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
> - DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
> - DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
> - DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
> -}
> +#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
> + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
> + _out6, _out7) \
> + { \
> + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
> + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
> + DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
> + _tmp0, _tmp1, _tmp2, _tmp3); \
> + DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
> + _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
> + DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
> + DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
> + DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
> + DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
> + DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
> + DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
> + DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
> + DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
> + DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
> + DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
> + DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
> + DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
> + }
>
> /*
> * =============================================================================
> @@ -585,33 +611,33 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * =============================================================================
> */
> #define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> + { \
> _out0 = __lsx_vadd_b(_in0, _in3); \
> _out1 = __lsx_vadd_b(_in1, _in2); \
> _out2 = __lsx_vsub_b(_in1, _in2); \
> _out3 = __lsx_vsub_b(_in0, _in3); \
> -}
> + }
> #define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> + { \
> _out0 = __lsx_vadd_h(_in0, _in3); \
> _out1 = __lsx_vadd_h(_in1, _in2); \
> _out2 = __lsx_vsub_h(_in1, _in2); \
> _out3 = __lsx_vsub_h(_in0, _in3); \
> -}
> + }
> #define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> + { \
> _out0 = __lsx_vadd_w(_in0, _in3); \
> _out1 = __lsx_vadd_w(_in1, _in2); \
> _out2 = __lsx_vsub_w(_in1, _in2); \
> _out3 = __lsx_vsub_w(_in0, _in3); \
> -}
> + }
> #define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> + { \
> _out0 = __lsx_vadd_d(_in0, _in3); \
> _out1 = __lsx_vadd_d(_in1, _in2); \
> _out2 = __lsx_vsub_d(_in1, _in2); \
> _out3 = __lsx_vsub_d(_in0, _in3); \
> -}
> + }
>
> /*
> * =============================================================================
> @@ -630,59 +656,63 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * _out7 = _in0 - _in7;
> * =============================================================================
> */
> -#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - _out0 = __lsx_vadd_b(_in0, _in7); \
> - _out1 = __lsx_vadd_b(_in1, _in6); \
> - _out2 = __lsx_vadd_b(_in2, _in5); \
> - _out3 = __lsx_vadd_b(_in3, _in4); \
> - _out4 = __lsx_vsub_b(_in3, _in4); \
> - _out5 = __lsx_vsub_b(_in2, _in5); \
> - _out6 = __lsx_vsub_b(_in1, _in6); \
> - _out7 = __lsx_vsub_b(_in0, _in7); \
> -}
> -
> -#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - _out0 = __lsx_vadd_h(_in0, _in7); \
> - _out1 = __lsx_vadd_h(_in1, _in6); \
> - _out2 = __lsx_vadd_h(_in2, _in5); \
> - _out3 = __lsx_vadd_h(_in3, _in4); \
> - _out4 = __lsx_vsub_h(_in3, _in4); \
> - _out5 = __lsx_vsub_h(_in2, _in5); \
> - _out6 = __lsx_vsub_h(_in1, _in6); \
> - _out7 = __lsx_vsub_h(_in0, _in7); \
> -}
> -
> -#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - _out0 = __lsx_vadd_w(_in0, _in7); \
> - _out1 = __lsx_vadd_w(_in1, _in6); \
> - _out2 = __lsx_vadd_w(_in2, _in5); \
> - _out3 = __lsx_vadd_w(_in3, _in4); \
> - _out4 = __lsx_vsub_w(_in3, _in4); \
> - _out5 = __lsx_vsub_w(_in2, _in5); \
> - _out6 = __lsx_vsub_w(_in1, _in6); \
> - _out7 = __lsx_vsub_w(_in0, _in7); \
> -}
> -
> -#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - _out0 = __lsx_vadd_d(_in0, _in7); \
> - _out1 = __lsx_vadd_d(_in1, _in6); \
> - _out2 = __lsx_vadd_d(_in2, _in5); \
> - _out3 = __lsx_vadd_d(_in3, _in4); \
> - _out4 = __lsx_vsub_d(_in3, _in4); \
> - _out5 = __lsx_vsub_d(_in2, _in5); \
> - _out6 = __lsx_vsub_d(_in1, _in6); \
> - _out7 = __lsx_vsub_d(_in0, _in7); \
> -}
> -
> -#endif //LSX
> +#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + _out0 = __lsx_vadd_b(_in0, _in7); \
> + _out1 = __lsx_vadd_b(_in1, _in6); \
> + _out2 = __lsx_vadd_b(_in2, _in5); \
> + _out3 = __lsx_vadd_b(_in3, _in4); \
> + _out4 = __lsx_vsub_b(_in3, _in4); \
> + _out5 = __lsx_vsub_b(_in2, _in5); \
> + _out6 = __lsx_vsub_b(_in1, _in6); \
> + _out7 = __lsx_vsub_b(_in0, _in7); \
> + }
> +
> +#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + _out0 = __lsx_vadd_h(_in0, _in7); \
> + _out1 = __lsx_vadd_h(_in1, _in6); \
> + _out2 = __lsx_vadd_h(_in2, _in5); \
> + _out3 = __lsx_vadd_h(_in3, _in4); \
> + _out4 = __lsx_vsub_h(_in3, _in4); \
> + _out5 = __lsx_vsub_h(_in2, _in5); \
> + _out6 = __lsx_vsub_h(_in1, _in6); \
> + _out7 = __lsx_vsub_h(_in0, _in7); \
> + }
> +
> +#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + _out0 = __lsx_vadd_w(_in0, _in7); \
> + _out1 = __lsx_vadd_w(_in1, _in6); \
> + _out2 = __lsx_vadd_w(_in2, _in5); \
> + _out3 = __lsx_vadd_w(_in3, _in4); \
> + _out4 = __lsx_vsub_w(_in3, _in4); \
> + _out5 = __lsx_vsub_w(_in2, _in5); \
> + _out6 = __lsx_vsub_w(_in1, _in6); \
> + _out7 = __lsx_vsub_w(_in0, _in7); \
> + }
> +
> +#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + _out0 = __lsx_vadd_d(_in0, _in7); \
> + _out1 = __lsx_vadd_d(_in1, _in6); \
> + _out2 = __lsx_vadd_d(_in2, _in5); \
> + _out3 = __lsx_vadd_d(_in3, _in4); \
> + _out4 = __lsx_vsub_d(_in3, _in4); \
> + _out5 = __lsx_vsub_d(_in2, _in5); \
> + _out6 = __lsx_vsub_d(_in1, _in6); \
> + _out7 = __lsx_vsub_d(_in0, _in7); \
> + }
> +
> +#endif // LSX
>
> #ifdef __loongarch_asx
> #include <lasxintrin.h>
> @@ -700,13 +730,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
> * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmulwev_h_bu(in_h, in_l);
> - out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmulwev_h_bu(in_h, in_l);
> + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -718,18 +747,17 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
> * Details : Signed byte elements from in_h are multiplied with
> * signed byte elements from in_l producing a result
> * twice the size of input i.e. signed halfword.
> - * Then this iniplication results of adjacent odd-even elements
> + * Then this multiplication results of adjacent odd-even elements
> * are added to the out vector
> * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmulwev_h_b(in_h, in_l);
> - out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmulwev_h_b(in_h, in_l);
> + out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -749,13 +777,12 @@ static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
> * out : 22,38,38,22, 22,38,38,22
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmulwev_w_h(in_h, in_l);
> - out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmulwev_w_h(in_h, in_l);
> + out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -763,22 +790,21 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
> * Description : Dot product of word vector elements
> * Arguments : Inputs - in_h, in_l
> * Output - out
> - * Retrun Type - signed double
> + * Return Type - signed double
> * Details : Signed word elements from in_h are multiplied with
> * signed word elements from in_l producing a result
> - * twice the size of input i.e. signed double word.
> + * twice the size of input i.e. signed double-word.
> * Then this multiplied results of adjacent odd-even elements
> * are added to the out vector.
> * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmulwev_d_w(in_h, in_l);
> - out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmulwev_d_w(in_h, in_l);
> + out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -795,13 +821,12 @@ static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
> * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
> - out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
> + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -809,7 +834,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
> * Description : Dot product & addition of byte vector elements
> * Arguments : Inputs - in_h, in_l
> * Output - out
> - * Retrun Type - halfword
> + * Return Type - halfword
> * Details : Signed byte elements from in_h are multiplied with
> * signed byte elements from in_l producing a result
> * twice the size of input i.e. signed halfword.
> @@ -818,13 +843,59 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
> * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i out;
> +
> + out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
> + out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
> + return out;
> +}
> +
> +/*
> + * =============================================================================
> + * Description : Dot product & addition of byte vector elements
> + * Arguments : Inputs - in_h, in_l
> + * Output - out
> + * Return Type - halfword
> + * Details : Unsigned byte elements from in_h are multiplied with
> + * unsigned byte elements from in_l producing a result
> + * twice the size of input i.e. signed halfword.
> + * Then this multiplied results of adjacent odd-even elements
> + * are added to the in_c vector.
> + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
> + * =============================================================================
> + */
> +static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i out;
> +
> + out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
> + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
> + return out;
> +}
> +
> +/*
> + * =============================================================================
> + * Description : Dot product & addition of byte vector elements
> + * Arguments : Inputs - in_h, in_l
> + * Output - out
> + * Return Type - halfword
> + * Details : Unsigned byte elements from in_h are multiplied with
> + * signed byte elements from in_l producing a result
> + * twice the size of input i.e. signed halfword.
> + * Then this multiplied results of adjacent odd-even elements
> + * are added to the in_c vector.
> + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
> + * =============================================================================
> + */
> +static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
> - out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
> + out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -845,13 +916,13 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_
> * out : 23,40,41,26, 23,40,41,26
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
> - out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
> + out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -868,13 +939,13 @@ static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in
> * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
> - out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
> + out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -891,13 +962,13 @@ static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i i
> * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
> - out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
> - return out;
> + out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
> + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
> + return out;
> }
>
> /*
> @@ -915,14 +986,14 @@ static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i
> * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> -
> - out = __lasx_xvmulwev_h_bu(in_h, in_l);
> - out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
> - out = __lasx_xvsub_h(in_c, out);
> - return out;
> +static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i out;
> +
> + out = __lasx_xvmulwev_h_bu(in_h, in_l);
> + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
> + out = __lasx_xvsub_h(in_c, out);
> + return out;
> }
>
> /*
> @@ -944,14 +1015,14 @@ static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i i
> * out : -7,-3,0,0, 0,-1,0,-1
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> -
> - out = __lasx_xvmulwev_w_h(in_h, in_l);
> - out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
> - out = __lasx_xvsub_w(in_c, out);
> - return out;
> +static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i out;
> +
> + out = __lasx_xvmulwev_w_h(in_h, in_l);
> + out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
> + out = __lasx_xvsub_w(in_c, out);
> + return out;
> }
>
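The subtract variant only flips the sign of the accumulated dot product; a matching scalar sketch (again reference code only, not proposed for the header):

#include <stdint.h>

/* out[i] = in_c[i] - (in_h[2*i]*in_l[2*i] + in_h[2*i+1]*in_l[2*i+1]) */
static void dp2sub_w_h_ref(int32_t out[8], const int32_t in_c[8],
                           const int16_t in_h[16], const int16_t in_l[16])
{
    for (int i = 0; i < 8; i++)
        out[i] = in_c[i] - ((int32_t)in_h[2 * i] * in_l[2 * i]
                          + (int32_t)in_h[2 * i + 1] * in_l[2 * i + 1]);
}
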
> /*
> @@ -960,10 +1031,10 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
> * Arguments : Inputs - in_h, in_l
> * Output - out
> * Return Type - signed word
> - * Details : Signed halfword elements from in_h are iniplied with
> + * Details : Signed halfword elements from in_h are multiplied with
> * signed halfword elements from in_l producing a result
> * four times the size of input i.e. signed doubleword.
> - * Then this iniplication results of four adjacent elements
> + * Then this multiplication results of four adjacent elements
> * are added together and stored to the out vector.
> * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
> * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
> @@ -971,14 +1042,13 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
> * out : -2,0,1,1
> * =============================================================================
> */
> -static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> -
> - out = __lasx_xvmulwev_w_h(in_h, in_l);
> - out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
> - out = __lasx_xvhaddw_d_w(out, out);
> - return out;
> +static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
> + __m256i out;
> +
> + out = __lasx_xvmulwev_w_h(in_h, in_l);
> + out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
> + out = __lasx_xvhaddw_d_w(out, out);
> + return out;
> }
>
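For the four-way dot product, each signed doubleword collects four adjacent halfword products. A scalar reading of that behaviour (names and array sizes again illustrative):

#include <stdint.h>

/* out[i] = sum over k = 0..3 of in_h[4*i+k] * in_l[4*i+k] */
static void dp4_d_h_ref(int64_t out[4],
                        const int16_t in_h[16], const int16_t in_l[16])
{
    for (int i = 0; i < 4; i++) {
        int64_t acc = 0;
        for (int k = 0; k < 4; k++)
            acc += (int64_t)in_h[4 * i + k] * in_l[4 * i + k];
        out[i] = acc;
    }
}
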
> /*
> @@ -993,13 +1063,12 @@ static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
> * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvilvh_b(in_h, in_l);
> - out = __lasx_xvhaddw_h_b(out, out);
> - return out;
> + out = __lasx_xvilvh_b(in_h, in_l);
> + out = __lasx_xvhaddw_h_b(out, out);
> + return out;
> }
>
> /*
> @@ -1017,13 +1086,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
> * out : 1,0,0,-1, 1,0,0, 2
> * =============================================================================
> */
> - static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvilvh_h(in_h, in_l);
> - out = __lasx_xvhaddw_w_h(out, out);
> - return out;
> + out = __lasx_xvilvh_h(in_h, in_l);
> + out = __lasx_xvhaddw_w_h(out, out);
> + return out;
> }
>
> /*
> @@ -1038,13 +1106,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
> * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvilvl_b(in_h, in_l);
> - out = __lasx_xvhaddw_h_b(out, out);
> - return out;
> + out = __lasx_xvilvl_b(in_h, in_l);
> + out = __lasx_xvhaddw_h_b(out, out);
> + return out;
> }
>
> /*
> @@ -1062,13 +1129,12 @@ static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
> * out : 5,-1,4,2, 1,0,2,-1
> * =============================================================================
> */
> -static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvilvl_h(in_h, in_l);
> - out = __lasx_xvhaddw_w_h(out, out);
> - return out;
> + out = __lasx_xvilvl_h(in_h, in_l);
> + out = __lasx_xvhaddw_w_h(out, out);
> + return out;
> }
>
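One subtlety worth spelling out: the interleave-then-horizontal-add sequence works per 128-bit lane, so each half of the register contributes its own low four halfwords. If I read it correctly, the element-wise behaviour is the following (scalar sketch only, the helper is not in the patch):

#include <stdint.h>

/* out[0..3] come from the low lane, out[4..7] from the high lane */
static void addwl_w_h_ref(int32_t out[8],
                          const int16_t in_h[16], const int16_t in_l[16])
{
    for (int lane = 0; lane < 2; lane++)
        for (int i = 0; i < 4; i++)
            out[lane * 4 + i] = (int32_t)in_h[lane * 8 + i]
                              + (int32_t)in_l[lane * 8 + i];
}
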
> /*
> @@ -1083,13 +1149,12 @@ static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
> * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvilvl_b(in_h, in_l);
> - out = __lasx_xvhaddw_hu_bu(out, out);
> - return out;
> + out = __lasx_xvilvl_b(in_h, in_l);
> + out = __lasx_xvhaddw_hu_bu(out, out);
> + return out;
> }
>
> /*
> @@ -1103,13 +1168,12 @@ static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
> * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvsllwil_hu_bu(in_l, 0);
> - out = __lasx_xvadd_h(in_h, out);
> - return out;
> + out = __lasx_xvsllwil_hu_bu(in_l, 0);
> + out = __lasx_xvadd_h(in_h, out);
> + return out;
> }
>
> /*
> @@ -1126,13 +1190,12 @@ static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
> * out : 2, 0,1,2, -1,0,1,1,
> * =============================================================================
> */
> -static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
> + __m256i out;
>
> - out = __lasx_xvsllwil_w_h(in_l, 0);
> - out = __lasx_xvadd_w(in_h, out);
> - return out;
> + out = __lasx_xvsllwil_w_h(in_l, 0);
> + out = __lasx_xvadd_w(in_h, out);
> + return out;
> }
>
> /*
> @@ -1153,15 +1216,15 @@ static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
> * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
> * =============================================================================
> */
> -static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
> -{
> - __m256i tmp0, tmp1, out;
> -
> - tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
> - tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
> - tmp0 = __lasx_xvmul_w(tmp0, tmp1);
> - out = __lasx_xvadd_w(tmp0, in_c);
> - return out;
> +static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i tmp0, tmp1, out;
> +
> + tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
> + tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
> + tmp0 = __lasx_xvmul_w(tmp0, tmp1);
> + out = __lasx_xvadd_w(tmp0, in_c);
> + return out;
> }
>
> /*
> @@ -1177,15 +1240,15 @@ static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in
> * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
> -{
> - __m256i tmp0, tmp1, out;
> -
> - tmp0 = __lasx_xvilvh_h(in_h, in_h);
> - tmp1 = __lasx_xvilvh_h(in_l, in_l);
> - tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
> - out = __lasx_xvadd_w(tmp0, in_c);
> - return out;
> +static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
> + __m256i in_l) {
> + __m256i tmp0, tmp1, out;
> +
> + tmp0 = __lasx_xvilvh_h(in_h, in_h);
> + tmp1 = __lasx_xvilvh_h(in_l, in_l);
> + tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
> + out = __lasx_xvadd_w(tmp0, in_c);
> + return out;
> }
>
> /*
> @@ -1203,14 +1266,13 @@ static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in
> * out : 6,1,3,0, 0,0,1,0
> * =============================================================================
> */
> -static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
> -{
> - __m256i tmp0, tmp1, out;
> -
> - tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
> - tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
> - out = __lasx_xvmul_w(tmp0, tmp1);
> - return out;
> +static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
> + __m256i tmp0, tmp1, out;
> +
> + tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
> + tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
> + out = __lasx_xvmul_w(tmp0, tmp1);
> + return out;
> }
>
> /*
> @@ -1228,39 +1290,39 @@ static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
> * out : 0,0,0,0, 0,0,0,1
> * =============================================================================
> */
> -static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
> -{
> - __m256i tmp0, tmp1, out;
> -
> - tmp0 = __lasx_xvilvh_h(in_h, in_h);
> - tmp1 = __lasx_xvilvh_h(in_l, in_l);
> - out = __lasx_xvmulwev_w_h(tmp0, tmp1);
> - return out;
> +static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
> + __m256i tmp0, tmp1, out;
> +
> + tmp0 = __lasx_xvilvh_h(in_h, in_h);
> + tmp1 = __lasx_xvilvh_h(in_l, in_l);
> + out = __lasx_xvmulwev_w_h(tmp0, tmp1);
> + return out;
> }
>
> /*
> * =============================================================================
> - * Description : The low half of the vector elements are expanded and
> - * added saturately after being doubled.
> + * Description : The low half of the vector elements are added to the high half
> + * after being doubled, then saturated.
> * Arguments : Inputs - in_h, in_l
> * Output - out
> - * Details : The in_h vector adds the in_l vector saturately after the lower
> - * half of the two-fold zero extension (unsigned byte to unsigned
> - * halfword) and the results are stored to the out vector.
> + * Details : The in_h vector adds the in_l vector after the lower half of
> + * the two-fold zero extension (unsigned byte to unsigned
> + * halfword) and then saturated. The results are stored to the out
> + * vector.
> * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
> * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
> - * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
> - * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
> + * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
> + * 0,0,0,1
> + * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
> * =============================================================================
> */
> -static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
> -{
> - __m256i tmp1, out;
> - __m256i zero = {0};
> -
> - tmp1 = __lasx_xvilvl_b(zero, in_l);
> - out = __lasx_xvsadd_hu(in_h, tmp1);
> - return out;
> +static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
> + __m256i tmp1, out;
> + __m256i zero = { 0 };
> +
> + tmp1 = __lasx_xvilvl_b(zero, in_l);
> + out = __lasx_xvsadd_hu(in_h, tmp1);
> + return out;
> }
>
> /*
> @@ -1279,13 +1341,12 @@ static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
> * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
> * =============================================================================
> */
> -static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
> + __m256i out;
>
> - out = __lasx_xvmax_h(min, in);
> - out = __lasx_xvmin_h(max, out);
> - return out;
> + out = __lasx_xvmax_h(min, in);
> + out = __lasx_xvmin_h(max, out);
> + return out;
> }
>
> /*
> @@ -1298,13 +1359,12 @@ static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
> * Example : See out = __lasx_xvclip255_w(in)
> * =============================================================================
> */
> -static inline __m256i __lasx_xvclip255_h(__m256i in)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvclip255_h(__m256i in) {
> + __m256i out;
>
> - out = __lasx_xvmaxi_h(in, 0);
> - out = __lasx_xvsat_hu(out, 7);
> - return out;
> + out = __lasx_xvmaxi_h(in, 0);
> + out = __lasx_xvsat_hu(out, 7);
> + return out;
> }
>
> /*
> @@ -1319,20 +1379,19 @@ static inline __m256i __lasx_xvclip255_h(__m256i in)
> * out : 0,255,255,249, 0,255,255,249
> * =============================================================================
> */
> -static inline __m256i __lasx_xvclip255_w(__m256i in)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvclip255_w(__m256i in) {
> + __m256i out;
>
> - out = __lasx_xvmaxi_w(in, 0);
> - out = __lasx_xvsat_wu(out, 7);
> - return out;
> + out = __lasx_xvmaxi_w(in, 0);
> + out = __lasx_xvsat_wu(out, 7);
> + return out;
> }
>
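A plain-C restatement of the clamp, for reference (the name clip255_w_ref is made up here):

#include <stdint.h>

/* out[i] = in[i] clamped to [0, 255] */
static void clip255_w_ref(int32_t out[8], const int32_t in[8])
{
    for (int i = 0; i < 8; i++)
        out[i] = in[i] < 0 ? 0 : (in[i] > 255 ? 255 : in[i]);
}
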
> /*
> * =============================================================================
> * Description : Indexed halfword element values are replicated to all
> - * elements in output vector. If 'indx < 8' use xvsplati_l_*,
> - * if 'indx >= 8' use xvsplati_h_*.
> + * elements in output vector. If 'idx < 8' use xvsplati_l_*,
> + * if 'idx >= 8' use xvsplati_h_*.
> * Arguments : Inputs - in, idx
> * Output - out
> * Details : Idx element value from in vector is replicated to all
> @@ -1344,20 +1403,19 @@ static inline __m256i __lasx_xvclip255_w(__m256i in)
> * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
> * =============================================================================
> */
> -static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
> + __m256i out;
>
> - out = __lasx_xvpermi_q(in, in, 0x02);
> - out = __lasx_xvreplve_h(out, idx);
> - return out;
> + out = __lasx_xvpermi_q(in, in, 0x02);
> + out = __lasx_xvreplve_h(out, idx);
> + return out;
> }
>
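Behaviourally this is just a splat of one halfword, with xvsplati_h_h covering indices 8..15; a scalar sketch, assuming idx stays in 0..7 as the comment requires:

#include <stdint.h>

/* every output halfword receives in[idx] */
static void splati_l_h_ref(int16_t out[16], const int16_t in[16], int idx)
{
    for (int i = 0; i < 16; i++)
        out[i] = in[idx];
}
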
> /*
> * =============================================================================
> * Description : Indexed halfword element values are replicated to all
> - * elements in output vector. If 'indx < 8' use xvsplati_l_*,
> - * if 'indx >= 8' use xvsplati_h_*.
> + * elements in output vector. If 'idx < 8' use xvsplati_l_*,
> + * if 'idx >= 8' use xvsplati_h_*.
> * Arguments : Inputs - in, idx
> * Output - out
> * Details : Idx element value from in vector is replicated to all
> @@ -1369,18 +1427,17 @@ static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
> * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
> * =============================================================================
> */
> -static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> -{
> - __m256i out;
> +static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
> + __m256i out;
>
> - out = __lasx_xvpermi_q(in, in, 0x13);
> - out = __lasx_xvreplve_h(out, idx);
> - return out;
> + out = __lasx_xvpermi_q(in, in, 0x13);
> + out = __lasx_xvreplve_h(out, idx);
> + return out;
> }
>
> /*
> * =============================================================================
> - * Description : Transpose 4x4 block with double word elements in vectors
> + * Description : Transpose 4x4 block with double-word elements in vectors
> * Arguments : Inputs - _in0, _in1, _in2, _in3
> * Outputs - _out0, _out1, _out2, _out3
> * Example : LASX_TRANSPOSE4x4_D
> @@ -1395,24 +1452,26 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * _out3 : 4,4,4,4
> * =============================================================================
> */
> -#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> - __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
> - _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
> - _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
> - _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
> - _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
> - _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
> - _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
> - _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
> - _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
> -}
> +#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
> + _out3) \
> + { \
> + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
> + _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
> + _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
> + _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
> + _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
> + _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
> + _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
> + _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
> + _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
> + }
>
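Since each __m256i argument here holds one row of four doublewords, the macro amounts to the usual 4x4 transpose; a scalar reference (arrays and name are illustrative only):

#include <stdint.h>

/* out[c][r] = in[r][c] for a 4x4 block of 64-bit elements */
static void transpose4x4_d_ref(int64_t out[4][4], const int64_t in[4][4])
{
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            out[c][r] = in[r][c];
}
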
> /*
> * =============================================================================
> * Description : Transpose 8x8 block with word elements in vectors
> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
> + * _out7
> * Example : LASX_TRANSPOSE8x8_W
> * _in0 : 1,2,3,4,5,6,7,8
> * _in1 : 2,2,3,4,5,6,7,8
> @@ -1433,38 +1492,39 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * _out7 : 8,8,8,8,8,8,8,8
> * =============================================================================
> */
> -#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
> -{ \
> - __m256i _s0_m, _s1_m; \
> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> - \
> - _s0_m = __lasx_xvilvl_w(_in2, _in0); \
> - _s1_m = __lasx_xvilvl_w(_in3, _in1); \
> - _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
> - _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
> - _s0_m = __lasx_xvilvh_w(_in2, _in0); \
> - _s1_m = __lasx_xvilvh_w(_in3, _in1); \
> - _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
> - _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
> - _s0_m = __lasx_xvilvl_w(_in6, _in4); \
> - _s1_m = __lasx_xvilvl_w(_in7, _in5); \
> - _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
> - _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
> - _s0_m = __lasx_xvilvh_w(_in6, _in4); \
> - _s1_m = __lasx_xvilvh_w(_in7, _in5); \
> - _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
> - _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
> - _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
> - _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
> - _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
> - _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
> - _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
> - _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
> - _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
> - _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
> -}
> +#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + __m256i _s0_m, _s1_m; \
> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> + \
> + _s0_m = __lasx_xvilvl_w(_in2, _in0); \
> + _s1_m = __lasx_xvilvl_w(_in3, _in1); \
> + _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
> + _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
> + _s0_m = __lasx_xvilvh_w(_in2, _in0); \
> + _s1_m = __lasx_xvilvh_w(_in3, _in1); \
> + _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
> + _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
> + _s0_m = __lasx_xvilvl_w(_in6, _in4); \
> + _s1_m = __lasx_xvilvl_w(_in7, _in5); \
> + _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
> + _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
> + _s0_m = __lasx_xvilvh_w(_in6, _in4); \
> + _s1_m = __lasx_xvilvh_w(_in7, _in5); \
> + _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
> + _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
> + _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
> + _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
> + _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
> + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
> + _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
> + _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
> + _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
> + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
> + }
>
> /*
> * =============================================================================
> @@ -1472,52 +1532,54 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
> * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
> * (input 16x8 byte block)
> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
> - * (output 8x16 byte block)
> - * Details : The rows of the matrix become columns, and the columns become rows.
> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
> + * _out7 (output 8x16 byte block)
> + * Details : The rows of the matrix become columns, and the columns become
> + * rows.
> * Example : See LASX_TRANSPOSE16x8_H
> * =============================================================================
> */
> -#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
> -{ \
> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> - \
> - _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
> - _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
> - _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
> - _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
> - _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
> - _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
> - _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
> - _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
> - _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
> - _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
> - _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
> - _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
> - _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
> - _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
> - _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
> - _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
> - _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
> - _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
> - _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
> - _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
> - _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
> - _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
> - _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
> - _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
> - _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
> - _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
> - _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
> - _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
> - _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
> - _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
> - _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
> - _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
> -}
> +#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
> + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
> + _out6, _out7) \
> + { \
> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> + \
> + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
> + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
> + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
> + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
> + _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
> + _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
> + _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
> + _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
> + _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
> + _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
> + _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
> + _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
> + _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
> + _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
> + _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
> + _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
> + _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
> + _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
> + _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
> + _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
> + _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
> + _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
> + _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
> + _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
> + _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
> + _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
> + _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
> + _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
> + _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
> + _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
> + _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
> + _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
> + }
>
> /*
> * =============================================================================
> @@ -1525,9 +1587,10 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
> * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
> * (input 16x8 byte block)
> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
> - * (output 8x16 byte block)
> - * Details : The rows of the matrix become columns, and the columns become rows.
> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
> + * _out7 (output 8x16 byte block)
> + * Details : The rows of the matrix become columns, and the columns become
> + * rows.
> * Example : LASX_TRANSPOSE16x8_H
> * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
> * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
> @@ -1556,72 +1619,73 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
> * =============================================================================
> */
> -#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
> - { \
> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> - __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
> - \
> - _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
> - _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
> - _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
> - _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
> - _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
> - _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
> - _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
> - _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
> - _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
> - _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
> - _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
> - _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
> - _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
> - _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
> - _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
> - _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
> - _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
> - _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
> - _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
> - _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
> - _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
> - _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
> - _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
> - _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
> - _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
> - _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
> - _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
> - _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
> - \
> - _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
> - _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
> - _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
> - _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
> - _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
> - _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
> - _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
> - _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
> - _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
> - _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
> - _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
> - _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
> - _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
> - _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
> - _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
> - _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
> - _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
> - _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
> - _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
> - _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
> - _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
> - _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
> - _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
> - _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
> - _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
> - _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
> - _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
> - _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
> -}
> +#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
> + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
> + _out6, _out7) \
> + { \
> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> + __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
> + \
> + _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
> + _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
> + _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
> + _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
> + _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
> + _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
> + _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
> + _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
> + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
> + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
> + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
> + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
> + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
> + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
> + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
> + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
> + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
> + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
> + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
> + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
> + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
> + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
> + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
> + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
> + _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
> + _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
> + _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
> + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
> + \
> + _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
> + _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
> + _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
> + _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
> + _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
> + _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
> + _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
> + _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
> + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
> + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
> + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
> + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
> + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
> + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
> + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
> + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
> + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
> + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
> + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
> + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
> + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
> + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
> + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
> + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
> + _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
> + _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
> + _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
> + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
> + }
>
> /*
> * =============================================================================
> @@ -1629,61 +1693,65 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * Arguments : Inputs - _in0, _in1, _in2, _in3
> * Outputs - _out0, _out1, _out2, _out3
> * Return Type - signed halfword
> - * Details : The rows of the matrix become columns, and the columns become rows.
> + * Details : The rows of the matrix become columns, and the columns become
> + * rows.
> * Example : See LASX_TRANSPOSE8x8_H
> * =============================================================================
> */
> -#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> - __m256i _s0_m, _s1_m; \
> - \
> - _s0_m = __lasx_xvilvl_h(_in1, _in0); \
> - _s1_m = __lasx_xvilvl_h(_in3, _in2); \
> - _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
> - _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
> - _out1 = __lasx_xvilvh_d(_out0, _out0); \
> - _out3 = __lasx_xvilvh_d(_out2, _out2); \
> -}
> +#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
> + _out3) \
> + { \
> + __m256i _s0_m, _s1_m; \
> + \
> + _s0_m = __lasx_xvilvl_h(_in1, _in0); \
> + _s1_m = __lasx_xvilvl_h(_in3, _in2); \
> + _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
> + _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
> + _out1 = __lasx_xvilvh_d(_out0, _out0); \
> + _out3 = __lasx_xvilvh_d(_out2, _out2); \
> + }
>
> /*
> * =============================================================================
> * Description : Transpose input 8x8 byte block
> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
> * (input 8x8 byte block)
> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
> - * (output 8x8 byte block)
> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
> + * _out7 (output 8x8 byte block)
> * Example : See LASX_TRANSPOSE8x8_H
> * =============================================================================
> */
> -#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
> - _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
> -{ \
> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> - _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
> - _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
> - _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
> - _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
> - _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
> - _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
> - _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
> - _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
> - _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
> - _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
> - _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
> - _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
> - _out1 = __lasx_xvbsrl_v(_out0, 8); \
> - _out3 = __lasx_xvbsrl_v(_out2, 8); \
> - _out5 = __lasx_xvbsrl_v(_out4, 8); \
> - _out7 = __lasx_xvbsrl_v(_out6, 8); \
> -}
> +#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
> + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
> + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
> + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
> + _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
> + _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
> + _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
> + _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
> + _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
> + _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
> + _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
> + _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
> + _out1 = __lasx_xvbsrl_v(_out0, 8); \
> + _out3 = __lasx_xvbsrl_v(_out2, 8); \
> + _out5 = __lasx_xvbsrl_v(_out4, 8); \
> + _out7 = __lasx_xvbsrl_v(_out6, 8); \
> + }
>
> /*
> * =============================================================================
> * Description : Transpose 8x8 block with halfword elements in vectors.
> * Arguments : Inputs - _in0, _in1, ~
> * Outputs - _out0, _out1, ~
> - * Details : The rows of the matrix become columns, and the columns become rows.
> + * Details : The rows of the matrix become columns, and the columns become
> + * rows.
> * Example : LASX_TRANSPOSE8x8_H
> * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
> * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
> @@ -1704,40 +1772,41 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
> * =============================================================================
> */
> -#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
> - _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
> -{ \
> - __m256i _s0_m, _s1_m; \
> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> - \
> - _s0_m = __lasx_xvilvl_h(_in6, _in4); \
> - _s1_m = __lasx_xvilvl_h(_in7, _in5); \
> - _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
> - _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
> - _s0_m = __lasx_xvilvh_h(_in6, _in4); \
> - _s1_m = __lasx_xvilvh_h(_in7, _in5); \
> - _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
> - _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
> - \
> - _s0_m = __lasx_xvilvl_h(_in2, _in0); \
> - _s1_m = __lasx_xvilvl_h(_in3, _in1); \
> - _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
> - _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
> - _s0_m = __lasx_xvilvh_h(_in2, _in0); \
> - _s1_m = __lasx_xvilvh_h(_in3, _in1); \
> - _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
> - _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
> - \
> - _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
> - _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
> - _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
> - _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
> - _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
> - _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
> - _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
> - _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
> -}
> +#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + __m256i _s0_m, _s1_m; \
> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
> + \
> + _s0_m = __lasx_xvilvl_h(_in6, _in4); \
> + _s1_m = __lasx_xvilvl_h(_in7, _in5); \
> + _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
> + _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
> + _s0_m = __lasx_xvilvh_h(_in6, _in4); \
> + _s1_m = __lasx_xvilvh_h(_in7, _in5); \
> + _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
> + _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
> + \
> + _s0_m = __lasx_xvilvl_h(_in2, _in0); \
> + _s1_m = __lasx_xvilvl_h(_in3, _in1); \
> + _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
> + _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
> + _s0_m = __lasx_xvilvh_h(_in2, _in0); \
> + _s1_m = __lasx_xvilvh_h(_in3, _in1); \
> + _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
> + _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
> + \
> + _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
> + _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
> + _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
> + _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
> + _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
> + _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
> + _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
> + _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
> + }
>
> /*
> * =============================================================================
> @@ -1752,34 +1821,34 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * _out3 = _in0 - _in3;
> * =============================================================================
> */
> -#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> - _out0 = __lasx_xvadd_b(_in0, _in3); \
> - _out1 = __lasx_xvadd_b(_in1, _in2); \
> - _out2 = __lasx_xvsub_b(_in1, _in2); \
> - _out3 = __lasx_xvsub_b(_in0, _in3); \
> -}
> -#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> - _out0 = __lasx_xvadd_h(_in0, _in3); \
> - _out1 = __lasx_xvadd_h(_in1, _in2); \
> - _out2 = __lasx_xvsub_h(_in1, _in2); \
> - _out3 = __lasx_xvsub_h(_in0, _in3); \
> -}
> -#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> - _out0 = __lasx_xvadd_w(_in0, _in3); \
> - _out1 = __lasx_xvadd_w(_in1, _in2); \
> - _out2 = __lasx_xvsub_w(_in1, _in2); \
> - _out3 = __lasx_xvsub_w(_in0, _in3); \
> -}
> -#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> -{ \
> - _out0 = __lasx_xvadd_d(_in0, _in3); \
> - _out1 = __lasx_xvadd_d(_in1, _in2); \
> - _out2 = __lasx_xvsub_d(_in1, _in2); \
> - _out3 = __lasx_xvsub_d(_in0, _in3); \
> -}
> +#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> + { \
> + _out0 = __lasx_xvadd_b(_in0, _in3); \
> + _out1 = __lasx_xvadd_b(_in1, _in2); \
> + _out2 = __lasx_xvsub_b(_in1, _in2); \
> + _out3 = __lasx_xvsub_b(_in0, _in3); \
> + }
> +#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> + { \
> + _out0 = __lasx_xvadd_h(_in0, _in3); \
> + _out1 = __lasx_xvadd_h(_in1, _in2); \
> + _out2 = __lasx_xvsub_h(_in1, _in2); \
> + _out3 = __lasx_xvsub_h(_in0, _in3); \
> + }
> +#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> + { \
> + _out0 = __lasx_xvadd_w(_in0, _in3); \
> + _out1 = __lasx_xvadd_w(_in1, _in2); \
> + _out2 = __lasx_xvsub_w(_in1, _in2); \
> + _out3 = __lasx_xvsub_w(_in0, _in3); \
> + }
> +#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
> + { \
> + _out0 = __lasx_xvadd_d(_in0, _in3); \
> + _out1 = __lasx_xvadd_d(_in1, _in2); \
> + _out2 = __lasx_xvsub_d(_in1, _in2); \
> + _out3 = __lasx_xvsub_d(_in0, _in3); \
> + }
>
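The butterfly macros apply the same add/sub pattern element-wise at each suffix width; a scalar sketch of the word variant (reference code only, not proposed for the header):

#include <stdint.h>

/* out0 = in0 + in3, out1 = in1 + in2, out2 = in1 - in2, out3 = in0 - in3 */
static void butterfly_4_w_ref(int32_t out[4][8], const int32_t in[4][8])
{
    for (int i = 0; i < 8; i++) {
        out[0][i] = in[0][i] + in[3][i];
        out[1][i] = in[1][i] + in[2][i];
        out[2][i] = in[1][i] - in[2][i];
        out[3][i] = in[0][i] - in[3][i];
    }
}
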
> /*
> * =============================================================================
> @@ -1798,59 +1867,63 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * _out7 = _in0 - _in7;
> * =============================================================================
> */
> -#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - _out0 = __lasx_xvadd_b(_in0, _in7); \
> - _out1 = __lasx_xvadd_b(_in1, _in6); \
> - _out2 = __lasx_xvadd_b(_in2, _in5); \
> - _out3 = __lasx_xvadd_b(_in3, _in4); \
> - _out4 = __lasx_xvsub_b(_in3, _in4); \
> - _out5 = __lasx_xvsub_b(_in2, _in5); \
> - _out6 = __lasx_xvsub_b(_in1, _in6); \
> - _out7 = __lasx_xvsub_b(_in0, _in7); \
> -}
> -
> -#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - _out0 = __lasx_xvadd_h(_in0, _in7); \
> - _out1 = __lasx_xvadd_h(_in1, _in6); \
> - _out2 = __lasx_xvadd_h(_in2, _in5); \
> - _out3 = __lasx_xvadd_h(_in3, _in4); \
> - _out4 = __lasx_xvsub_h(_in3, _in4); \
> - _out5 = __lasx_xvsub_h(_in2, _in5); \
> - _out6 = __lasx_xvsub_h(_in1, _in6); \
> - _out7 = __lasx_xvsub_h(_in0, _in7); \
> -}
> -
> -#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - _out0 = __lasx_xvadd_w(_in0, _in7); \
> - _out1 = __lasx_xvadd_w(_in1, _in6); \
> - _out2 = __lasx_xvadd_w(_in2, _in5); \
> - _out3 = __lasx_xvadd_w(_in3, _in4); \
> - _out4 = __lasx_xvsub_w(_in3, _in4); \
> - _out5 = __lasx_xvsub_w(_in2, _in5); \
> - _out6 = __lasx_xvsub_w(_in1, _in6); \
> - _out7 = __lasx_xvsub_w(_in0, _in7); \
> -}
> -
> -#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
> -{ \
> - _out0 = __lasx_xvadd_d(_in0, _in7); \
> - _out1 = __lasx_xvadd_d(_in1, _in6); \
> - _out2 = __lasx_xvadd_d(_in2, _in5); \
> - _out3 = __lasx_xvadd_d(_in3, _in4); \
> - _out4 = __lasx_xvsub_d(_in3, _in4); \
> - _out5 = __lasx_xvsub_d(_in2, _in5); \
> - _out6 = __lasx_xvsub_d(_in1, _in6); \
> - _out7 = __lasx_xvsub_d(_in0, _in7); \
> -}
> -
> -#endif //LASX
> +#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + _out0 = __lasx_xvadd_b(_in0, _in7); \
> + _out1 = __lasx_xvadd_b(_in1, _in6); \
> + _out2 = __lasx_xvadd_b(_in2, _in5); \
> + _out3 = __lasx_xvadd_b(_in3, _in4); \
> + _out4 = __lasx_xvsub_b(_in3, _in4); \
> + _out5 = __lasx_xvsub_b(_in2, _in5); \
> + _out6 = __lasx_xvsub_b(_in1, _in6); \
> + _out7 = __lasx_xvsub_b(_in0, _in7); \
> + }
> +
> +#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + _out0 = __lasx_xvadd_h(_in0, _in7); \
> + _out1 = __lasx_xvadd_h(_in1, _in6); \
> + _out2 = __lasx_xvadd_h(_in2, _in5); \
> + _out3 = __lasx_xvadd_h(_in3, _in4); \
> + _out4 = __lasx_xvsub_h(_in3, _in4); \
> + _out5 = __lasx_xvsub_h(_in2, _in5); \
> + _out6 = __lasx_xvsub_h(_in1, _in6); \
> + _out7 = __lasx_xvsub_h(_in0, _in7); \
> + }
> +
> +#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + _out0 = __lasx_xvadd_w(_in0, _in7); \
> + _out1 = __lasx_xvadd_w(_in1, _in6); \
> + _out2 = __lasx_xvadd_w(_in2, _in5); \
> + _out3 = __lasx_xvadd_w(_in3, _in4); \
> + _out4 = __lasx_xvsub_w(_in3, _in4); \
> + _out5 = __lasx_xvsub_w(_in2, _in5); \
> + _out6 = __lasx_xvsub_w(_in1, _in6); \
> + _out7 = __lasx_xvsub_w(_in0, _in7); \
> + }
> +
> +#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
> + _out7) \
> + { \
> + _out0 = __lasx_xvadd_d(_in0, _in7); \
> + _out1 = __lasx_xvadd_d(_in1, _in6); \
> + _out2 = __lasx_xvadd_d(_in2, _in5); \
> + _out3 = __lasx_xvadd_d(_in3, _in4); \
> + _out4 = __lasx_xvsub_d(_in3, _in4); \
> + _out5 = __lasx_xvsub_d(_in2, _in5); \
> + _out6 = __lasx_xvsub_d(_in1, _in6); \
> + _out7 = __lasx_xvsub_d(_in0, _in7); \
> + }
> +
> +#endif // LASX
>
> /*
> * =============================================================================
> @@ -1863,15 +1936,13 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
> * VP:1,2,3,4,
> * =============================================================================
> */
> -#define VECT_PRINT(RTYPE, element_num, in0, enter) \
> -{ \
> - RTYPE _tmp0 = (RTYPE)in0; \
> - int _i = 0; \
> - if (enter) \
> - printf("\nVP:"); \
> - for(_i = 0; _i < element_num; _i++) \
> - printf("%d,",_tmp0[_i]); \
> -}
> +#define VECT_PRINT(RTYPE, element_num, in0, enter) \
> + { \
> + RTYPE _tmp0 = (RTYPE)in0; \
> + int _i = 0; \
> + if (enter) printf("\nVP:"); \
> + for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
> + }
>
> #endif /* LOONGSON_INTRINSICS_H */
> #endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
* Re: [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
2022-02-21 11:11 ` Hao Chen
@ 2022-02-21 11:42 ` 殷时友
0 siblings, 0 replies; 15+ messages in thread
From: 殷时友 @ 2022-02-21 11:42 UTC (permalink / raw)
To: FFmpeg development discussions and patches
This patch set looks good to me.
> On February 21, 2022, at 7:11 PM, Hao Chen <chenhao@loongson.cn> wrote:
>
> ping
>
> any more opinions on this?
>
> On February 17, 2022, at 7:11 PM, Hao Chen wrote:
>> The loongson_intrinsics.h file is updated from v1.0.3 version
>> to v1.1.0. Some spelling mistakes are fixed and new functions are added.
>>
>> Signed-off-by: Hao Chen <chenhao@loongson.cn>
>> ---
>> libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
>> 1 file changed, 884 insertions(+), 813 deletions(-)
>>
>> diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
>> index 6e0439f829..eb256863c8 100644
>> --- a/libavutil/loongarch/loongson_intrinsics.h
>> +++ b/libavutil/loongarch/loongson_intrinsics.h
>> @@ -33,7 +33,7 @@
>> * Xiwei Gu <guxiwei-hf@loongson.cn>
>> * Lu Wang <wanglu@loongson.cn>
>> *
>> - * This file is a header file for loongarch builtin extention.
>> + * This file is a header file for loongarch builtin extension.
>> *
>> */
>> @@ -42,50 +42,50 @@
>> /**
>> * MAJOR version: Macro usage changes.
>> - * MINOR version: Add new functions, or bug fix.
>> + * MINOR version: Add new functions, or bug fixes.
>> * MICRO version: Comment changes or implementation changes.
>> */
>> #define LSOM_VERSION_MAJOR 1
>> -#define LSOM_VERSION_MINOR 0
>> -#define LSOM_VERSION_MICRO 3
>> +#define LSOM_VERSION_MINOR 1
>> +#define LSOM_VERSION_MICRO 0
>> #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
>> -{ \
>> - _OUT0 = _INS(_IN0); \
>> - _OUT1 = _INS(_IN1); \
>> -}
>> + { \
>> + _OUT0 = _INS(_IN0); \
>> + _OUT1 = _INS(_IN1); \
>> + }
>> #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
>> -{ \
>> - _OUT0 = _INS(_IN0, _IN1); \
>> - _OUT1 = _INS(_IN2, _IN3); \
>> -}
>> + { \
>> + _OUT0 = _INS(_IN0, _IN1); \
>> + _OUT1 = _INS(_IN2, _IN3); \
>> + }
>> #define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
>> -{ \
>> - _OUT0 = _INS(_IN0, _IN1, _IN2); \
>> - _OUT1 = _INS(_IN3, _IN4, _IN5); \
>> -}
>> + { \
>> + _OUT0 = _INS(_IN0, _IN1, _IN2); \
>> + _OUT1 = _INS(_IN3, _IN4, _IN5); \
>> + }
>> #define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
>> -{ \
>> - DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
>> - DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
>> -}
>> -
>> -#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
>> - _OUT0, _OUT1, _OUT2, _OUT3) \
>> -{ \
>> - DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
>> - DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
>> -}
>> -
>> -#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
>> - _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
>> -{ \
>> - DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
>> - DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
>> -}
>> + { \
>> + DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
>> + DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
>> + }
>> +
>> +#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
>> + _OUT1, _OUT2, _OUT3) \
>> + { \
>> + DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
>> + DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
>> + }
>> +
>> +#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
>> + _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
>> + { \
>> + DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
>> + DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
>> + }
>> #ifdef __loongarch_sx
>> #include <lsxintrin.h>
>> @@ -94,11 +94,11 @@
>> * Description : Dot product & addition of byte vector elements
>> * Arguments : Inputs - in_c, in_h, in_l
>> * Outputs - out
>> - * Retrun Type - halfword
>> + * Return Type - halfword
>> * Details : Signed byte elements from in_h are multiplied by
>> * signed byte elements from in_l, and then added adjacent to
>> * each other to get results with the twice size of input.
>> - * Then the results plus to signed half word elements from in_c.
>> + * Then the results plus to signed half-word elements from in_c.
>> * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
>> * in_c : 1,2,3,4, 1,2,3,4
>> * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
>> @@ -106,13 +106,13 @@
>> * out : 23,40,41,26, 23,40,41,26
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
>> + __m128i in_l) {
>> + __m128i out;
>> - out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
>> - out = __lsx_vmaddwod_h_b(out, in_h, in_l);
>> - return out;
>> + out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
>> + out = __lsx_vmaddwod_h_b(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -120,35 +120,61 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l
>> * Description : Dot product & addition of byte vector elements
>> * Arguments : Inputs - in_c, in_h, in_l
>> * Outputs - out
>> - * Retrun Type - halfword
>> + * Return Type - halfword
>> * Details : Unsigned byte elements from in_h are multiplied by
>> * unsigned byte elements from in_l, and then added adjacent to
>> * each other to get results with the twice size of input.
>> - * The results plus to signed half word elements from in_c.
>> - * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
>> + * The results plus to signed half-word elements from in_c.
>> + * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
>> * in_c : 1,2,3,4, 1,2,3,4
>> * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
>> * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
>> * out : 23,40,41,26, 23,40,41,26
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
>> + __m128i in_l) {
>> + __m128i out;
>> +
>> + out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
>> + out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
>> + return out;
>> +}
>> +
>> +/*
>> + * =============================================================================
>> + * Description : Dot product & addition of byte vector elements
>> + * Arguments : Inputs - in_c, in_h, in_l
>> + * Outputs - out
>> + * Return Type - halfword
>> + * Details : Unsigned byte elements from in_h are multiplied by
>> + * signed byte elements from in_l, and then added adjacent to
>> + * each other to get results with the twice size of input.
>> + * The results plus to signed half-word elements from in_c.
>> + * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
>> + * in_c : 1,1,1,1, 1,1,1,1
>> + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
>> + * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
>> + * out : -4,-24,-60,-112, 6,26,62,114
>> + * =============================================================================
>> + */
>> +static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
>> + __m128i in_l) {
>> + __m128i out;
>> - out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
>> - out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
>> - return out;
>> + out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
>> + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> * =============================================================================
>> - * Description : Dot product & addition of half word vector elements
>> + * Description : Dot product & addition of half-word vector elements
>> * Arguments : Inputs - in_c, in_h, in_l
>> * Outputs - out
>> - * Retrun Type - __m128i
>> - * Details : Signed half word elements from in_h are multiplied by
>> - * signed half word elements from in_l, and then added adjacent to
>> + * Return Type - __m128i
>> + * Details : Signed half-word elements from in_h are multiplied by
>> + * signed half-word elements from in_l, and then added adjacent to
>> * each other to get results with the twice size of input.
>> * Then the results plus to signed word elements from in_c.
>> * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
>> @@ -158,13 +184,13 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_
>> * out : 23,40,41,26
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
>> + __m128i in_l) {
>> + __m128i out;
>> - out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
>> - out = __lsx_vmaddwod_w_h(out, in_h, in_l);
>> - return out;
>> + out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
>> + out = __lsx_vmaddwod_w_h(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -172,7 +198,7 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
>> * Description : Dot product of byte vector elements
>> * Arguments : Inputs - in_h, in_l
>> * Outputs - out
>> - * Retrun Type - halfword
>> + * Return Type - halfword
>> * Details : Signed byte elements from in_h are multiplied by
>> * signed byte elements from in_l, and then added adjacent to
>> * each other to get results with the twice size of input.
>> @@ -182,13 +208,12 @@ static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l
>> * out : 22,38,38,22, 22,38,38,22
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
>> + __m128i out;
>> - out = __lsx_vmulwev_h_b(in_h, in_l);
>> - out = __lsx_vmaddwod_h_b(out, in_h, in_l);
>> - return out;
>> + out = __lsx_vmulwev_h_b(in_h, in_l);
>> + out = __lsx_vmaddwod_h_b(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -196,7 +221,7 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
>> * Description : Dot product of byte vector elements
>> * Arguments : Inputs - in_h, in_l
>> * Outputs - out
>> - * Retrun Type - halfword
>> + * Return Type - halfword
>> * Details : Unsigned byte elements from in_h are multiplied by
>> * unsigned byte elements from in_l, and then added adjacent to
>> * each other to get results with the twice size of input.
>> @@ -206,13 +231,12 @@ static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
>> * out : 22,38,38,22, 22,38,38,22
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
>> + __m128i out;
>> - out = __lsx_vmulwev_h_bu(in_h, in_l);
>> - out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
>> - return out;
>> + out = __lsx_vmulwev_h_bu(in_h, in_l);
>> + out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -220,7 +244,7 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
>> * Description : Dot product of byte vector elements
>> * Arguments : Inputs - in_h, in_l
>> * Outputs - out
>> - * Retrun Type - halfword
>> + * Return Type - halfword
>> * Details : Unsigned byte elements from in_h are multiplied by
>> * signed byte elements from in_l, and then added adjacent to
>> * each other to get results with the twice size of input.
>> @@ -230,13 +254,12 @@ static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
>> * out : 22,38,38,22, 22,38,38,6
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
>> + __m128i out;
>> - out = __lsx_vmulwev_h_bu_b(in_h, in_l);
>> - out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
>> - return out;
>> + out = __lsx_vmulwev_h_bu_b(in_h, in_l);
>> + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -244,7 +267,7 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
>> * Description : Dot product of byte vector elements
>> * Arguments : Inputs - in_h, in_l
>> * Outputs - out
>> - * Retrun Type - halfword
>> + * Return Type - halfword
>> * Details : Signed byte elements from in_h are multiplied by
>> * signed byte elements from in_l, and then added adjacent to
>> * each other to get results with the twice size of input.
>> @@ -254,19 +277,19 @@ static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
>> * out : 22,38,38,22
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
>> + __m128i out;
>> - out = __lsx_vmulwev_w_h(in_h, in_l);
>> - out = __lsx_vmaddwod_w_h(out, in_h, in_l);
>> - return out;
>> + out = __lsx_vmulwev_w_h(in_h, in_l);
>> + out = __lsx_vmaddwod_w_h(out, in_h, in_l);
>> + return out;
>> }
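
Another quick sketch (mine, names illustrative, same include as above):
chaining __lsx_vdp2_w_h with __lsx_vdp2add_w_h to widen and sum products
over two pairs of halfword vectors.

    /* Pairwise products of a0*b0 widened to words, then a1*b1 accumulated. */
    static inline __m128i dot2_w(__m128i a0, __m128i b0, __m128i a1, __m128i b1)
    {
        __m128i sum = __lsx_vdp2_w_h(a0, b0);
        return __lsx_vdp2add_w_h(sum, a1, b1);
    }
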
>> /*
>> * =============================================================================
>> * Description : Clip all halfword elements of input vector between min & max
>> - * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
>> + * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
>> + * (_in))
>> * Arguments : Inputs - _in (input vector)
>> * - min (min threshold)
>> * - max (max threshold)
>> @@ -279,13 +302,12 @@ static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
>> * out : 1,2,9,9, 1,9,9,9
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
>> + __m128i out;
>> - out = __lsx_vmax_h(min, _in);
>> - out = __lsx_vmin_h(max, out);
>> - return out;
>> + out = __lsx_vmax_h(min, _in);
>> + out = __lsx_vmin_h(max, out);
>> + return out;
>> }
>> /*
>> @@ -293,20 +315,19 @@ static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
>> * Description : Set each element of vector between 0 and 255
>> * Arguments : Inputs - _in
>> * Outputs - out
>> - * Retrun Type - halfword
>> + * Return Type - halfword
>> * Details : Signed halfword elements from _in are clamped between 0 and 255.
>> * Example : out = __lsx_vclip255_h(_in)
>> * _in : -8,255,280,249, -8,255,280,249
>> * out : 0,255,255,249, 0,255,255,249
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vclip255_h(__m128i _in)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vclip255_h(__m128i _in) {
>> + __m128i out;
>> - out = __lsx_vmaxi_h(_in, 0);
>> - out = __lsx_vsat_hu(out, 7);
>> - return out;
>> + out = __lsx_vmaxi_h(_in, 0);
>> + out = __lsx_vsat_hu(out, 7);
>> + return out;
>> }
>> /*
>> @@ -314,20 +335,19 @@ static inline __m128i __lsx_vclip255_h(__m128i _in)
>> * Description : Set each element of vector between 0 and 255
>> * Arguments : Inputs - _in
>> * Outputs - out
>> - * Retrun Type - word
>> + * Return Type - word
>> * Details : Signed word elements from _in are clamped between 0 and 255.
>> * Example : out = __lsx_vclip255_w(_in)
>> * _in : -8,255,280,249
>> * out : 0,255,255,249
>> * =============================================================================
>> */
>> -static inline __m128i __lsx_vclip255_w(__m128i _in)
>> -{
>> - __m128i out;
>> +static inline __m128i __lsx_vclip255_w(__m128i _in) {
>> + __m128i out;
>> - out = __lsx_vmaxi_w(_in, 0);
>> - out = __lsx_vsat_wu(out, 7);
>> - return out;
>> + out = __lsx_vmaxi_w(_in, 0);
>> + out = __lsx_vsat_wu(out, 7);
>> + return out;
>> }
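
Sketch (mine): the clip255 helpers are what you reach for right before
packing filtered halfword samples back down to 8-bit pixels.

    /* Clamp signed halfword samples to [0, 255] before narrowing/storing. */
    static inline __m128i clamp_pixels_h(__m128i samples)
    {
        return __lsx_vclip255_h(samples);
    }
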
>> /*
>> @@ -343,12 +363,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * _in1(out) : 1,2,3,4
>> * =============================================================================
>> */
>> -#define LSX_SWAP(_in0, _in1) \
>> -{ \
>> - _in0 = __lsx_vxor_v(_in0, _in1); \
>> - _in1 = __lsx_vxor_v(_in0, _in1); \
>> - _in0 = __lsx_vxor_v(_in0, _in1); \
>> -} \
>> +#define LSX_SWAP(_in0, _in1) \
>> + { \
>> + _in0 = __lsx_vxor_v(_in0, _in1); \
>> + _in1 = __lsx_vxor_v(_in0, _in1); \
>> + _in0 = __lsx_vxor_v(_in0, _in1); \
>> + }
>> /*
>> * =============================================================================
>> @@ -364,25 +384,27 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * =============================================================================
>> */
>> #define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> + { \
>> __m128i _t0, _t1, _t2, _t3; \
>> \
>> - _t0 = __lsx_vilvl_w(_in1, _in0); \
>> - _t1 = __lsx_vilvh_w(_in1, _in0); \
>> - _t2 = __lsx_vilvl_w(_in3, _in2); \
>> - _t3 = __lsx_vilvh_w(_in3, _in2); \
>> + _t0 = __lsx_vilvl_w(_in1, _in0); \
>> + _t1 = __lsx_vilvh_w(_in1, _in0); \
>> + _t2 = __lsx_vilvl_w(_in3, _in2); \
>> + _t3 = __lsx_vilvh_w(_in3, _in2); \
>> _out0 = __lsx_vilvl_d(_t2, _t0); \
>> _out1 = __lsx_vilvh_d(_t2, _t0); \
>> _out2 = __lsx_vilvl_d(_t3, _t1); \
>> _out3 = __lsx_vilvh_d(_t3, _t1); \
>> -}
>> + }
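
Usage sketch (mine): since the macro reads all four inputs into temporaries
before writing any output, it is safe to transpose a 4x4 block of word
elements in place.

    static inline void transpose4x4_w(__m128i *r0, __m128i *r1,
                                      __m128i *r2, __m128i *r3)
    {
        LSX_TRANSPOSE4x4_W(*r0, *r1, *r2, *r3, *r0, *r1, *r2, *r3);
    }
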
>> /*
>> * =============================================================================
>> * Description : Transpose 8x8 block with byte elements in vectors
>> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
>> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
>> - * Details : The rows of the matrix become columns, and the columns become rows.
>> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
>> + * _out7
>> + * Details : The rows of the matrix become columns, and the columns
>> + * become rows.
>> * Example : LSX_TRANSPOSE8x8_B
>> * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
>> * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
>> @@ -403,34 +425,35 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
>> * =============================================================================
>> */
>> -#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - __m128i zero = {0}; \
>> - __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
>> - __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
>> - \
>> - _t0 = __lsx_vilvl_b(_in2, _in0); \
>> - _t1 = __lsx_vilvl_b(_in3, _in1); \
>> - _t2 = __lsx_vilvl_b(_in6, _in4); \
>> - _t3 = __lsx_vilvl_b(_in7, _in5); \
>> - _t4 = __lsx_vilvl_b(_t1, _t0); \
>> - _t5 = __lsx_vilvh_b(_t1, _t0); \
>> - _t6 = __lsx_vilvl_b(_t3, _t2); \
>> - _t7 = __lsx_vilvh_b(_t3, _t2); \
>> - _out0 = __lsx_vilvl_w(_t6, _t4); \
>> - _out2 = __lsx_vilvh_w(_t6, _t4); \
>> - _out4 = __lsx_vilvl_w(_t7, _t5); \
>> - _out6 = __lsx_vilvh_w(_t7, _t5); \
>> - _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
>> - _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
>> - _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
>> - _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
>> -}
>> +#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + __m128i zero = { 0 }; \
>> + __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
>> + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
>> + \
>> + _t0 = __lsx_vilvl_b(_in2, _in0); \
>> + _t1 = __lsx_vilvl_b(_in3, _in1); \
>> + _t2 = __lsx_vilvl_b(_in6, _in4); \
>> + _t3 = __lsx_vilvl_b(_in7, _in5); \
>> + _t4 = __lsx_vilvl_b(_t1, _t0); \
>> + _t5 = __lsx_vilvh_b(_t1, _t0); \
>> + _t6 = __lsx_vilvl_b(_t3, _t2); \
>> + _t7 = __lsx_vilvh_b(_t3, _t2); \
>> + _out0 = __lsx_vilvl_w(_t6, _t4); \
>> + _out2 = __lsx_vilvh_w(_t6, _t4); \
>> + _out4 = __lsx_vilvl_w(_t7, _t5); \
>> + _out6 = __lsx_vilvh_w(_t7, _t5); \
>> + _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \
>> + _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \
>> + _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \
>> + _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \
>> + }
>> /*
>> * =============================================================================
>> - * Description : Transpose 8x8 block with half word elements in vectors
>> + * Description : Transpose 8x8 block with half-word elements in vectors
>> * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
>> * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
>> * Details :
>> @@ -445,37 +468,38 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77
>> * =============================================================================
>> */
>> -#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
>> - \
>> - _s0 = __lsx_vilvl_h(_in6, _in4); \
>> - _s1 = __lsx_vilvl_h(_in7, _in5); \
>> - _t0 = __lsx_vilvl_h(_s1, _s0); \
>> - _t1 = __lsx_vilvh_h(_s1, _s0); \
>> - _s0 = __lsx_vilvh_h(_in6, _in4); \
>> - _s1 = __lsx_vilvh_h(_in7, _in5); \
>> - _t2 = __lsx_vilvl_h(_s1, _s0); \
>> - _t3 = __lsx_vilvh_h(_s1, _s0); \
>> - _s0 = __lsx_vilvl_h(_in2, _in0); \
>> - _s1 = __lsx_vilvl_h(_in3, _in1); \
>> - _t4 = __lsx_vilvl_h(_s1, _s0); \
>> - _t5 = __lsx_vilvh_h(_s1, _s0); \
>> - _s0 = __lsx_vilvh_h(_in2, _in0); \
>> - _s1 = __lsx_vilvh_h(_in3, _in1); \
>> - _t6 = __lsx_vilvl_h(_s1, _s0); \
>> - _t7 = __lsx_vilvh_h(_s1, _s0); \
>> - \
>> - _out0 = __lsx_vpickev_d(_t0, _t4); \
>> - _out2 = __lsx_vpickev_d(_t1, _t5); \
>> - _out4 = __lsx_vpickev_d(_t2, _t6); \
>> - _out6 = __lsx_vpickev_d(_t3, _t7); \
>> - _out1 = __lsx_vpickod_d(_t0, _t4); \
>> - _out3 = __lsx_vpickod_d(_t1, _t5); \
>> - _out5 = __lsx_vpickod_d(_t2, _t6); \
>> - _out7 = __lsx_vpickod_d(_t3, _t7); \
>> -}
>> +#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
>> + \
>> + _s0 = __lsx_vilvl_h(_in6, _in4); \
>> + _s1 = __lsx_vilvl_h(_in7, _in5); \
>> + _t0 = __lsx_vilvl_h(_s1, _s0); \
>> + _t1 = __lsx_vilvh_h(_s1, _s0); \
>> + _s0 = __lsx_vilvh_h(_in6, _in4); \
>> + _s1 = __lsx_vilvh_h(_in7, _in5); \
>> + _t2 = __lsx_vilvl_h(_s1, _s0); \
>> + _t3 = __lsx_vilvh_h(_s1, _s0); \
>> + _s0 = __lsx_vilvl_h(_in2, _in0); \
>> + _s1 = __lsx_vilvl_h(_in3, _in1); \
>> + _t4 = __lsx_vilvl_h(_s1, _s0); \
>> + _t5 = __lsx_vilvh_h(_s1, _s0); \
>> + _s0 = __lsx_vilvh_h(_in2, _in0); \
>> + _s1 = __lsx_vilvh_h(_in3, _in1); \
>> + _t6 = __lsx_vilvl_h(_s1, _s0); \
>> + _t7 = __lsx_vilvh_h(_s1, _s0); \
>> + \
>> + _out0 = __lsx_vpickev_d(_t0, _t4); \
>> + _out2 = __lsx_vpickev_d(_t1, _t5); \
>> + _out4 = __lsx_vpickev_d(_t2, _t6); \
>> + _out6 = __lsx_vpickev_d(_t3, _t7); \
>> + _out1 = __lsx_vpickod_d(_t0, _t4); \
>> + _out3 = __lsx_vpickod_d(_t1, _t5); \
>> + _out5 = __lsx_vpickod_d(_t2, _t6); \
>> + _out7 = __lsx_vpickod_d(_t3, _t7); \
>> + }
>> /*
>> * =============================================================================
>> @@ -483,7 +507,8 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block)
>> * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
>> * Return Type - as per RTYPE
>> - * Details : The rows of the matrix become columns, and the columns become rows.
>> + * Details : The rows of the matrix become columns, and the columns become
>> + * rows.
>> * Example : LSX_TRANSPOSE8x4_B
>> * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
>> * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
>> @@ -500,26 +525,26 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
>> * =============================================================================
>> */
>> -#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3) \
>> -{ \
>> - __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> - \
>> - _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
>> - _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
>> - _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
>> - _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
>> - _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
>> - \
>> - _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
>> - _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
>> - _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
>> - \
>> - _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
>> - _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
>> - _out1 = __lsx_vilvh_d(_out2, _out0); \
>> - _out3 = __lsx_vilvh_d(_out0, _out2); \
>> -}
>> +#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3) \
>> + { \
>> + __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> + \
>> + _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
>> + _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
>> + _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
>> + _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
>> + _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
>> + \
>> + _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
>> + _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
>> + _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
>> + \
>> + _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
>> + _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
>> + _out1 = __lsx_vilvh_d(_out2, _out0); \
>> + _out3 = __lsx_vilvh_d(_out0, _out2); \
>> + }
>> /*
>> * =============================================================================
>> @@ -547,29 +572,30 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * 120,121,122,123,124,125,126,127
>> * =============================================================================
>> */
>> -#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8, \
>> - _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
>> - _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
>> -{ \
>> - __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
>> - __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
>> - DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
>> - _tmp0, _tmp1, _tmp2, _tmp3); \
>> - DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
>> - _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
>> - DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
>> - DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
>> - DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
>> - DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
>> - DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
>> - DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
>> - DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
>> - DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
>> - DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
>> - DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
>> - DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
>> - DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
>> -}
>> +#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
>> + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
>> + _out6, _out7) \
>> + { \
>> + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
>> + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
>> + DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
>> + _tmp0, _tmp1, _tmp2, _tmp3); \
>> + DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
>> + _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
>> + DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
>> + DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
>> + DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
>> + DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
>> + DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
>> + DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
>> + DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
>> + DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
>> + DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
>> + DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
>> + DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
>> + DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
>> + }
>> /*
>> * =============================================================================
>> @@ -585,33 +611,33 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * =============================================================================
>> */
>> #define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> + { \
>> _out0 = __lsx_vadd_b(_in0, _in3); \
>> _out1 = __lsx_vadd_b(_in1, _in2); \
>> _out2 = __lsx_vsub_b(_in1, _in2); \
>> _out3 = __lsx_vsub_b(_in0, _in3); \
>> -}
>> + }
>> #define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> + { \
>> _out0 = __lsx_vadd_h(_in0, _in3); \
>> _out1 = __lsx_vadd_h(_in1, _in2); \
>> _out2 = __lsx_vsub_h(_in1, _in2); \
>> _out3 = __lsx_vsub_h(_in0, _in3); \
>> -}
>> + }
>> #define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> + { \
>> _out0 = __lsx_vadd_w(_in0, _in3); \
>> _out1 = __lsx_vadd_w(_in1, _in2); \
>> _out2 = __lsx_vsub_w(_in1, _in2); \
>> _out3 = __lsx_vsub_w(_in0, _in3); \
>> -}
>> + }
>> #define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> + { \
>> _out0 = __lsx_vadd_d(_in0, _in3); \
>> _out1 = __lsx_vadd_d(_in1, _in2); \
>> _out2 = __lsx_vsub_d(_in1, _in2); \
>> _out3 = __lsx_vsub_d(_in0, _in3); \
>> -}
>> + }
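
Sketch (mine, names illustrative): one add/sub butterfly stage on halfword
rows, as used in the even/odd decomposition of the transform paths.

    /* o0 = i0+i3, o1 = i1+i2, o2 = i1-i2, o3 = i0-i3 */
    static inline void butterfly4_h(__m128i i0, __m128i i1, __m128i i2,
                                    __m128i i3, __m128i *o0, __m128i *o1,
                                    __m128i *o2, __m128i *o3)
    {
        LSX_BUTTERFLY_4_H(i0, i1, i2, i3, *o0, *o1, *o2, *o3);
    }
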
>> /*
>> * =============================================================================
>> @@ -630,59 +656,63 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * _out7 = _in0 - _in7;
>> * =============================================================================
>> */
>> -#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - _out0 = __lsx_vadd_b(_in0, _in7); \
>> - _out1 = __lsx_vadd_b(_in1, _in6); \
>> - _out2 = __lsx_vadd_b(_in2, _in5); \
>> - _out3 = __lsx_vadd_b(_in3, _in4); \
>> - _out4 = __lsx_vsub_b(_in3, _in4); \
>> - _out5 = __lsx_vsub_b(_in2, _in5); \
>> - _out6 = __lsx_vsub_b(_in1, _in6); \
>> - _out7 = __lsx_vsub_b(_in0, _in7); \
>> -}
>> -
>> -#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - _out0 = __lsx_vadd_h(_in0, _in7); \
>> - _out1 = __lsx_vadd_h(_in1, _in6); \
>> - _out2 = __lsx_vadd_h(_in2, _in5); \
>> - _out3 = __lsx_vadd_h(_in3, _in4); \
>> - _out4 = __lsx_vsub_h(_in3, _in4); \
>> - _out5 = __lsx_vsub_h(_in2, _in5); \
>> - _out6 = __lsx_vsub_h(_in1, _in6); \
>> - _out7 = __lsx_vsub_h(_in0, _in7); \
>> -}
>> -
>> -#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - _out0 = __lsx_vadd_w(_in0, _in7); \
>> - _out1 = __lsx_vadd_w(_in1, _in6); \
>> - _out2 = __lsx_vadd_w(_in2, _in5); \
>> - _out3 = __lsx_vadd_w(_in3, _in4); \
>> - _out4 = __lsx_vsub_w(_in3, _in4); \
>> - _out5 = __lsx_vsub_w(_in2, _in5); \
>> - _out6 = __lsx_vsub_w(_in1, _in6); \
>> - _out7 = __lsx_vsub_w(_in0, _in7); \
>> -}
>> -
>> -#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - _out0 = __lsx_vadd_d(_in0, _in7); \
>> - _out1 = __lsx_vadd_d(_in1, _in6); \
>> - _out2 = __lsx_vadd_d(_in2, _in5); \
>> - _out3 = __lsx_vadd_d(_in3, _in4); \
>> - _out4 = __lsx_vsub_d(_in3, _in4); \
>> - _out5 = __lsx_vsub_d(_in2, _in5); \
>> - _out6 = __lsx_vsub_d(_in1, _in6); \
>> - _out7 = __lsx_vsub_d(_in0, _in7); \
>> -}
>> -
>> -#endif //LSX
>> +#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + _out0 = __lsx_vadd_b(_in0, _in7); \
>> + _out1 = __lsx_vadd_b(_in1, _in6); \
>> + _out2 = __lsx_vadd_b(_in2, _in5); \
>> + _out3 = __lsx_vadd_b(_in3, _in4); \
>> + _out4 = __lsx_vsub_b(_in3, _in4); \
>> + _out5 = __lsx_vsub_b(_in2, _in5); \
>> + _out6 = __lsx_vsub_b(_in1, _in6); \
>> + _out7 = __lsx_vsub_b(_in0, _in7); \
>> + }
>> +
>> +#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + _out0 = __lsx_vadd_h(_in0, _in7); \
>> + _out1 = __lsx_vadd_h(_in1, _in6); \
>> + _out2 = __lsx_vadd_h(_in2, _in5); \
>> + _out3 = __lsx_vadd_h(_in3, _in4); \
>> + _out4 = __lsx_vsub_h(_in3, _in4); \
>> + _out5 = __lsx_vsub_h(_in2, _in5); \
>> + _out6 = __lsx_vsub_h(_in1, _in6); \
>> + _out7 = __lsx_vsub_h(_in0, _in7); \
>> + }
>> +
>> +#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + _out0 = __lsx_vadd_w(_in0, _in7); \
>> + _out1 = __lsx_vadd_w(_in1, _in6); \
>> + _out2 = __lsx_vadd_w(_in2, _in5); \
>> + _out3 = __lsx_vadd_w(_in3, _in4); \
>> + _out4 = __lsx_vsub_w(_in3, _in4); \
>> + _out5 = __lsx_vsub_w(_in2, _in5); \
>> + _out6 = __lsx_vsub_w(_in1, _in6); \
>> + _out7 = __lsx_vsub_w(_in0, _in7); \
>> + }
>> +
>> +#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + _out0 = __lsx_vadd_d(_in0, _in7); \
>> + _out1 = __lsx_vadd_d(_in1, _in6); \
>> + _out2 = __lsx_vadd_d(_in2, _in5); \
>> + _out3 = __lsx_vadd_d(_in3, _in4); \
>> + _out4 = __lsx_vsub_d(_in3, _in4); \
>> + _out5 = __lsx_vsub_d(_in2, _in5); \
>> + _out6 = __lsx_vsub_d(_in1, _in6); \
>> + _out7 = __lsx_vsub_d(_in0, _in7); \
>> + }
>> +
>> +#endif // LSX
>> #ifdef __loongarch_asx
>> #include <lasxintrin.h>
>> @@ -700,13 +730,12 @@ static inline __m128i __lsx_vclip255_w(__m128i _in)
>> * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmulwev_h_bu(in_h, in_l);
>> - out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmulwev_h_bu(in_h, in_l);
>> + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -718,18 +747,17 @@ static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
>> * Details : Signed byte elements from in_h are multiplied with
>> * signed byte elements from in_l producing a result
>> * twice the size of input i.e. signed halfword.
>> - * Then this iniplication results of adjacent odd-even elements
>> + * Then the multiplication results of adjacent odd-even elements
>> * are added to the out vector
>> * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmulwev_h_b(in_h, in_l);
>> - out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmulwev_h_b(in_h, in_l);
>> + out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -749,13 +777,12 @@ static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
>> * out : 22,38,38,22, 22,38,38,22
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmulwev_w_h(in_h, in_l);
>> - out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmulwev_w_h(in_h, in_l);
>> + out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -763,22 +790,21 @@ static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
>> * Description : Dot product of word vector elements
>> * Arguments : Inputs - in_h, in_l
>> * Output - out
>> - * Retrun Type - signed double
>> + * Return Type - signed double
>> * Details : Signed word elements from in_h are multiplied with
>> * signed word elements from in_l producing a result
>> - * twice the size of input i.e. signed double word.
>> + * twice the size of input i.e. signed double-word.
>> * Then this multiplied results of adjacent odd-even elements
>> * are added to the out vector.
>> * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmulwev_d_w(in_h, in_l);
>> - out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmulwev_d_w(in_h, in_l);
>> + out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -795,13 +821,12 @@ static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
>> * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
>> - out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
>> + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -809,7 +834,7 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
>> * Description : Dot product & addition of byte vector elements
>> * Arguments : Inputs - in_h, in_l
>> * Output - out
>> - * Retrun Type - halfword
>> + * Return Type - halfword
>> * Details : Signed byte elements from in_h are multiplied with
>> * signed byte elements from in_l producing a result
>> * twice the size of input i.e. signed halfword.
>> @@ -818,13 +843,59 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
>> * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i out;
>> +
>> + out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
>> + out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
>> + return out;
>> +}
>> +
>> +/*
>> + * =============================================================================
>> + * Description : Dot product & addition of byte vector elements
>> + * Arguments : Inputs - in_h, in_l
>> + * Output - out
>> + * Return Type - halfword
>> + * Details : Unsigned byte elements from in_h are multiplied with
>> + * unsigned byte elements from in_l producing a result
>> + * twice the size of input i.e. signed halfword.
>> + * Then the multiplication results of adjacent odd-even elements
>> + * are added to the in_c vector.
>> + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
>> + * =============================================================================
>> + */
>> +static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i out;
>> +
>> + out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
>> + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
>> + return out;
>> +}
>> +
>> +/*
>> + * =============================================================================
>> + * Description : Dot product & addition of byte vector elements
>> + * Arguments : Inputs - in_h, in_l
>> + * Output - out
>> + * Return Type - halfword
>> + * Details : Unsigned byte elements from in_h are multiplied with
>> + * signed byte elements from in_l producing a result
>> + * twice the size of input i.e. signed halfword.
>> + * Then the multiplication results of adjacent odd-even elements
>> + * are added to the in_c vector.
>> + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
>> + * =============================================================================
>> + */
>> +static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
>> - out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
>> + out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
>> + return out;
>> }
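
Same pattern as the LSX sketch earlier, now on 256-bit vectors (mine, names
illustrative): unsigned pixels times signed coefficients accumulated into
halfword sums with the newly added wrapper.

    static inline __m256i filter_acc_lasx(__m256i acc, __m256i pix, __m256i coef)
    {
        return __lasx_xvdp2add_h_bu_b(acc, pix, coef);
    }
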
>> /*
>> @@ -845,13 +916,13 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_
>> * out : 23,40,41,26, 23,40,41,26
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
>> - out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
>> + out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -868,13 +939,13 @@ static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in
>> * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
>> - out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
>> + out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -891,13 +962,13 @@ static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i i
>> * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
>> - out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
>> - return out;
>> + out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
>> + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
>> + return out;
>> }
>> /*
>> @@ -915,14 +986,14 @@ static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i
>> * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> -
>> - out = __lasx_xvmulwev_h_bu(in_h, in_l);
>> - out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
>> - out = __lasx_xvsub_h(in_c, out);
>> - return out;
>> +static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i out;
>> +
>> + out = __lasx_xvmulwev_h_bu(in_h, in_l);
>> + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
>> + out = __lasx_xvsub_h(in_c, out);
>> + return out;
>> }
>> /*
>> @@ -944,14 +1015,14 @@ static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i i
>> * out : -7,-3,0,0, 0,-1,0,-1
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> -
>> - out = __lasx_xvmulwev_w_h(in_h, in_l);
>> - out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
>> - out = __lasx_xvsub_w(in_c, out);
>> - return out;
>> +static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i out;
>> +
>> + out = __lasx_xvmulwev_w_h(in_h, in_l);
>> + out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
>> + out = __lasx_xvsub_w(in_c, out);
>> + return out;
>> }
>> /*
>> @@ -960,10 +1031,10 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
>> * Arguments : Inputs - in_h, in_l
>> * Output - out
>> * Return Type - signed word
>> - * Details : Signed halfword elements from in_h are iniplied with
>> + * Details : Signed halfword elements from in_h are multiplied with
>> * signed halfword elements from in_l producing a result
>> * four times the size of input i.e. signed doubleword.
>> - * Then this iniplication results of four adjacent elements
>> + * Then the multiplication results of four adjacent elements
>> * are added together and stored to the out vector.
>> * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
>> * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
>> @@ -971,14 +1042,13 @@ static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in
>> * out : -2,0,1,1
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> -
>> - out = __lasx_xvmulwev_w_h(in_h, in_l);
>> - out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
>> - out = __lasx_xvhaddw_d_w(out, out);
>> - return out;
>> +static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> +
>> + out = __lasx_xvmulwev_w_h(in_h, in_l);
>> + out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
>> + out = __lasx_xvhaddw_d_w(out, out);
>> + return out;
>> }
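
Sketch (mine): accumulating the 4-term products across loop iterations; this
assumes the plain __lasx_xvadd_d builtin, which is not part of this header.

    static inline __m256i dot4_acc_d(__m256i acc, __m256i a, __m256i b)
    {
        return __lasx_xvadd_d(acc, __lasx_xvdp4_d_h(a, b));
    }
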
>> /*
>> @@ -993,13 +1063,12 @@ static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
>> * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvilvh_b(in_h, in_l);
>> - out = __lasx_xvhaddw_h_b(out, out);
>> - return out;
>> + out = __lasx_xvilvh_b(in_h, in_l);
>> + out = __lasx_xvhaddw_h_b(out, out);
>> + return out;
>> }
>> /*
>> @@ -1017,13 +1086,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
>> * out : 1,0,0,-1, 1,0,0, 2
>> * =============================================================================
>> */
>> - static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvilvh_h(in_h, in_l);
>> - out = __lasx_xvhaddw_w_h(out, out);
>> - return out;
>> + out = __lasx_xvilvh_h(in_h, in_l);
>> + out = __lasx_xvhaddw_w_h(out, out);
>> + return out;
>> }
>> /*
>> @@ -1038,13 +1106,12 @@ static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
>> * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvilvl_b(in_h, in_l);
>> - out = __lasx_xvhaddw_h_b(out, out);
>> - return out;
>> + out = __lasx_xvilvl_b(in_h, in_l);
>> + out = __lasx_xvhaddw_h_b(out, out);
>> + return out;
>> }
>> /*
>> @@ -1062,13 +1129,12 @@ static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
>> * out : 5,-1,4,2, 1,0,2,-1
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvilvl_h(in_h, in_l);
>> - out = __lasx_xvhaddw_w_h(out, out);
>> - return out;
>> + out = __lasx_xvilvl_h(in_h, in_l);
>> + out = __lasx_xvhaddw_w_h(out, out);
>> + return out;
>> }
>> /*
>> @@ -1083,13 +1149,12 @@ static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
>> * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvilvl_b(in_h, in_l);
>> - out = __lasx_xvhaddw_hu_bu(out, out);
>> - return out;
>> + out = __lasx_xvilvl_b(in_h, in_l);
>> + out = __lasx_xvhaddw_hu_bu(out, out);
>> + return out;
>> }
>> /*
>> @@ -1103,13 +1168,12 @@ static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
>> * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvsllwil_hu_bu(in_l, 0);
>> - out = __lasx_xvadd_h(in_h, out);
>> - return out;
>> + out = __lasx_xvsllwil_hu_bu(in_l, 0);
>> + out = __lasx_xvadd_h(in_h, out);
>> + return out;
>> }
>> /*
>> @@ -1126,13 +1190,12 @@ static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
>> * out : 2, 0,1,2, -1,0,1,1,
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
>> + __m256i out;
>> - out = __lasx_xvsllwil_w_h(in_l, 0);
>> - out = __lasx_xvadd_w(in_h, out);
>> - return out;
>> + out = __lasx_xvsllwil_w_h(in_l, 0);
>> + out = __lasx_xvadd_w(in_h, out);
>> + return out;
>> }
>> /*
>> @@ -1153,15 +1216,15 @@ static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
>> * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
>> -{
>> - __m256i tmp0, tmp1, out;
>> -
>> - tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
>> - tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
>> - tmp0 = __lasx_xvmul_w(tmp0, tmp1);
>> - out = __lasx_xvadd_w(tmp0, in_c);
>> - return out;
>> +static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i tmp0, tmp1, out;
>> +
>> + tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
>> + tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
>> + tmp0 = __lasx_xvmul_w(tmp0, tmp1);
>> + out = __lasx_xvadd_w(tmp0, in_c);
>> + return out;
>> }
>> /*
>> @@ -1177,15 +1240,15 @@ static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in
>> * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
>> -{
>> - __m256i tmp0, tmp1, out;
>> -
>> - tmp0 = __lasx_xvilvh_h(in_h, in_h);
>> - tmp1 = __lasx_xvilvh_h(in_l, in_l);
>> - tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
>> - out = __lasx_xvadd_w(tmp0, in_c);
>> - return out;
>> +static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
>> + __m256i in_l) {
>> + __m256i tmp0, tmp1, out;
>> +
>> + tmp0 = __lasx_xvilvh_h(in_h, in_h);
>> + tmp1 = __lasx_xvilvh_h(in_l, in_l);
>> + tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
>> + out = __lasx_xvadd_w(tmp0, in_c);
>> + return out;
>> }
>> /*
>> @@ -1203,14 +1266,13 @@ static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in
>> * out : 6,1,3,0, 0,0,1,0
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i tmp0, tmp1, out;
>> -
>> - tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
>> - tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
>> - out = __lasx_xvmul_w(tmp0, tmp1);
>> - return out;
>> +static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
>> + __m256i tmp0, tmp1, out;
>> +
>> + tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
>> + tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
>> + out = __lasx_xvmul_w(tmp0, tmp1);
>> + return out;
>> }
>> /*
>> @@ -1228,39 +1290,39 @@ static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
>> * out : 0,0,0,0, 0,0,0,1
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i tmp0, tmp1, out;
>> -
>> - tmp0 = __lasx_xvilvh_h(in_h, in_h);
>> - tmp1 = __lasx_xvilvh_h(in_l, in_l);
>> - out = __lasx_xvmulwev_w_h(tmp0, tmp1);
>> - return out;
>> +static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
>> + __m256i tmp0, tmp1, out;
>> +
>> + tmp0 = __lasx_xvilvh_h(in_h, in_h);
>> + tmp1 = __lasx_xvilvh_h(in_l, in_l);
>> + out = __lasx_xvmulwev_w_h(tmp0, tmp1);
>> + return out;
>> }
>> /*
>> * =============================================================================
>> - * Description : The low half of the vector elements are expanded and
>> - * added saturately after being doubled.
>> + * Description : The low half of in_l is zero extended to double width and
>> + * added to in_h with saturation.
>> * Arguments : Inputs - in_h, in_l
>> * Output - out
>> - * Details : The in_h vector adds the in_l vector saturately after the lower
>> - * half of the two-fold zero extension (unsigned byte to unsigned
>> - * halfword) and the results are stored to the out vector.
>> + * Details : The lower half of in_l is zero extended (unsigned byte to
>> + * unsigned halfword) and added to in_h with saturation. The
>> + * results are stored to the out vector.
>> * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
>> * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
>> - * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
>> - * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
>> + * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
>> + * 0,0,0,1
>> + * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
>> -{
>> - __m256i tmp1, out;
>> - __m256i zero = {0};
>> -
>> - tmp1 = __lasx_xvilvl_b(zero, in_l);
>> - out = __lasx_xvsadd_hu(in_h, tmp1);
>> - return out;
>> +static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
>> + __m256i tmp1, out;
>> + __m256i zero = { 0 };
>> +
>> + tmp1 = __lasx_xvilvl_b(zero, in_l);
>> + out = __lasx_xvsadd_hu(in_h, tmp1);
>> + return out;
>> }
>> /*
>> @@ -1279,13 +1341,12 @@ static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
>> * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
>> + __m256i out;
>> - out = __lasx_xvmax_h(min, in);
>> - out = __lasx_xvmin_h(max, out);
>> - return out;
>> + out = __lasx_xvmax_h(min, in);
>> + out = __lasx_xvmin_h(max, out);
>> + return out;
>> }
>> /*
>> @@ -1298,13 +1359,12 @@ static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
>> * Example : See out = __lasx_xvclip255_w(in)
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvclip255_h(__m256i in)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvclip255_h(__m256i in) {
>> + __m256i out;
>> - out = __lasx_xvmaxi_h(in, 0);
>> - out = __lasx_xvsat_hu(out, 7);
>> - return out;
>> + out = __lasx_xvmaxi_h(in, 0);
>> + out = __lasx_xvsat_hu(out, 7);
>> + return out;
>> }
>> /*
>> @@ -1319,20 +1379,19 @@ static inline __m256i __lasx_xvclip255_h(__m256i in)
>> * out : 0,255,255,249, 0,255,255,249
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvclip255_w(__m256i in)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvclip255_w(__m256i in) {
>> + __m256i out;
>> - out = __lasx_xvmaxi_w(in, 0);
>> - out = __lasx_xvsat_wu(out, 7);
>> - return out;
>> + out = __lasx_xvmaxi_w(in, 0);
>> + out = __lasx_xvsat_wu(out, 7);
>> + return out;
>> }
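
Sketch (mine): clamping word-precision sums back to the 8-bit pixel range,
e.g. after an average computed at word precision.

    static inline __m256i clamp_pixels_w(__m256i sums)
    {
        return __lasx_xvclip255_w(sums);
    }
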
>> /*
>> * =============================================================================
>> * Description : Indexed halfword element values are replicated to all
>> - * elements in output vector. If 'indx < 8' use xvsplati_l_*,
>> - * if 'indx >= 8' use xvsplati_h_*.
>> + * elements in output vector. If 'idx < 8' use xvsplati_l_*,
>> + * if 'idx >= 8' use xvsplati_h_*.
>> * Arguments : Inputs - in, idx
>> * Output - out
>> * Details : Idx element value from in vector is replicated to all
>> @@ -1344,20 +1403,19 @@ static inline __m256i __lasx_xvclip255_w(__m256i in)
>> * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
>> + __m256i out;
>> - out = __lasx_xvpermi_q(in, in, 0x02);
>> - out = __lasx_xvreplve_h(out, idx);
>> - return out;
>> + out = __lasx_xvpermi_q(in, in, 0x02);
>> + out = __lasx_xvreplve_h(out, idx);
>> + return out;
>> }
>> /*
>> * =============================================================================
>> * Description : Indexed halfword element values are replicated to all
>> - * elements in output vector. If 'indx < 8' use xvsplati_l_*,
>> - * if 'indx >= 8' use xvsplati_h_*.
>> + * elements in output vector. If 'idx < 8' use xvsplati_l_*,
>> + * if 'idx >= 8' use xvsplati_h_*.
>> * Arguments : Inputs - in, idx
>> * Output - out
>> * Details : Idx element value from in vector is replicated to all
>> @@ -1369,18 +1427,17 @@ static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
>> * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
>> * =============================================================================
>> */
>> -static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> -{
>> - __m256i out;
>> +static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
>> + __m256i out;
>> - out = __lasx_xvpermi_q(in, in, 0x13);
>> - out = __lasx_xvreplve_h(out, idx);
>> - return out;
>> + out = __lasx_xvpermi_q(in, in, 0x13);
>> + out = __lasx_xvreplve_h(out, idx);
>> + return out;
>> }
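
Sketch (mine): broadcasting one filter coefficient to every lane; per the
comment above, the _l_ variant covers indices 0..7.

    /* Replicate halfword element 3 of coefs across the whole vector. */
    static inline __m256i bcast_coef3(__m256i coefs)
    {
        return __lasx_xvsplati_l_h(coefs, 3);
    }
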
>> /*
>> * =============================================================================
>> - * Description : Transpose 4x4 block with double word elements in vectors
>> + * Description : Transpose 4x4 block with double-word elements in vectors
>> * Arguments : Inputs - _in0, _in1, _in2, _in3
>> * Outputs - _out0, _out1, _out2, _out3
>> * Example : LASX_TRANSPOSE4x4_D
>> @@ -1395,24 +1452,26 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * _out3 : 4,4,4,4
>> * =============================================================================
>> */
>> -#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> - __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
>> - _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
>> - _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
>> - _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
>> - _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
>> - _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
>> - _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
>> - _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
>> - _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
>> -}
>> +#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
>> + _out3) \
>> + { \
>> + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
>> + _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
>> + _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
>> + _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
>> + _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
>> + _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
>> + _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
>> + _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
>> + _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
>> + }
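
Sketch (mine): as with the LSX 4x4 word transpose, the inputs are buffered in
temporaries before any output is written, so an in-place 4x4 double-word
transpose is fine.

    static inline void transpose4x4_d(__m256i *r0, __m256i *r1,
                                      __m256i *r2, __m256i *r3)
    {
        LASX_TRANSPOSE4x4_D(*r0, *r1, *r2, *r3, *r0, *r1, *r2, *r3);
    }
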
>> /*
>> * =============================================================================
>> * Description : Transpose 8x8 block with word elements in vectors
>> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
>> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
>> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
>> + * _out7
>> * Example : LASX_TRANSPOSE8x8_W
>> * _in0 : 1,2,3,4,5,6,7,8
>> * _in1 : 2,2,3,4,5,6,7,8
>> @@ -1433,38 +1492,39 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * _out7 : 8,8,8,8,8,8,8,8
>> * =============================================================================
>> */
>> -#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
>> -{ \
>> - __m256i _s0_m, _s1_m; \
>> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> - \
>> - _s0_m = __lasx_xvilvl_w(_in2, _in0); \
>> - _s1_m = __lasx_xvilvl_w(_in3, _in1); \
>> - _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> - _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> - _s0_m = __lasx_xvilvh_w(_in2, _in0); \
>> - _s1_m = __lasx_xvilvh_w(_in3, _in1); \
>> - _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> - _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> - _s0_m = __lasx_xvilvl_w(_in6, _in4); \
>> - _s1_m = __lasx_xvilvl_w(_in7, _in5); \
>> - _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> - _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> - _s0_m = __lasx_xvilvh_w(_in6, _in4); \
>> - _s1_m = __lasx_xvilvh_w(_in7, _in5); \
>> - _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> - _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> - _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
>> - _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
>> - _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
>> - _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
>> - _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
>> - _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
>> - _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
>> - _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
>> -}
>> +#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + __m256i _s0_m, _s1_m; \
>> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> + \
>> + _s0_m = __lasx_xvilvl_w(_in2, _in0); \
>> + _s1_m = __lasx_xvilvl_w(_in3, _in1); \
>> + _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> + _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> + _s0_m = __lasx_xvilvh_w(_in2, _in0); \
>> + _s1_m = __lasx_xvilvh_w(_in3, _in1); \
>> + _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> + _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> + _s0_m = __lasx_xvilvl_w(_in6, _in4); \
>> + _s1_m = __lasx_xvilvl_w(_in7, _in5); \
>> + _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> + _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> + _s0_m = __lasx_xvilvh_w(_in6, _in4); \
>> + _s1_m = __lasx_xvilvh_w(_in7, _in5); \
>> + _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> + _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> + _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
>> + _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
>> + _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
>> + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
>> + _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
>> + _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
>> + _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
>> + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
>> + }
>> /*
>> * =============================================================================
>> @@ -1472,52 +1532,54 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
>> * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
>> * (input 16x8 byte block)
>> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
>> - * (output 8x16 byte block)
>> - * Details : The rows of the matrix become columns, and the columns become rows.
>> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
>> + * _out7 (output 8x16 byte block)
>> + * Details : The rows of the matrix become columns, and the columns become
>> + * rows.
>> * Example : See LASX_TRANSPOSE16x8_H
>> * =============================================================================
>> */
>> -#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
>> -{ \
>> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> - \
>> - _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
>> - _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
>> - _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
>> - _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
>> - _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
>> - _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
>> - _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
>> - _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
>> - _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
>> - _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
>> - _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
>> - _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
>> - _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
>> - _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
>> - _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
>> - _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
>> - _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
>> - _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
>> - _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
>> - _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
>> - _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
>> - _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
>> - _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
>> - _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
>> - _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
>> - _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
>> - _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
>> - _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
>> - _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
>> - _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
>> - _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
>> - _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
>> -}
>> +#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
>> + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
>> + _out6, _out7) \
>> + { \
>> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> + \
>> + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
>> + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
>> + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
>> + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
>> + _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
>> + _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
>> + _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
>> + _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
>> + _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
>> + _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
>> + _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
>> + _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
>> + _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
>> + _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
>> + _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
>> + _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
>> + _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
>> + _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
>> + _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
>> + _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
>> + _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
>> + _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
>> + _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
>> + _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
>> + _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
>> + _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
>> + _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
>> + _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
>> + _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
>> + _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
>> + _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
>> + _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
>> + }
>> /*
>> * =============================================================================
>> @@ -1525,9 +1587,10 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
>> * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
>> * (input 16x8 byte block)
>> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
>> - * (output 8x16 byte block)
>> - * Details : The rows of the matrix become columns, and the columns become rows.
>> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
>> + * _out7 (output 8x16 byte block)
>> + * Details : The rows of the matrix become columns, and the columns become
>> + * rows.
>> * Example : LASX_TRANSPOSE16x8_H
>> * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
>> * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
>> @@ -1556,72 +1619,73 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
>> * =============================================================================
>> */
>> -#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
>> - { \
>> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> - __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
>> - \
>> - _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
>> - _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
>> - _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
>> - _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
>> - _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
>> - _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
>> - _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
>> - _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
>> - _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
>> - _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
>> - _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
>> - _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
>> - _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
>> - _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
>> - _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
>> - _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
>> - _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
>> - _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
>> - _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
>> - _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
>> - _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
>> - _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
>> - _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
>> - _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
>> - _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
>> - _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
>> - _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
>> - _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
>> - \
>> - _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
>> - _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
>> - _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
>> - _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
>> - _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
>> - _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
>> - _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
>> - _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
>> - _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
>> - _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
>> - _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
>> - _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
>> - _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
>> - _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
>> - _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
>> - _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
>> - _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
>> - _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
>> - _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
>> - _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
>> - _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
>> - _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
>> - _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
>> - _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
>> - _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
>> - _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
>> - _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
>> - _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
>> -}
>> +#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
>> + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
>> + _out6, _out7) \
>> + { \
>> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> + __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
>> + \
>> + _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
>> + _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
>> + _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
>> + _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
>> + _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
>> + _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
>> + _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
>> + _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
>> + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
>> + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
>> + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
>> + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
>> + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
>> + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
>> + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
>> + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
>> + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
>> + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
>> + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
>> + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
>> + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
>> + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
>> + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
>> + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
>> + _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
>> + _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
>> + _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
>> + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
>> + \
>> + _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
>> + _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
>> + _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
>> + _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
>> + _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
>> + _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
>> + _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
>> + _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
>> + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
>> + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
>> + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
>> + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
>> + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
>> + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
>> + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
>> + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
>> + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
>> + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
>> + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
>> + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
>> + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
>> + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
>> + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
>> + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
>> + _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
>> + _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
>> + _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
>> + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
>> + }
>> /*
>> * =============================================================================
>> @@ -1629,61 +1693,65 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * Arguments : Inputs - _in0, _in1, _in2, _in3
>> * Outputs - _out0, _out1, _out2, _out3
>> * Return Type - signed halfword
>> - * Details : The rows of the matrix become columns, and the columns become rows.
>> + * Details : The rows of the matrix become columns, and the columns become
>> + * rows.
>> * Example : See LASX_TRANSPOSE8x8_H
>> * =============================================================================
>> */
>> -#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> - __m256i _s0_m, _s1_m; \
>> - \
>> - _s0_m = __lasx_xvilvl_h(_in1, _in0); \
>> - _s1_m = __lasx_xvilvl_h(_in3, _in2); \
>> - _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> - _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> - _out1 = __lasx_xvilvh_d(_out0, _out0); \
>> - _out3 = __lasx_xvilvh_d(_out2, _out2); \
>> -}
>> +#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
>> + _out3) \
>> + { \
>> + __m256i _s0_m, _s1_m; \
>> + \
>> + _s0_m = __lasx_xvilvl_h(_in1, _in0); \
>> + _s1_m = __lasx_xvilvl_h(_in3, _in2); \
>> + _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
>> + _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
>> + _out1 = __lasx_xvilvh_d(_out0, _out0); \
>> + _out3 = __lasx_xvilvh_d(_out2, _out2); \
>> + }
>> /*
>> * =============================================================================
>> * Description : Transpose input 8x8 byte block
>> * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
>> * (input 8x8 byte block)
>> - * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
>> - * (output 8x8 byte block)
>> + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
>> + * _out7 (output 8x8 byte block)
>> * Example : See LASX_TRANSPOSE8x8_H
>> * =============================================================================
>> */
>> -#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
>> - _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
>> -{ \
>> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> - _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
>> - _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
>> - _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
>> - _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
>> - _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
>> - _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
>> - _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
>> - _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
>> - _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
>> - _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
>> - _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
>> - _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
>> - _out1 = __lasx_xvbsrl_v(_out0, 8); \
>> - _out3 = __lasx_xvbsrl_v(_out2, 8); \
>> - _out5 = __lasx_xvbsrl_v(_out4, 8); \
>> - _out7 = __lasx_xvbsrl_v(_out6, 8); \
>> -}
>> +#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
>> + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
>> + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
>> + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
>> + _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
>> + _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
>> + _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
>> + _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
>> + _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
>> + _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
>> + _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
>> + _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
>> + _out1 = __lasx_xvbsrl_v(_out0, 8); \
>> + _out3 = __lasx_xvbsrl_v(_out2, 8); \
>> + _out5 = __lasx_xvbsrl_v(_out4, 8); \
>> + _out7 = __lasx_xvbsrl_v(_out6, 8); \
>> + }
>> /*
>> * =============================================================================
>> * Description : Transpose 8x8 block with halfword elements in vectors.
>> * Arguments : Inputs - _in0, _in1, ~
>> * Outputs - _out0, _out1, ~
>> - * Details : The rows of the matrix become columns, and the columns become rows.
>> + * Details : The rows of the matrix become columns, and the columns become
>> + * rows.
>> * Example : LASX_TRANSPOSE8x8_H
>> * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
>> * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
>> @@ -1704,40 +1772,41 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
>> * =============================================================================
>> */
>> -#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
>> - _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
>> -{ \
>> - __m256i _s0_m, _s1_m; \
>> - __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> - __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> - \
>> - _s0_m = __lasx_xvilvl_h(_in6, _in4); \
>> - _s1_m = __lasx_xvilvl_h(_in7, _in5); \
>> - _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
>> - _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
>> - _s0_m = __lasx_xvilvh_h(_in6, _in4); \
>> - _s1_m = __lasx_xvilvh_h(_in7, _in5); \
>> - _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
>> - _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
>> - \
>> - _s0_m = __lasx_xvilvl_h(_in2, _in0); \
>> - _s1_m = __lasx_xvilvl_h(_in3, _in1); \
>> - _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
>> - _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
>> - _s0_m = __lasx_xvilvh_h(_in2, _in0); \
>> - _s1_m = __lasx_xvilvh_h(_in3, _in1); \
>> - _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
>> - _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
>> - \
>> - _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
>> - _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
>> - _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
>> - _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
>> - _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
>> - _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
>> - _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
>> - _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
>> -}
>> +#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + __m256i _s0_m, _s1_m; \
>> + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
>> + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
>> + \
>> + _s0_m = __lasx_xvilvl_h(_in6, _in4); \
>> + _s1_m = __lasx_xvilvl_h(_in7, _in5); \
>> + _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
>> + _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
>> + _s0_m = __lasx_xvilvh_h(_in6, _in4); \
>> + _s1_m = __lasx_xvilvh_h(_in7, _in5); \
>> + _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
>> + _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
>> + \
>> + _s0_m = __lasx_xvilvl_h(_in2, _in0); \
>> + _s1_m = __lasx_xvilvl_h(_in3, _in1); \
>> + _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
>> + _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
>> + _s0_m = __lasx_xvilvh_h(_in2, _in0); \
>> + _s1_m = __lasx_xvilvh_h(_in3, _in1); \
>> + _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
>> + _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
>> + \
>> + _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
>> + _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
>> + _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
>> + _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
>> + _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
>> + _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
>> + _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
>> + _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
>> + }
>> /*
>> * =============================================================================
>> @@ -1752,34 +1821,34 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * _out3 = _in0 - _in3;
>> * =============================================================================
>> */
>> -#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> - _out0 = __lasx_xvadd_b(_in0, _in3); \
>> - _out1 = __lasx_xvadd_b(_in1, _in2); \
>> - _out2 = __lasx_xvsub_b(_in1, _in2); \
>> - _out3 = __lasx_xvsub_b(_in0, _in3); \
>> -}
>> -#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> - _out0 = __lasx_xvadd_h(_in0, _in3); \
>> - _out1 = __lasx_xvadd_h(_in1, _in2); \
>> - _out2 = __lasx_xvsub_h(_in1, _in2); \
>> - _out3 = __lasx_xvsub_h(_in0, _in3); \
>> -}
>> -#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> - _out0 = __lasx_xvadd_w(_in0, _in3); \
>> - _out1 = __lasx_xvadd_w(_in1, _in2); \
>> - _out2 = __lasx_xvsub_w(_in1, _in2); \
>> - _out3 = __lasx_xvsub_w(_in0, _in3); \
>> -}
>> -#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> -{ \
>> - _out0 = __lasx_xvadd_d(_in0, _in3); \
>> - _out1 = __lasx_xvadd_d(_in1, _in2); \
>> - _out2 = __lasx_xvsub_d(_in1, _in2); \
>> - _out3 = __lasx_xvsub_d(_in0, _in3); \
>> -}
>> +#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> + { \
>> + _out0 = __lasx_xvadd_b(_in0, _in3); \
>> + _out1 = __lasx_xvadd_b(_in1, _in2); \
>> + _out2 = __lasx_xvsub_b(_in1, _in2); \
>> + _out3 = __lasx_xvsub_b(_in0, _in3); \
>> + }
>> +#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> + { \
>> + _out0 = __lasx_xvadd_h(_in0, _in3); \
>> + _out1 = __lasx_xvadd_h(_in1, _in2); \
>> + _out2 = __lasx_xvsub_h(_in1, _in2); \
>> + _out3 = __lasx_xvsub_h(_in0, _in3); \
>> + }
>> +#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> + { \
>> + _out0 = __lasx_xvadd_w(_in0, _in3); \
>> + _out1 = __lasx_xvadd_w(_in1, _in2); \
>> + _out2 = __lasx_xvsub_w(_in1, _in2); \
>> + _out3 = __lasx_xvsub_w(_in0, _in3); \
>> + }
>> +#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
>> + { \
>> + _out0 = __lasx_xvadd_d(_in0, _in3); \
>> + _out1 = __lasx_xvadd_d(_in1, _in2); \
>> + _out2 = __lasx_xvsub_d(_in1, _in2); \
>> + _out3 = __lasx_xvsub_d(_in0, _in3); \
>> + }
>> /*
>> * =============================================================================
>> @@ -1798,59 +1867,63 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * _out7 = _in0 - _in7;
>> * =============================================================================
>> */
>> -#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - _out0 = __lasx_xvadd_b(_in0, _in7); \
>> - _out1 = __lasx_xvadd_b(_in1, _in6); \
>> - _out2 = __lasx_xvadd_b(_in2, _in5); \
>> - _out3 = __lasx_xvadd_b(_in3, _in4); \
>> - _out4 = __lasx_xvsub_b(_in3, _in4); \
>> - _out5 = __lasx_xvsub_b(_in2, _in5); \
>> - _out6 = __lasx_xvsub_b(_in1, _in6); \
>> - _out7 = __lasx_xvsub_b(_in0, _in7); \
>> -}
>> -
>> -#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - _out0 = __lasx_xvadd_h(_in0, _in7); \
>> - _out1 = __lasx_xvadd_h(_in1, _in6); \
>> - _out2 = __lasx_xvadd_h(_in2, _in5); \
>> - _out3 = __lasx_xvadd_h(_in3, _in4); \
>> - _out4 = __lasx_xvsub_h(_in3, _in4); \
>> - _out5 = __lasx_xvsub_h(_in2, _in5); \
>> - _out6 = __lasx_xvsub_h(_in1, _in6); \
>> - _out7 = __lasx_xvsub_h(_in0, _in7); \
>> -}
>> -
>> -#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - _out0 = __lasx_xvadd_w(_in0, _in7); \
>> - _out1 = __lasx_xvadd_w(_in1, _in6); \
>> - _out2 = __lasx_xvadd_w(_in2, _in5); \
>> - _out3 = __lasx_xvadd_w(_in3, _in4); \
>> - _out4 = __lasx_xvsub_w(_in3, _in4); \
>> - _out5 = __lasx_xvsub_w(_in2, _in5); \
>> - _out6 = __lasx_xvsub_w(_in1, _in6); \
>> - _out7 = __lasx_xvsub_w(_in0, _in7); \
>> -}
>> -
>> -#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
>> -{ \
>> - _out0 = __lasx_xvadd_d(_in0, _in7); \
>> - _out1 = __lasx_xvadd_d(_in1, _in6); \
>> - _out2 = __lasx_xvadd_d(_in2, _in5); \
>> - _out3 = __lasx_xvadd_d(_in3, _in4); \
>> - _out4 = __lasx_xvsub_d(_in3, _in4); \
>> - _out5 = __lasx_xvsub_d(_in2, _in5); \
>> - _out6 = __lasx_xvsub_d(_in1, _in6); \
>> - _out7 = __lasx_xvsub_d(_in0, _in7); \
>> -}
>> -
>> -#endif //LASX
>> +#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + _out0 = __lasx_xvadd_b(_in0, _in7); \
>> + _out1 = __lasx_xvadd_b(_in1, _in6); \
>> + _out2 = __lasx_xvadd_b(_in2, _in5); \
>> + _out3 = __lasx_xvadd_b(_in3, _in4); \
>> + _out4 = __lasx_xvsub_b(_in3, _in4); \
>> + _out5 = __lasx_xvsub_b(_in2, _in5); \
>> + _out6 = __lasx_xvsub_b(_in1, _in6); \
>> + _out7 = __lasx_xvsub_b(_in0, _in7); \
>> + }
>> +
>> +#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + _out0 = __lasx_xvadd_h(_in0, _in7); \
>> + _out1 = __lasx_xvadd_h(_in1, _in6); \
>> + _out2 = __lasx_xvadd_h(_in2, _in5); \
>> + _out3 = __lasx_xvadd_h(_in3, _in4); \
>> + _out4 = __lasx_xvsub_h(_in3, _in4); \
>> + _out5 = __lasx_xvsub_h(_in2, _in5); \
>> + _out6 = __lasx_xvsub_h(_in1, _in6); \
>> + _out7 = __lasx_xvsub_h(_in0, _in7); \
>> + }
>> +
>> +#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + _out0 = __lasx_xvadd_w(_in0, _in7); \
>> + _out1 = __lasx_xvadd_w(_in1, _in6); \
>> + _out2 = __lasx_xvadd_w(_in2, _in5); \
>> + _out3 = __lasx_xvadd_w(_in3, _in4); \
>> + _out4 = __lasx_xvsub_w(_in3, _in4); \
>> + _out5 = __lasx_xvsub_w(_in2, _in5); \
>> + _out6 = __lasx_xvsub_w(_in1, _in6); \
>> + _out7 = __lasx_xvsub_w(_in0, _in7); \
>> + }
>> +
>> +#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
>> + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
>> + _out7) \
>> + { \
>> + _out0 = __lasx_xvadd_d(_in0, _in7); \
>> + _out1 = __lasx_xvadd_d(_in1, _in6); \
>> + _out2 = __lasx_xvadd_d(_in2, _in5); \
>> + _out3 = __lasx_xvadd_d(_in3, _in4); \
>> + _out4 = __lasx_xvsub_d(_in3, _in4); \
>> + _out5 = __lasx_xvsub_d(_in2, _in5); \
>> + _out6 = __lasx_xvsub_d(_in1, _in6); \
>> + _out7 = __lasx_xvsub_d(_in0, _in7); \
>> + }
>> +
>> +#endif // LASX
>> /*
>> * =============================================================================
>> @@ -1863,15 +1936,13 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
>> * VP:1,2,3,4,
>> * =============================================================================
>> */
>> -#define VECT_PRINT(RTYPE, element_num, in0, enter) \
>> -{ \
>> - RTYPE _tmp0 = (RTYPE)in0; \
>> - int _i = 0; \
>> - if (enter) \
>> - printf("\nVP:"); \
>> - for(_i = 0; _i < element_num; _i++) \
>> - printf("%d,",_tmp0[_i]); \
>> -}
>> +#define VECT_PRINT(RTYPE, element_num, in0, enter) \
>> + { \
>> + RTYPE _tmp0 = (RTYPE)in0; \
>> + int _i = 0; \
>> + if (enter) printf("\nVP:"); \
>> + for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
>> + }
>> #endif /* LOONGSON_INTRINSICS_H */
>> #endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
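For readers who have not used these helpers, here is a minimal sketch of how the
LASX_BUTTERFLY_4_H macro defined in the hunk above might be invoked. It is
illustrative only: the function and buffer names are made up, and it assumes a
LoongArch toolchain with LASX support (__loongarch_asx) so that the LASX part of
loongson_intrinsics.h is compiled in.

#include <stdint.h>
#include "libavutil/loongarch/loongson_intrinsics.h"

/* Illustrative only: 4-point add/sub butterfly on vectors of halfwords.
 * src and dst are assumed to hold at least 64 int16_t values each. */
static void butterfly4_sketch(int16_t *src, int16_t *dst)
{
    __m256i in0, in1, in2, in3, out0, out1, out2, out3;

    in0 = __lasx_xvld(src, 0);    /* offsets are in bytes: 16 halfwords per vector */
    in1 = __lasx_xvld(src, 32);
    in2 = __lasx_xvld(src, 64);
    in3 = __lasx_xvld(src, 96);

    /* out0 = in0 + in3, out1 = in1 + in2, out2 = in1 - in2, out3 = in0 - in3 */
    LASX_BUTTERFLY_4_H(in0, in1, in2, in3, out0, out1, out2, out3);

    __lasx_xvst(out0, dst, 0);
    __lasx_xvst(out1, dst, 32);
    __lasx_xvst(out2, dst, 64);
    __lasx_xvst(out3, dst, 96);
}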
* Re: [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
2022-02-21 11:11 ` Hao Chen
@ 2022-02-21 15:00 ` Michael Niedermayer
2022-02-25 3:20 ` Hao Chen
2022-02-28 8:19 ` 殷时友
1 sibling, 2 replies; 15+ messages in thread
From: Michael Niedermayer @ 2022-02-21 15:00 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, Feb 17, 2022 at 07:11:47PM +0800, Hao Chen wrote:
> The loongson_intrinsics.h file is updated from v1.0.3 version
> to v1.1.0. Some spelling mistakes are fixed and new functions are added.
>
> Signed-off-by: Hao Chen <chenhao@loongson.cn>
> ---
> libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
> 1 file changed, 884 insertions(+), 813 deletions(-)
>
> diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
> index 6e0439f829..eb256863c8 100644
> --- a/libavutil/loongarch/loongson_intrinsics.h
> +++ b/libavutil/loongarch/loongson_intrinsics.h
> @@ -33,7 +33,7 @@
> * Xiwei Gu <guxiwei-hf@loongson.cn>
> * Lu Wang <wanglu@loongson.cn>
> *
> - * This file is a header file for loongarch builtin extention.
> + * This file is a header file for loongarch builtin extension.
> *
> */
>
> @@ -42,50 +42,50 @@
>
> /**
> * MAJOR version: Macro usage changes.
> - * MINOR version: Add new functions, or bug fix.
> + * MINOR version: Add new functions, or bug fixes.
> * MICRO version: Comment changes or implementation changes.
> */
> #define LSOM_VERSION_MAJOR 1
> -#define LSOM_VERSION_MINOR 0
> -#define LSOM_VERSION_MICRO 3
> +#define LSOM_VERSION_MINOR 1
> +#define LSOM_VERSION_MICRO 0
>
> #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
> -{ \
> - _OUT0 = _INS(_IN0); \
> - _OUT1 = _INS(_IN1); \
> -}
> + { \
> + _OUT0 = _INS(_IN0); \
> + _OUT1 = _INS(_IN1); \
> + }
>
> #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
> -{ \
> - _OUT0 = _INS(_IN0, _IN1); \
> - _OUT1 = _INS(_IN2, _IN3); \
> -}
> + { \
> + _OUT0 = _INS(_IN0, _IN1); \
> + _OUT1 = _INS(_IN2, _IN3); \
> + }
These style changes differ from the recommendation in doc/developer.texi;
is that intentional?
thx
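For context, the DUP2_*/DUP4_* helpers touched here simply fan a single intrinsic
out over several argument/output pairs. A small illustrative fragment (the
function, vector and pointer names are made up, and an LASX-capable toolchain is
assumed), with the expansion that follows mechanically from the definition quoted
above shown in comments:

#include <stdint.h>
#include "libavutil/loongarch/loongson_intrinsics.h"

static void dup2_sketch(int16_t *src)
{
    __m256i in0, in1, in2, in3, t0, t1;

    /* expands to: in0 = __lasx_xvld(src, 0);  in1 = __lasx_xvld(src, 32); */
    DUP2_ARG2(__lasx_xvld, src, 0, src, 32, in0, in1);
    /* expands to: in2 = __lasx_xvld(src, 64); in3 = __lasx_xvld(src, 96); */
    DUP2_ARG2(__lasx_xvld, src, 64, src, 96, in2, in3);

    /* expands to: t0 = __lasx_xvilvl_h(in1, in0); t1 = __lasx_xvilvl_h(in3, in2); */
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in3, in2, t0, t1);

    (void)t0;
    (void)t1;    /* results unused in this sketch */
}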
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Answering whether a program halts or runs forever is
On a Turing machine, in general impossible (Turing's halting problem).
On any real computer, always possible as a real computer has a finite number
of states N, and will either halt in less than N cycles or never halt.
* Re: [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
2022-02-21 15:00 ` Michael Niedermayer
@ 2022-02-25 3:20 ` Hao Chen
2022-02-28 8:19 ` 殷时友
1 sibling, 0 replies; 15+ messages in thread
From: Hao Chen @ 2022-02-25 3:20 UTC (permalink / raw)
To: ffmpeg-devel
Thanks for your advice. This file is used by many projects, including
Google's.
To make it easier to merge into the Google community's projects, the file
adopts the Google coding style.
It would be troublesome for us to modify and maintain it separately if the
community feels strongly about this coding style.
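
Concretely, the disagreement is only about how the macro bodies are laid out.
A made-up macro written both ways, mirroring the '-' and '+' lines of the hunks
quoted in this thread (EXAMPLE_COPY/EXAMPLE_COPY2 are hypothetical names; only
the formatting differs):

/* Layout removed by the patch ('-' lines): opening brace in column 0,
 * body indented by 4 spaces, closer to doc/developer.texi. */
#define EXAMPLE_COPY(_in, _out) \
{                               \
    _out = _in;                 \
}

/* Layout added by the patch ('+' lines): Google/clang-format layout,
 * 2-space indent with the braces indented along with the body. */
#define EXAMPLE_COPY2(_in, _out) \
  {                              \
    _out = _in;                  \
  }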
On 2022/2/21 at 11:00 PM, Michael Niedermayer wrote:
> On Thu, Feb 17, 2022 at 07:11:47PM +0800, Hao Chen wrote:
>> The loongson_intrinsics.h file is updated from v1.0.3 version
>> to v1.1.0. Some spelling mistakes are fixed and new functions are added.
>>
>> Signed-off-by: Hao Chen <chenhao@loongson.cn>
>> ---
>> libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
>> 1 file changed, 884 insertions(+), 813 deletions(-)
>>
>> diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
>> index 6e0439f829..eb256863c8 100644
>> --- a/libavutil/loongarch/loongson_intrinsics.h
>> +++ b/libavutil/loongarch/loongson_intrinsics.h
>> @@ -33,7 +33,7 @@
>> * Xiwei Gu <guxiwei-hf@loongson.cn>
>> * Lu Wang <wanglu@loongson.cn>
>> *
>> - * This file is a header file for loongarch builtin extention.
>> + * This file is a header file for loongarch builtin extension.
>> *
>> */
>>
>> @@ -42,50 +42,50 @@
>>
>> /**
>> * MAJOR version: Macro usage changes.
>> - * MINOR version: Add new functions, or bug fix.
>> + * MINOR version: Add new functions, or bug fixes.
>> * MICRO version: Comment changes or implementation changes.
>> */
>> #define LSOM_VERSION_MAJOR 1
>> -#define LSOM_VERSION_MINOR 0
>> -#define LSOM_VERSION_MICRO 3
>> +#define LSOM_VERSION_MINOR 1
>> +#define LSOM_VERSION_MICRO 0
>>
>> #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
>> -{ \
>> - _OUT0 = _INS(_IN0); \
>> - _OUT1 = _INS(_IN1); \
>> -}
>> + { \
>> + _OUT0 = _INS(_IN0); \
>> + _OUT1 = _INS(_IN1); \
>> + }
>>
>> #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
>> -{ \
>> - _OUT0 = _INS(_IN0, _IN1); \
>> - _OUT1 = _INS(_IN2, _IN3); \
>> -}
>> + { \
>> + _OUT0 = _INS(_IN0, _IN1); \
>> + _OUT1 = _INS(_IN2, _IN3); \
>> + }
> These style changes differ from the recommendation in doc/developer.texi;
> is that intentional?
>
> thx
>
> [...]
>
* Re: [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
2022-02-21 15:00 ` Michael Niedermayer
2022-02-25 3:20 ` Hao Chen
@ 2022-02-28 8:19 ` 殷时友
2022-02-28 14:17 ` Michael Niedermayer
1 sibling, 1 reply; 15+ messages in thread
From: 殷时友 @ 2022-02-28 8:19 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Michael Niedermayer
> On Feb 21, 2022, at 11:00 PM, Michael Niedermayer <michael@niedermayer.cc> wrote:
>
> On Thu, Feb 17, 2022 at 07:11:47PM +0800, Hao Chen wrote:
>> The loongson_intrinsics.h file is updated from v1.0.3 version
>> to v1.1.0. Some spelling mistakes are fixed and new functions are added.
>>
>> Signed-off-by: Hao Chen <chenhao@loongson.cn>
>> ---
>> libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
>> 1 file changed, 884 insertions(+), 813 deletions(-)
>>
>> diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
>> index 6e0439f829..eb256863c8 100644
>> --- a/libavutil/loongarch/loongson_intrinsics.h
>> +++ b/libavutil/loongarch/loongson_intrinsics.h
>> @@ -33,7 +33,7 @@
>> * Xiwei Gu <guxiwei-hf@loongson.cn>
>> * Lu Wang <wanglu@loongson.cn>
>> *
>> - * This file is a header file for loongarch builtin extention.
>> + * This file is a header file for loongarch builtin extension.
>> *
>> */
>>
>> @@ -42,50 +42,50 @@
>>
>> /**
>> * MAJOR version: Macro usage changes.
>> - * MINOR version: Add new functions, or bug fix.
>> + * MINOR version: Add new functions, or bug fixes.
>> * MICRO version: Comment changes or implementation changes.
>> */
>> #define LSOM_VERSION_MAJOR 1
>> -#define LSOM_VERSION_MINOR 0
>> -#define LSOM_VERSION_MICRO 3
>> +#define LSOM_VERSION_MINOR 1
>> +#define LSOM_VERSION_MICRO 0
>>
>> #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
>> -{ \
>> - _OUT0 = _INS(_IN0); \
>> - _OUT1 = _INS(_IN1); \
>> -}
>> + { \
>> + _OUT0 = _INS(_IN0); \
>> + _OUT1 = _INS(_IN1); \
>> + }
>>
>> #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
>> -{ \
>> - _OUT0 = _INS(_IN0, _IN1); \
>> - _OUT1 = _INS(_IN2, _IN3); \
>> -}
>> + { \
>> + _OUT0 = _INS(_IN0, _IN1); \
>> + _OUT1 = _INS(_IN2, _IN3); \
>> + }
>
> These style changes differ from the recommendation in doc/developer.texi;
> is that intentional?
>
> thx
Can we keep this style here? It is helpful for us in maintaining this header file,
which is also used in some other projects. Thanks very much.
* Re: [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
2022-02-28 8:19 ` 殷时友
@ 2022-02-28 14:17 ` Michael Niedermayer
2022-03-01 12:31 ` 殷时友
0 siblings, 1 reply; 15+ messages in thread
From: Michael Niedermayer @ 2022-02-28 14:17 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Mon, Feb 28, 2022 at 04:19:55PM +0800, 殷时友 wrote:
>
>
> > On Feb 21, 2022, at 11:00 PM, Michael Niedermayer <michael@niedermayer.cc> wrote:
> >
> > On Thu, Feb 17, 2022 at 07:11:47PM +0800, Hao Chen wrote:
> >> The loongson_intrinsics.h file is updated from v1.0.3 version
> >> to v1.1.0. Some spelling mistakes are fixed and new functions are added.
> >>
> >> Signed-off-by: Hao Chen <chenhao@loongson.cn>
> >> ---
> >> libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
> >> 1 file changed, 884 insertions(+), 813 deletions(-)
> >>
> >> diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
> >> index 6e0439f829..eb256863c8 100644
> >> --- a/libavutil/loongarch/loongson_intrinsics.h
> >> +++ b/libavutil/loongarch/loongson_intrinsics.h
> >> @@ -33,7 +33,7 @@
> >> * Xiwei Gu <guxiwei-hf@loongson.cn>
> >> * Lu Wang <wanglu@loongson.cn>
> >> *
> >> - * This file is a header file for loongarch builtin extention.
> >> + * This file is a header file for loongarch builtin extension.
> >> *
> >> */
> >>
> >> @@ -42,50 +42,50 @@
> >>
> >> /**
> >> * MAJOR version: Macro usage changes.
> >> - * MINOR version: Add new functions, or bug fix.
> >> + * MINOR version: Add new functions, or bug fixes.
> >> * MICRO version: Comment changes or implementation changes.
> >> */
> >> #define LSOM_VERSION_MAJOR 1
> >> -#define LSOM_VERSION_MINOR 0
> >> -#define LSOM_VERSION_MICRO 3
> >> +#define LSOM_VERSION_MINOR 1
> >> +#define LSOM_VERSION_MICRO 0
> >>
> >> #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
> >> -{ \
> >> - _OUT0 = _INS(_IN0); \
> >> - _OUT1 = _INS(_IN1); \
> >> -}
> >> + { \
> >> + _OUT0 = _INS(_IN0); \
> >> + _OUT1 = _INS(_IN1); \
> >> + }
> >>
> >> #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
> >> -{ \
> >> - _OUT0 = _INS(_IN0, _IN1); \
> >> - _OUT1 = _INS(_IN2, _IN3); \
> >> -}
> >> + { \
> >> + _OUT0 = _INS(_IN0, _IN1); \
> >> + _OUT1 = _INS(_IN2, _IN3); \
> >> + }
> >
> > These style changes differ from the recommendation in doc/developer.texi;
> > is that intentional?
> >
> > thx
>
>
> Can we keep this style here? It is helpful for us in maintaining this header file,
> which is also used in some other projects. Thanks very much.
yes, it makes sense to keep the style under these circumstances
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
In a rich man's house there is no place to spit but his face.
-- Diogenes of Sinope
* Re: [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0
2022-02-28 14:17 ` Michael Niedermayer
@ 2022-03-01 12:31 ` 殷时友
0 siblings, 0 replies; 15+ messages in thread
From: 殷时友 @ 2022-03-01 12:31 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> On Feb 28, 2022, at 10:17 PM, Michael Niedermayer <michael@niedermayer.cc> wrote:
>
> On Mon, Feb 28, 2022 at 04:19:55PM +0800, 殷时友 wrote:
>>
>>
>>> On Feb 21, 2022, at 11:00 PM, Michael Niedermayer <michael@niedermayer.cc> wrote:
>>>
>>> On Thu, Feb 17, 2022 at 07:11:47PM +0800, Hao Chen wrote:
>>>> The loongson_intrinsics.h file is updated from v1.0.3 version
>>>> to v1.1.0. Some spelling mistakes are fixed and new functions are added.
>>>>
>>>> Signed-off-by: Hao Chen <chenhao@loongson.cn>
>>>> ---
>>>> libavutil/loongarch/loongson_intrinsics.h | 1697 +++++++++++----------
>>>> 1 file changed, 884 insertions(+), 813 deletions(-)
>>>>
>>>> diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
>>>> index 6e0439f829..eb256863c8 100644
>>>> --- a/libavutil/loongarch/loongson_intrinsics.h
>>>> +++ b/libavutil/loongarch/loongson_intrinsics.h
>>>> @@ -33,7 +33,7 @@
>>>> * Xiwei Gu <guxiwei-hf@loongson.cn>
>>>> * Lu Wang <wanglu@loongson.cn>
>>>> *
>>>> - * This file is a header file for loongarch builtin extention.
>>>> + * This file is a header file for loongarch builtin extension.
>>>> *
>>>> */
>>>>
>>>> @@ -42,50 +42,50 @@
>>>>
>>>> /**
>>>> * MAJOR version: Macro usage changes.
>>>> - * MINOR version: Add new functions, or bug fix.
>>>> + * MINOR version: Add new functions, or bug fixes.
>>>> * MICRO version: Comment changes or implementation changes.
>>>> */
>>>> #define LSOM_VERSION_MAJOR 1
>>>> -#define LSOM_VERSION_MINOR 0
>>>> -#define LSOM_VERSION_MICRO 3
>>>> +#define LSOM_VERSION_MINOR 1
>>>> +#define LSOM_VERSION_MICRO 0
>>>>
>>>> #define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
>>>> -{ \
>>>> - _OUT0 = _INS(_IN0); \
>>>> - _OUT1 = _INS(_IN1); \
>>>> -}
>>>> + { \
>>>> + _OUT0 = _INS(_IN0); \
>>>> + _OUT1 = _INS(_IN1); \
>>>> + }
>>>>
>>>> #define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
>>>> -{ \
>>>> - _OUT0 = _INS(_IN0, _IN1); \
>>>> - _OUT1 = _INS(_IN2, _IN3); \
>>>> -}
>>>> + { \
>>>> + _OUT0 = _INS(_IN0, _IN1); \
>>>> + _OUT1 = _INS(_IN2, _IN3); \
>>>> + }
>>>
>>> These style changes differ from the recommendation in doc/developer.texi;
>>> is that intentional?
>>>
>>> thx
>>
>>
>> Can we keep this style here? It is helpful for us in maintaining this header file,
>> which is also used in some other projects. Thanks very much.
>
> yes, it makes sense to keep the style under these circumstances
>
Thanks, I hope this patch set can be merged.
It is the last part of the LoongArch avcodec optimization.
* Re: [FFmpeg-devel] Optimize HEVC decoding for loongarch
2022-02-17 12:08 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
@ 2022-03-01 22:53 ` Michael Niedermayer
0 siblings, 0 replies; 15+ messages in thread
From: Michael Niedermayer @ 2022-03-01 22:53 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, Feb 17, 2022 at 08:08:08PM +0800, 殷时友 wrote:
>
>
> > On Feb 17, 2022, at 7:11 PM, Hao Chen <chenhao@loongson.cn> wrote:
> >
> > ffmpeg -i 5_h265_1080p_60fps_3Mbps.mkv -f rawvideo -y /dev/null -an
> > before: 94fps
> > after : 191fps
> >
> > V2: Retrigger the patchwork.ffmpeg test.
> >
> > [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to
> > [PATCH v2 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX.
> > [PATCH v2 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf with LSX.
> > [PATCH v2 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi with LSX.
> > [PATCH v2 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w with LSX.
> >
> LGTM
will apply patchset
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Observe your enemies, for they first find out your faults. -- Antisthenes
Thread overview: 15+ messages
2022-02-17 11:11 [FFmpeg-devel] Optimize HEVC decoding for loongarch Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 1/5] avutil: [loongarch] Update loongson_intrinsics.h to v1.1.0 Hao Chen
2022-02-21 11:11 ` Hao Chen
2022-02-21 11:42 ` 殷时友
2022-02-21 15:00 ` Michael Niedermayer
2022-02-25 3:20 ` Hao Chen
2022-02-28 8:19 ` 殷时友
2022-02-28 14:17 ` Michael Niedermayer
2022-03-01 12:31 ` 殷时友
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 2/5] avcodec: [loongarch] Optimize Hevcdsp with LSX Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 3/5] avcodec: [loongarch] Optimize Hevc_idct/lpf " Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 4/5] avcodec: [loongarch] Optimize Hevc_mc_bi " Hao Chen
2022-02-17 11:11 ` [FFmpeg-devel] [PATCH v2 5/5] avcodec: [loongarch] Optimize Hevc_mc_uni/w " Hao Chen
2022-02-17 12:08 ` [FFmpeg-devel] Optimize HEVC decoding for loongarch 殷时友
2022-03-01 22:53 ` Michael Niedermayer