* [FFmpeg-devel] [PATCH v5 1/7] avcodec/la: add LSX optimization for h264 idct.
From: Hao Chen @ 2023-05-25 7:24 UTC
To: ffmpeg-devel; +Cc: Shiyou Yin
From: Shiyou Yin <yinshiyou-hf@loongson.cn>
loongson_asm.S is a LoongArch asm optimization helper.
Add functions:
ff_h264_idct_add_8_lsx
ff_h264_idct8_add_8_lsx
ff_h264_idct_dc_add_8_lsx
ff_h264_idct8_dc_add_8_lsx
ff_h264_idct_add16_8_lsx
ff_h264_idct8_add4_8_lsx
ff_h264_idct_add8_8_lsx
ff_h264_idct_add8_422_8_lsx
ff_h264_idct_add16_intra_8_lsx
ff_h264_luma_dc_dequant_idct_8_lsx
Replaced functions (LSX is sufficient for these functions):
ff_h264_idct_add_lasx
ff_h264_idct4x4_addblk_dc_lasx
ff_h264_idct_add16_lasx
ff_h264_idct8_add4_lasx
ff_h264_idct_add8_lasx
ff_h264_idct_add8_422_lasx
ff_h264_idct_add16_intra_lasx
ff_h264_deq_idct_luma_dc_lasx
Renamed functions:
ff_h264_idct8_addblk_lasx ==> ff_h264_idct8_add_8_lasx
ff_h264_idct8_dc_addblk_lasx ==> ff_h264_idct8_dc_add_8_lasx
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 155fps
after: 161fps
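
For reference, most of the new LSX routines vectorize the standard H.264 4x4
inverse transform plus reconstruction. A scalar sketch of the 8-bit idct+add
(modelled on the C template, not the exact FFmpeg source; names such as
idct_add_4x4_sketch and clip_u8 are illustrative only) is:

#include <stdint.h>
#include <string.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

static void idct_add_4x4_sketch(uint8_t *dst, int16_t *block, int stride)
{
    int i;

    block[0] += 1 << 5;                  /* rounding bias, final shift is >> 6 */

    for (i = 0; i < 4; i++) {            /* first 1-D pass (columns) */
        int z0 =  block[i +  0]       +  block[i +  8];
        int z1 =  block[i +  0]       -  block[i +  8];
        int z2 = (block[i +  4] >> 1) -  block[i + 12];
        int z3 =  block[i +  4]       + (block[i + 12] >> 1);

        block[i +  0] = z0 + z3;
        block[i +  4] = z1 + z2;
        block[i +  8] = z1 - z2;
        block[i + 12] = z0 - z3;
    }

    for (i = 0; i < 4; i++) {            /* second 1-D pass + add to dst */
        int z0 =  block[4 * i + 0]       +  block[4 * i + 2];
        int z1 =  block[4 * i + 0]       -  block[4 * i + 2];
        int z2 = (block[4 * i + 1] >> 1) -  block[4 * i + 3];
        int z3 =  block[4 * i + 1]       + (block[4 * i + 3] >> 1);

        dst[i + 0 * stride] = clip_u8(dst[i + 0 * stride] + ((z0 + z3) >> 6));
        dst[i + 1 * stride] = clip_u8(dst[i + 1 * stride] + ((z1 + z2) >> 6));
        dst[i + 2 * stride] = clip_u8(dst[i + 2 * stride] + ((z1 - z2) >> 6));
        dst[i + 3 * stride] = clip_u8(dst[i + 3 * stride] + ((z0 - z3) >> 6));
    }

    memset(block, 0, 16 * sizeof(*block));   /* the asm clears the block too */
}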
---
libavcodec/loongarch/Makefile | 3 +-
libavcodec/loongarch/h264_deblock_lasx.c | 2 +-
libavcodec/loongarch/h264dsp_init_loongarch.c | 39 +-
libavcodec/loongarch/h264dsp_lasx.c | 2 +-
.../{h264dsp_lasx.h => h264dsp_loongarch.h} | 60 +-
libavcodec/loongarch/h264idct.S | 658 ++++++++++++
libavcodec/loongarch/h264idct_lasx.c | 498 ---------
libavcodec/loongarch/h264idct_loongarch.c | 184 ++++
libavcodec/loongarch/loongson_asm.S | 945 ++++++++++++++++++
9 files changed, 1848 insertions(+), 543 deletions(-)
rename libavcodec/loongarch/{h264dsp_lasx.h => h264dsp_loongarch.h} (68%)
create mode 100644 libavcodec/loongarch/h264idct.S
delete mode 100644 libavcodec/loongarch/h264idct_lasx.c
create mode 100644 libavcodec/loongarch/h264idct_loongarch.c
create mode 100644 libavcodec/loongarch/loongson_asm.S
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index c1b5de5c44..34ebbbe133 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -12,7 +12,6 @@ OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
- loongarch/h264idct_lasx.o \
loongarch/h264_deblock_lasx.o
LASX-OBJS-$(CONFIG_H264PRED) += loongarch/h264_intrapred_lasx.o
LASX-OBJS-$(CONFIG_VC1_DECODER) += loongarch/vc1dsp_lasx.o
@@ -31,3 +30,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_mc_bi_lsx.o \
loongarch/hevc_mc_uni_lsx.o \
loongarch/hevc_mc_uniw_lsx.o
+LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
+ loongarch/h264idct_loongarch.o
diff --git a/libavcodec/loongarch/h264_deblock_lasx.c b/libavcodec/loongarch/h264_deblock_lasx.c
index c89bea9a84..eead931dcf 100644
--- a/libavcodec/loongarch/h264_deblock_lasx.c
+++ b/libavcodec/loongarch/h264_deblock_lasx.c
@@ -20,7 +20,7 @@
*/
#include "libavcodec/bit_depth_template.c"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#define H264_LOOP_FILTER_STRENGTH_ITERATION_LASX(edges, step, mask_mv, dir, \
diff --git a/libavcodec/loongarch/h264dsp_init_loongarch.c b/libavcodec/loongarch/h264dsp_init_loongarch.c
index 37633c3e51..cb07deb398 100644
--- a/libavcodec/loongarch/h264dsp_init_loongarch.c
+++ b/libavcodec/loongarch/h264dsp_init_loongarch.c
@@ -21,13 +21,32 @@
*/
#include "libavutil/loongarch/cpu.h"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_lsx(cpu_flags)) {
+ if (bit_depth == 8) {
+ c->h264_idct_add = ff_h264_idct_add_8_lsx;
+ c->h264_idct8_add = ff_h264_idct8_add_8_lsx;
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_8_lsx;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
+
+ if (chroma_format_idc <= 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
+ else
+ c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
+ c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
+ c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
+ }
+ }
+#if HAVE_LASX
if (have_lasx(cpu_flags)) {
if (chroma_format_idc <= 1)
c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lasx;
@@ -56,20 +75,10 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lasx;
- c->h264_idct_add = ff_h264_idct_add_lasx;
- c->h264_idct8_add = ff_h264_idct8_addblk_lasx;
- c->h264_idct_dc_add = ff_h264_idct4x4_addblk_dc_lasx;
- c->h264_idct8_dc_add = ff_h264_idct8_dc_addblk_lasx;
- c->h264_idct_add16 = ff_h264_idct_add16_lasx;
- c->h264_idct8_add4 = ff_h264_idct8_add4_lasx;
-
- if (chroma_format_idc <= 1)
- c->h264_idct_add8 = ff_h264_idct_add8_lasx;
- else
- c->h264_idct_add8 = ff_h264_idct_add8_422_lasx;
-
- c->h264_idct_add16intra = ff_h264_idct_add16_intra_lasx;
- c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_lasx;
+ c->h264_idct8_add = ff_h264_idct8_add_8_lasx;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_lasx;
}
}
+#endif // #if HAVE_LASX
}
diff --git a/libavcodec/loongarch/h264dsp_lasx.c b/libavcodec/loongarch/h264dsp_lasx.c
index 7fd4cedf7e..7b2b8ff0f0 100644
--- a/libavcodec/loongarch/h264dsp_lasx.c
+++ b/libavcodec/loongarch/h264dsp_lasx.c
@@ -23,7 +23,7 @@
*/
#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
p1_or_q1_org_in, p2_or_q2_org_in, \
diff --git a/libavcodec/loongarch/h264dsp_lasx.h b/libavcodec/loongarch/h264dsp_loongarch.h
similarity index 68%
rename from libavcodec/loongarch/h264dsp_lasx.h
rename to libavcodec/loongarch/h264dsp_loongarch.h
index 4cf813750b..28dca2b537 100644
--- a/libavcodec/loongarch/h264dsp_lasx.h
+++ b/libavcodec/loongarch/h264dsp_loongarch.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
* Xiwei Gu <guxiwei-hf@loongson.cn>
*
@@ -20,11 +20,34 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
-#define AVCODEC_LOONGARCH_H264DSP_LASX_H
+#ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
#include "libavcodec/h264dec.h"
+#include "config.h"
+void ff_h264_idct_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_luma_dc_dequant_idct_8_lsx(int16_t *_output, int16_t *_input, int qmul);
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
+
+#if HAVE_LASX
void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_v_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
@@ -65,33 +88,16 @@ void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
-void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
-void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
-void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
- int32_t dst_stride);
-void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
+void ff_h264_idct8_add_8_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct8_dc_add_8_lasx(uint8_t *dst, int16_t *src,
int32_t dst_stride);
-void ff_h264_idct_add16_lasx(uint8_t *dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add8_lasx(uint8_t **dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add8_422_lasx(uint8_t **dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add16_intra_lasx(uint8_t *dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8]);
-void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
- int32_t de_qval);
-
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8]);
void ff_h264_loop_filter_strength_lasx(int16_t bS[2][4][4], uint8_t nnz[40],
int8_t ref[2][40], int16_t mv[2][40][2],
int bidir, int edges, int step,
int mask_mv0, int mask_mv1, int field);
+#endif // #if HAVE_LASX
-#endif // #ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
+#endif // #ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264idct.S b/libavcodec/loongarch/h264idct.S
new file mode 100644
index 0000000000..f504cfb714
--- /dev/null
+++ b/libavcodec/loongarch/h264idct.S
@@ -0,0 +1,658 @@
+/*
+ * Loongson LSX/LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
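+ *
+ * Both 1-D passes use the same butterfly (a 4x4 transpose sits between
+ * them); the result is rounded with >>6 and added to dst with unsigned
+ * saturation:
+ *   z0 = b0 + b2;          z1 = b0 - b2;
+ *   z2 = (b1 >> 1) - b3;   z3 = b1 + (b3 >> 1);
+ *   out = { z0 + z3, z1 + z2, z1 - z2, z0 - z3 }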
+ */
+function ff_h264_idct_add_8_lsx
+ fld.d f0, a1, 0
+ fld.d f1, a1, 8
+ fld.d f2, a1, 16
+ fld.d f3, a1, 24
+ vxor.v vr7, vr7, vr7
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ vst vr7, a1, 0
+ vst vr7, a1, 16
+
+ vadd.h vr4, vr0, vr2
+ vsub.h vr5, vr0, vr2
+ vsrai.h vr6, vr1, 1
+ vsrai.h vr7, vr3, 1
+ vsub.h vr6, vr6, vr3
+ vadd.h vr7, vr1, vr7
+ LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+ LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5
+ vadd.h vr4, vr0, vr2
+ vsub.h vr5, vr0, vr2
+ vsrai.h vr6, vr1, 1
+ vsrai.h vr7, vr3, 1
+ vsub.h vr6, vr6, vr3
+ vadd.h vr7, vr1, vr7
+ LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+
+ fld.s f4, a0, 0
+ fldx.s f5, a0, a2
+ fldx.s f6, a0, t2
+ fldx.s f7, a0, t3
+
+ vsrari.h vr0, vr0, 6
+ vsrari.h vr1, vr1, 6
+ vsrari.h vr2, vr2, 6
+ vsrari.h vr3, vr3, 6
+
+ vsllwil.hu.bu vr4, vr4, 0
+ vsllwil.hu.bu vr5, vr5, 0
+ vsllwil.hu.bu vr6, vr6, 0
+ vsllwil.hu.bu vr7, vr7, 0
+ vadd.h vr0, vr0, vr4
+ vadd.h vr1, vr1, vr5
+ vadd.h vr2, vr2, vr6
+ vadd.h vr3, vr3, vr7
+ vssrarni.bu.h vr1, vr0, 0
+ vssrarni.bu.h vr3, vr2, 0
+
+ vbsrl.v vr0, vr1, 8
+ vbsrl.v vr2, vr3, 8
+ fst.s f1, a0, 0
+ fstx.s f0, a0, a2
+ fstx.s f3, a0, t2
+ fstx.s f2, a0, t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
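+ *
+ * The +32 rounding bias is folded into the DC coefficient before the
+ * transform; the second pass is widened to 32 bits (low and high halves
+ * of each row handled separately) to avoid overflow, then the result is
+ * shifted right by 6 and added to dst with unsigned saturation.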
+ */
+function ff_h264_idct8_add_8_lsx
+ ld.h t0, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ add.d t4, t3, a2
+ add.d t5, t4, a2
+ add.d t6, t5, a2
+ add.d t7, t6, a2
+ addi.w t0, t0, 32
+ st.h t0, a1, 0
+
+ vld vr0, a1, 0
+ vld vr1, a1, 16
+ vld vr2, a1, 32
+ vld vr3, a1, 48
+ vld vr4, a1, 64
+ vld vr5, a1, 80
+ vld vr6, a1, 96
+ vld vr7, a1, 112
+ vxor.v vr8, vr8, vr8
+ vst vr8, a1, 0
+ vst vr8, a1, 16
+ vst vr8, a1, 32
+ vst vr8, a1, 48
+ vst vr8, a1, 64
+ vst vr8, a1, 80
+ vst vr8, a1, 96
+ vst vr8, a1, 112
+
+ vadd.h vr18, vr0, vr4
+ vsub.h vr19, vr0, vr4
+ vsrai.h vr20, vr2, 1
+ vsrai.h vr21, vr6, 1
+ vsub.h vr20, vr20, vr6
+ vadd.h vr21, vr21, vr2
+ LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16
+ vsrai.h vr11, vr7, 1
+ vsrai.h vr13, vr3, 1
+ vsrai.h vr15, vr5, 1
+ vsrai.h vr17, vr1, 1
+ vsub.h vr11, vr5, vr11
+ vsub.h vr13, vr7, vr13
+ vadd.h vr15, vr7, vr15
+ vadd.h vr17, vr5, vr17
+ vsub.h vr11, vr11, vr7
+ vsub.h vr13, vr13, vr3
+ vadd.h vr15, vr15, vr5
+ vadd.h vr17, vr17, vr1
+ vsub.h vr11, vr11, vr3
+ vadd.h vr13, vr13, vr1
+ vsub.h vr15, vr15, vr1
+ vadd.h vr17, vr17, vr3
+ vsrai.h vr18, vr11, 2
+ vsrai.h vr19, vr13, 2
+ vsrai.h vr20, vr15, 2
+ vsrai.h vr21, vr17, 2
+ vadd.h vr11, vr11, vr21
+ vadd.h vr13, vr13, vr20
+ vsub.h vr15, vr19, vr15
+ vsub.h vr17, vr17, vr18
+ LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+ vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+ vexth.w.h vr20, vr0
+ vexth.w.h vr21, vr1
+ vexth.w.h vr22, vr2
+ vexth.w.h vr23, vr3
+ vexth.w.h vr8, vr4
+ vexth.w.h vr9, vr5
+ vexth.w.h vr18, vr6
+ vexth.w.h vr19, vr7
+ vsllwil.w.h vr0, vr0, 0
+ vsllwil.w.h vr1, vr1, 0
+ vsllwil.w.h vr2, vr2, 0
+ vsllwil.w.h vr3, vr3, 0
+ vsllwil.w.h vr4, vr4, 0
+ vsllwil.w.h vr5, vr5, 0
+ vsllwil.w.h vr6, vr6, 0
+ vsllwil.w.h vr7, vr7, 0
+
+ vadd.w vr11, vr0, vr4
+ vsub.w vr13, vr0, vr4
+ vsrai.w vr15, vr2, 1
+ vsrai.w vr17, vr6, 1
+ vsub.w vr15, vr15, vr6
+ vadd.w vr17, vr17, vr2
+ LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16
+ vsrai.w vr11, vr7, 1
+ vsrai.w vr13, vr3, 1
+ vsrai.w vr15, vr5, 1
+ vsrai.w vr17, vr1, 1
+ vsub.w vr11, vr5, vr11
+ vsub.w vr13, vr7, vr13
+ vadd.w vr15, vr7, vr15
+ vadd.w vr17, vr5, vr17
+ vsub.w vr11, vr11, vr7
+ vsub.w vr13, vr13, vr3
+ vadd.w vr15, vr15, vr5
+ vadd.w vr17, vr17, vr1
+ vsub.w vr11, vr11, vr3
+ vadd.w vr13, vr13, vr1
+ vsub.w vr15, vr15, vr1
+ vadd.w vr17, vr17, vr3
+ vsrai.w vr0, vr11, 2
+ vsrai.w vr1, vr13, 2
+ vsrai.w vr2, vr15, 2
+ vsrai.w vr3, vr17, 2
+ vadd.w vr11, vr11, vr3
+ vadd.w vr13, vr13, vr2
+ vsub.w vr15, vr1, vr15
+ vsub.w vr17, vr17, vr0
+ LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+
+ vadd.w vr11, vr20, vr8
+ vsub.w vr13, vr20, vr8
+ vsrai.w vr15, vr22, 1
+ vsrai.w vr17, vr18, 1
+ vsub.w vr15, vr15, vr18
+ vadd.w vr17, vr17, vr22
+ LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16
+ vsrai.w vr11, vr19, 1
+ vsrai.w vr13, vr23, 1
+ vsrai.w vr15, vr9, 1
+ vsrai.w vr17, vr21, 1
+ vsub.w vr11, vr9, vr11
+ vsub.w vr13, vr19, vr13
+ vadd.w vr15, vr19, vr15
+ vadd.w vr17, vr9, vr17
+ vsub.w vr11, vr11, vr19
+ vsub.w vr13, vr13, vr23
+ vadd.w vr15, vr15, vr9
+ vadd.w vr17, vr17, vr21
+ vsub.w vr11, vr11, vr23
+ vadd.w vr13, vr13, vr21
+ vsub.w vr15, vr15, vr21
+ vadd.w vr17, vr17, vr23
+ vsrai.w vr20, vr11, 2
+ vsrai.w vr21, vr13, 2
+ vsrai.w vr22, vr15, 2
+ vsrai.w vr23, vr17, 2
+ vadd.w vr11, vr11, vr23
+ vadd.w vr13, vr13, vr22
+ vsub.w vr15, vr21, vr15
+ vsub.w vr17, vr17, vr20
+ LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+ vr20, vr21, vr22, vr23, vr8, vr9, vr18, vr19
+
+ vld vr10, a0, 0
+ vldx vr11, a0, a2
+ vldx vr12, a0, t2
+ vldx vr13, a0, t3
+ vldx vr14, a0, t4
+ vldx vr15, a0, t5
+ vldx vr16, a0, t6
+ vldx vr17, a0, t7
+ vsrani.h.w vr20, vr0, 6
+ vsrani.h.w vr21, vr1, 6
+ vsrani.h.w vr22, vr2, 6
+ vsrani.h.w vr23, vr3, 6
+ vsrani.h.w vr8, vr4, 6
+ vsrani.h.w vr9, vr5, 6
+ vsrani.h.w vr18, vr6, 6
+ vsrani.h.w vr19, vr7, 6
+ vsllwil.hu.bu vr10, vr10, 0
+ vsllwil.hu.bu vr11, vr11, 0
+ vsllwil.hu.bu vr12, vr12, 0
+ vsllwil.hu.bu vr13, vr13, 0
+ vsllwil.hu.bu vr14, vr14, 0
+ vsllwil.hu.bu vr15, vr15, 0
+ vsllwil.hu.bu vr16, vr16, 0
+ vsllwil.hu.bu vr17, vr17, 0
+
+ vadd.h vr0, vr20, vr10
+ vadd.h vr1, vr21, vr11
+ vadd.h vr2, vr22, vr12
+ vadd.h vr3, vr23, vr13
+ vadd.h vr4, vr8, vr14
+ vadd.h vr5, vr9, vr15
+ vadd.h vr6, vr18, vr16
+ vadd.h vr7, vr19, vr17
+ vssrarni.bu.h vr1, vr0, 0
+ vssrarni.bu.h vr3, vr2, 0
+ vssrarni.bu.h vr5, vr4, 0
+ vssrarni.bu.h vr7, vr6, 0
+ vbsrl.v vr0, vr1, 8
+ vbsrl.v vr2, vr3, 8
+ vbsrl.v vr4, vr5, 8
+ vbsrl.v vr6, vr7, 8
+ fst.d f1, a0, 0
+ fstx.d f0, a0, a2
+ fstx.d f3, a0, t2
+ fstx.d f2, a0, t3
+ fstx.d f5, a0, t4
+ fstx.d f4, a0, t5
+ fstx.d f7, a0, t6
+ fstx.d f6, a0, t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lasx
+ ld.h t0, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ add.d t4, t3, a2
+ add.d t5, t4, a2
+ add.d t6, t5, a2
+ add.d t7, t6, a2
+ addi.w t0, t0, 32
+ st.h t0, a1, 0
+
+ vld vr0, a1, 0
+ vld vr1, a1, 16
+ vld vr2, a1, 32
+ vld vr3, a1, 48
+ vld vr4, a1, 64
+ vld vr5, a1, 80
+ vld vr6, a1, 96
+ vld vr7, a1, 112
+ xvxor.v xr8, xr8, xr8
+ xvst xr8, a1, 0
+ xvst xr8, a1, 32
+ xvst xr8, a1, 64
+ xvst xr8, a1, 96
+
+ vadd.h vr18, vr0, vr4
+ vsub.h vr19, vr0, vr4
+ vsrai.h vr20, vr2, 1
+ vsrai.h vr21, vr6, 1
+ vsub.h vr20, vr20, vr6
+ vadd.h vr21, vr21, vr2
+ LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16
+ vsrai.h vr11, vr7, 1
+ vsrai.h vr13, vr3, 1
+ vsrai.h vr15, vr5, 1
+ vsrai.h vr17, vr1, 1
+ vsub.h vr11, vr5, vr11
+ vsub.h vr13, vr7, vr13
+ vadd.h vr15, vr7, vr15
+ vadd.h vr17, vr5, vr17
+ vsub.h vr11, vr11, vr7
+ vsub.h vr13, vr13, vr3
+ vadd.h vr15, vr15, vr5
+ vadd.h vr17, vr17, vr1
+ vsub.h vr11, vr11, vr3
+ vadd.h vr13, vr13, vr1
+ vsub.h vr15, vr15, vr1
+ vadd.h vr17, vr17, vr3
+ vsrai.h vr18, vr11, 2
+ vsrai.h vr19, vr13, 2
+ vsrai.h vr20, vr15, 2
+ vsrai.h vr21, vr17, 2
+ vadd.h vr11, vr11, vr21
+ vadd.h vr13, vr13, vr20
+ vsub.h vr15, vr19, vr15
+ vsub.h vr17, vr17, vr18
+ LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+ vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7
+
+ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+ vext2xv.w.h xr0, xr0
+ vext2xv.w.h xr1, xr1
+ vext2xv.w.h xr2, xr2
+ vext2xv.w.h xr3, xr3
+ vext2xv.w.h xr4, xr4
+ vext2xv.w.h xr5, xr5
+ vext2xv.w.h xr6, xr6
+ vext2xv.w.h xr7, xr7
+
+ xvadd.w xr11, xr0, xr4
+ xvsub.w xr13, xr0, xr4
+ xvsrai.w xr15, xr2, 1
+ xvsrai.w xr17, xr6, 1
+ xvsub.w xr15, xr15, xr6
+ xvadd.w xr17, xr17, xr2
+ LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17, xr10, xr12, xr14, xr16
+ xvsrai.w xr11, xr7, 1
+ xvsrai.w xr13, xr3, 1
+ xvsrai.w xr15, xr5, 1
+ xvsrai.w xr17, xr1, 1
+ xvsub.w xr11, xr5, xr11
+ xvsub.w xr13, xr7, xr13
+ xvadd.w xr15, xr7, xr15
+ xvadd.w xr17, xr5, xr17
+ xvsub.w xr11, xr11, xr7
+ xvsub.w xr13, xr13, xr3
+ xvadd.w xr15, xr15, xr5
+ xvadd.w xr17, xr17, xr1
+ xvsub.w xr11, xr11, xr3
+ xvadd.w xr13, xr13, xr1
+ xvsub.w xr15, xr15, xr1
+ xvadd.w xr17, xr17, xr3
+ xvsrai.w xr0, xr11, 2
+ xvsrai.w xr1, xr13, 2
+ xvsrai.w xr2, xr15, 2
+ xvsrai.w xr3, xr17, 2
+ xvadd.w xr11, xr11, xr3
+ xvadd.w xr13, xr13, xr2
+ xvsub.w xr15, xr1, xr15
+ xvsub.w xr17, xr17, xr0
+ LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \
+ xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7
+
+ vld vr10, a0, 0
+ vldx vr11, a0, a2
+ vldx vr12, a0, t2
+ vldx vr13, a0, t3
+ vldx vr14, a0, t4
+ vldx vr15, a0, t5
+ vldx vr16, a0, t6
+ vldx vr17, a0, t7
+ xvldi xr8, 0x806 // broadcast 6 to each word element, i.e. "xvldi.w xr8, 6"
+ xvsran.h.w xr0, xr0, xr8
+ xvsran.h.w xr1, xr1, xr8
+ xvsran.h.w xr2, xr2, xr8
+ xvsran.h.w xr3, xr3, xr8
+ xvsran.h.w xr4, xr4, xr8
+ xvsran.h.w xr5, xr5, xr8
+ xvsran.h.w xr6, xr6, xr8
+ xvsran.h.w xr7, xr7, xr8
+ xvpermi.d xr0, xr0, 0x08
+ xvpermi.d xr1, xr1, 0x08
+ xvpermi.d xr2, xr2, 0x08
+ xvpermi.d xr3, xr3, 0x08
+ xvpermi.d xr4, xr4, 0x08
+ xvpermi.d xr5, xr5, 0x08
+ xvpermi.d xr6, xr6, 0x08
+ xvpermi.d xr7, xr7, 0x08
+
+ vsllwil.hu.bu vr10, vr10, 0
+ vsllwil.hu.bu vr11, vr11, 0
+ vsllwil.hu.bu vr12, vr12, 0
+ vsllwil.hu.bu vr13, vr13, 0
+ vsllwil.hu.bu vr14, vr14, 0
+ vsllwil.hu.bu vr15, vr15, 0
+ vsllwil.hu.bu vr16, vr16, 0
+ vsllwil.hu.bu vr17, vr17, 0
+
+ vadd.h vr0, vr0, vr10
+ vadd.h vr1, vr1, vr11
+ vadd.h vr2, vr2, vr12
+ vadd.h vr3, vr3, vr13
+ vadd.h vr4, vr4, vr14
+ vadd.h vr5, vr5, vr15
+ vadd.h vr6, vr6, vr16
+ vadd.h vr7, vr7, vr17
+ vssrarni.bu.h vr1, vr0, 0
+ vssrarni.bu.h vr3, vr2, 0
+ vssrarni.bu.h vr5, vr4, 0
+ vssrarni.bu.h vr7, vr6, 0
+ vbsrl.v vr0, vr1, 8
+ vbsrl.v vr2, vr3, 8
+ vbsrl.v vr4, vr5, 8
+ vbsrl.v vr6, vr7, 8
+ fst.d f1, a0, 0
+ fstx.d f0, a0, a2
+ fstx.d f3, a0, t2
+ fstx.d f2, a0, t3
+ fstx.d f5, a0, t4
+ fstx.d f4, a0, t5
+ fstx.d f7, a0, t6
+ fstx.d f6, a0, t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
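+ *
+ * DC-only block: (block[0] + 32) >> 6 is added to every pixel of the
+ * 4x4 area and block[0] is then cleared.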
+ */
+function ff_h264_idct_dc_add_8_lsx
+ vldrepl.h vr4, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ fld.s f0, a0, 0
+ fldx.s f1, a0, a2
+ fldx.s f2, a0, t2
+ fldx.s f3, a0, t3
+ st.h zero, a1, 0
+
+ vsrari.h vr4, vr4, 6
+ vilvl.w vr0, vr1, vr0
+ vilvl.w vr1, vr3, vr2
+ vsllwil.hu.bu vr0, vr0, 0
+ vsllwil.hu.bu vr1, vr1, 0
+ vadd.h vr0, vr0, vr4
+ vadd.h vr1, vr1, vr4
+ vssrarni.bu.h vr1, vr0, 0
+
+ vbsrl.v vr2, vr1, 4
+ vbsrl.v vr3, vr1, 8
+ vbsrl.v vr4, vr1, 12
+ fst.s f1, a0, 0
+ fstx.s f2, a0, a2
+ fstx.s f3, a0, t2
+ fstx.s f4, a0, t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c) FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
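+ *
+ * DC-only block: (block[0] + 32) >> 6 is added to every pixel of the
+ * 8x8 area and block[0] is then cleared.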
+ */
+function ff_h264_idct8_dc_add_8_lsx
+ vldrepl.h vr8, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ add.d t4, t3, a2
+ add.d t5, t4, a2
+ add.d t6, t5, a2
+ add.d t7, t6, a2
+
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a2
+ fldx.d f2, a0, t2
+ fldx.d f3, a0, t3
+ fldx.d f4, a0, t4
+ fldx.d f5, a0, t5
+ fldx.d f6, a0, t6
+ fldx.d f7, a0, t7
+ st.h zero, a1, 0
+
+ vsrari.h vr8, vr8, 6
+ vsllwil.hu.bu vr0, vr0, 0
+ vsllwil.hu.bu vr1, vr1, 0
+ vsllwil.hu.bu vr2, vr2, 0
+ vsllwil.hu.bu vr3, vr3, 0
+ vsllwil.hu.bu vr4, vr4, 0
+ vsllwil.hu.bu vr5, vr5, 0
+ vsllwil.hu.bu vr6, vr6, 0
+ vsllwil.hu.bu vr7, vr7, 0
+ vadd.h vr0, vr0, vr8
+ vadd.h vr1, vr1, vr8
+ vadd.h vr2, vr2, vr8
+ vadd.h vr3, vr3, vr8
+ vadd.h vr4, vr4, vr8
+ vadd.h vr5, vr5, vr8
+ vadd.h vr6, vr6, vr8
+ vadd.h vr7, vr7, vr8
+ vssrarni.bu.h vr1, vr0, 0
+ vssrarni.bu.h vr3, vr2, 0
+ vssrarni.bu.h vr5, vr4, 0
+ vssrarni.bu.h vr7, vr6, 0
+
+ vbsrl.v vr0, vr1, 8
+ vbsrl.v vr2, vr3, 8
+ vbsrl.v vr4, vr5, 8
+ vbsrl.v vr6, vr7, 8
+ fst.d f1, a0, 0
+ fstx.d f0, a0, a2
+ fstx.d f3, a0, t2
+ fstx.d f2, a0, t3
+ fstx.d f5, a0, t4
+ fstx.d f4, a0, t5
+ fstx.d f7, a0, t6
+ fstx.d f6, a0, t7
+endfunc
+function ff_h264_idct8_dc_add_8_lasx
+ xvldrepl.h xr8, a1, 0
+ add.d t2, a2, a2
+ add.d t3, t2, a2
+ add.d t4, t3, a2
+ add.d t5, t4, a2
+ add.d t6, t5, a2
+ add.d t7, t6, a2
+
+ fld.d f0, a0, 0
+ fldx.d f1, a0, a2
+ fldx.d f2, a0, t2
+ fldx.d f3, a0, t3
+ fldx.d f4, a0, t4
+ fldx.d f5, a0, t5
+ fldx.d f6, a0, t6
+ fldx.d f7, a0, t7
+ st.h zero, a1, 0
+
+ xvsrari.h xr8, xr8, 6
+ xvpermi.q xr1, xr0, 0x20
+ xvpermi.q xr3, xr2, 0x20
+ xvpermi.q xr5, xr4, 0x20
+ xvpermi.q xr7, xr6, 0x20
+ xvsllwil.hu.bu xr1, xr1, 0
+ xvsllwil.hu.bu xr3, xr3, 0
+ xvsllwil.hu.bu xr5, xr5, 0
+ xvsllwil.hu.bu xr7, xr7, 0
+ xvadd.h xr1, xr1, xr8
+ xvadd.h xr3, xr3, xr8
+ xvadd.h xr5, xr5, xr8
+ xvadd.h xr7, xr7, xr8
+
+ xvssrarni.bu.h xr3, xr1, 0
+ xvssrarni.bu.h xr7, xr5, 0
+
+ xvpermi.q xr1, xr3, 0x11
+ xvpermi.q xr5, xr7, 0x11
+ xvbsrl.v xr0, xr1, 8
+ xvbsrl.v xr2, xr3, 8
+ xvbsrl.v xr4, xr5, 8
+ xvbsrl.v xr6, xr7, 8
+
+ fst.d f3, a0, 0
+ fstx.d f1, a0, a2
+ fstx.d f2, a0, t2
+ fstx.d f0, a0, t3
+ fstx.d f7, a0, t4
+ fstx.d f5, a0, t5
+ fstx.d f6, a0, t6
+ fstx.d f4, a0, t7
+endfunc
+
+/**
+ * IDCT transforms the 16 dc values and dequantizes them.
+ * @param qmul quantization parameter
+ * void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul)
+ * LSX optimization is enough for this function.
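+ *
+ * A 4x4 Hadamard-style transform is applied to the 16 DC values; each
+ * result is then scaled and rounded as ((x * qmul + 128) >> 8) and
+ * written back to the DC position of its 4x4 block.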
+ */
+function ff_h264_luma_dc_dequant_idct_8_lsx
+ vld vr0, a1, 0
+ vld vr1, a1, 8
+ vld vr2, a1, 16
+ vld vr3, a1, 24
+ vreplgr2vr.w vr8, a2
+ LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10
+ LSX_BUTTERFLY_4_H vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1
+ LSX_BUTTERFLY_4_H vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5
+ LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10
+ LSX_BUTTERFLY_4_H vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5
+ LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+ vsllwil.w.h vr0, vr0, 0
+ vsllwil.w.h vr1, vr1, 0
+ vsllwil.w.h vr2, vr2, 0
+ vsllwil.w.h vr3, vr3, 0
+ vmul.w vr0, vr0, vr8
+ vmul.w vr1, vr1, vr8
+ vmul.w vr2, vr2, vr8
+ vmul.w vr3, vr3, vr8
+ vsrarni.h.w vr1, vr0, 8
+ vsrarni.h.w vr3, vr2, 8
+
+ vstelm.h vr1, a0, 0, 0
+ vstelm.h vr1, a0, 32, 4
+ vstelm.h vr1, a0, 64, 1
+ vstelm.h vr1, a0, 96, 5
+ vstelm.h vr3, a0, 128, 0
+ vstelm.h vr3, a0, 160, 4
+ vstelm.h vr3, a0, 192, 1
+ vstelm.h vr3, a0, 224, 5
+ addi.d a0, a0, 256
+ vstelm.h vr1, a0, 0, 2
+ vstelm.h vr1, a0, 32, 6
+ vstelm.h vr1, a0, 64, 3
+ vstelm.h vr1, a0, 96, 7
+ vstelm.h vr3, a0, 128, 2
+ vstelm.h vr3, a0, 160, 6
+ vstelm.h vr3, a0, 192, 3
+ vstelm.h vr3, a0, 224, 7
+endfunc
diff --git a/libavcodec/loongarch/h264idct_lasx.c b/libavcodec/loongarch/h264idct_lasx.c
deleted file mode 100644
index 46bd3b74d5..0000000000
--- a/libavcodec/loongarch/h264idct_lasx.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Loongson LASX optimized h264dsp
- *
- * Copyright (c) 2021 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- * Xiwei Gu <guxiwei-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264dsp_lasx.h"
-#include "libavcodec/bit_depth_template.c"
-
-#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3) \
-{ \
- __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- tmp0_m = __lasx_xvadd_h(in0, in2); \
- tmp1_m = __lasx_xvsub_h(in0, in2); \
- tmp2_m = __lasx_xvsrai_h(in1, 1); \
- tmp2_m = __lasx_xvsub_h(tmp2_m, in3); \
- tmp3_m = __lasx_xvsrai_h(in3, 1); \
- tmp3_m = __lasx_xvadd_h(in1, tmp3_m); \
- \
- LASX_BUTTERFLY_4_H(tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
- out0, out1, out2, out3); \
-}
-
-void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride)
-{
- __m256i src0_m, src1_m, src2_m, src3_m;
- __m256i dst0_m, dst1_m;
- __m256i hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
- __m256i inp0_m, inp1_m, res0_m, src1, src3;
- __m256i src0 = __lasx_xvld(src, 0);
- __m256i src2 = __lasx_xvld(src, 16);
- __m256i zero = __lasx_xvldi(0);
- int32_t dst_stride_2x = dst_stride << 1;
- int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-
- __lasx_xvst(zero, src, 0);
- DUP2_ARG2(__lasx_xvilvh_d, src0, src0, src2, src2, src1, src3);
- AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
- LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
- AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, src0_m, src1_m, src2_m, src3_m);
- DUP4_ARG2(__lasx_xvld, dst, 0, dst + dst_stride, 0, dst + dst_stride_2x,
- 0, dst + dst_stride_3x, 0, src0_m, src1_m, src2_m, src3_m);
- DUP2_ARG2(__lasx_xvilvl_d, vres1, vres0, vres3, vres2, inp0_m, inp1_m);
- inp0_m = __lasx_xvpermi_q(inp1_m, inp0_m, 0x20);
- inp0_m = __lasx_xvsrari_h(inp0_m, 6);
- DUP2_ARG2(__lasx_xvilvl_w, src1_m, src0_m, src3_m, src2_m, dst0_m, dst1_m);
- dst0_m = __lasx_xvilvl_d(dst1_m, dst0_m);
- res0_m = __lasx_vext2xv_hu_bu(dst0_m);
- res0_m = __lasx_xvadd_h(res0_m, inp0_m);
- res0_m = __lasx_xvclip255_h(res0_m);
- dst0_m = __lasx_xvpickev_b(res0_m, res0_m);
- __lasx_xvstelm_w(dst0_m, dst, 0, 0);
- __lasx_xvstelm_w(dst0_m, dst + dst_stride, 0, 1);
- __lasx_xvstelm_w(dst0_m, dst + dst_stride_2x, 0, 4);
- __lasx_xvstelm_w(dst0_m, dst + dst_stride_3x, 0, 5);
-}
-
-void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- __m256i src0, src1, src2, src3, src4, src5, src6, src7;
- __m256i vec0, vec1, vec2, vec3;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m256i res0, res1, res2, res3, res4, res5, res6, res7;
- __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- __m256i zero = __lasx_xvldi(0);
- int32_t dst_stride_2x = dst_stride << 1;
- int32_t dst_stride_4x = dst_stride << 2;
- int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-
- src[0] += 32;
- DUP4_ARG2(__lasx_xvld, src, 0, src, 16, src, 32, src, 48,
- src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvld, src, 64, src, 80, src, 96, src, 112,
- src4, src5, src6, src7);
- __lasx_xvst(zero, src, 0);
- __lasx_xvst(zero, src, 32);
- __lasx_xvst(zero, src, 64);
- __lasx_xvst(zero, src, 96);
-
- vec0 = __lasx_xvadd_h(src0, src4);
- vec1 = __lasx_xvsub_h(src0, src4);
- vec2 = __lasx_xvsrai_h(src2, 1);
- vec2 = __lasx_xvsub_h(vec2, src6);
- vec3 = __lasx_xvsrai_h(src6, 1);
- vec3 = __lasx_xvadd_h(src2, vec3);
-
- LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
-
- vec0 = __lasx_xvsrai_h(src7, 1);
- vec0 = __lasx_xvsub_h(src5, vec0);
- vec0 = __lasx_xvsub_h(vec0, src3);
- vec0 = __lasx_xvsub_h(vec0, src7);
-
- vec1 = __lasx_xvsrai_h(src3, 1);
- vec1 = __lasx_xvsub_h(src1, vec1);
- vec1 = __lasx_xvadd_h(vec1, src7);
- vec1 = __lasx_xvsub_h(vec1, src3);
-
- vec2 = __lasx_xvsrai_h(src5, 1);
- vec2 = __lasx_xvsub_h(vec2, src1);
- vec2 = __lasx_xvadd_h(vec2, src7);
- vec2 = __lasx_xvadd_h(vec2, src5);
-
- vec3 = __lasx_xvsrai_h(src1, 1);
- vec3 = __lasx_xvadd_h(src3, vec3);
- vec3 = __lasx_xvadd_h(vec3, src5);
- vec3 = __lasx_xvadd_h(vec3, src1);
-
- tmp4 = __lasx_xvsrai_h(vec3, 2);
- tmp4 = __lasx_xvadd_h(tmp4, vec0);
- tmp5 = __lasx_xvsrai_h(vec2, 2);
- tmp5 = __lasx_xvadd_h(tmp5, vec1);
- tmp6 = __lasx_xvsrai_h(vec1, 2);
- tmp6 = __lasx_xvsub_h(tmp6, vec2);
- tmp7 = __lasx_xvsrai_h(vec0, 2);
- tmp7 = __lasx_xvsub_h(vec3, tmp7);
-
- LASX_BUTTERFLY_8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
- res0, res1, res2, res3, res4, res5, res6, res7);
- LASX_TRANSPOSE8x8_H(res0, res1, res2, res3, res4, res5, res6, res7,
- res0, res1, res2, res3, res4, res5, res6, res7);
-
- DUP4_ARG1(__lasx_vext2xv_w_h, res0, res1, res2, res3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG1(__lasx_vext2xv_w_h, res4, res5, res6, res7,
- tmp4, tmp5, tmp6, tmp7);
- vec0 = __lasx_xvadd_w(tmp0, tmp4);
- vec1 = __lasx_xvsub_w(tmp0, tmp4);
-
- vec2 = __lasx_xvsrai_w(tmp2, 1);
- vec2 = __lasx_xvsub_w(vec2, tmp6);
- vec3 = __lasx_xvsrai_w(tmp6, 1);
- vec3 = __lasx_xvadd_w(vec3, tmp2);
-
- tmp0 = __lasx_xvadd_w(vec0, vec3);
- tmp2 = __lasx_xvadd_w(vec1, vec2);
- tmp4 = __lasx_xvsub_w(vec1, vec2);
- tmp6 = __lasx_xvsub_w(vec0, vec3);
-
- vec0 = __lasx_xvsrai_w(tmp7, 1);
- vec0 = __lasx_xvsub_w(tmp5, vec0);
- vec0 = __lasx_xvsub_w(vec0, tmp3);
- vec0 = __lasx_xvsub_w(vec0, tmp7);
-
- vec1 = __lasx_xvsrai_w(tmp3, 1);
- vec1 = __lasx_xvsub_w(tmp1, vec1);
- vec1 = __lasx_xvadd_w(vec1, tmp7);
- vec1 = __lasx_xvsub_w(vec1, tmp3);
-
- vec2 = __lasx_xvsrai_w(tmp5, 1);
- vec2 = __lasx_xvsub_w(vec2, tmp1);
- vec2 = __lasx_xvadd_w(vec2, tmp7);
- vec2 = __lasx_xvadd_w(vec2, tmp5);
-
- vec3 = __lasx_xvsrai_w(tmp1, 1);
- vec3 = __lasx_xvadd_w(tmp3, vec3);
- vec3 = __lasx_xvadd_w(vec3, tmp5);
- vec3 = __lasx_xvadd_w(vec3, tmp1);
-
- tmp1 = __lasx_xvsrai_w(vec3, 2);
- tmp1 = __lasx_xvadd_w(tmp1, vec0);
- tmp3 = __lasx_xvsrai_w(vec2, 2);
- tmp3 = __lasx_xvadd_w(tmp3, vec1);
- tmp5 = __lasx_xvsrai_w(vec1, 2);
- tmp5 = __lasx_xvsub_w(tmp5, vec2);
- tmp7 = __lasx_xvsrai_w(vec0, 2);
- tmp7 = __lasx_xvsub_w(vec3, tmp7);
-
- LASX_BUTTERFLY_4_W(tmp0, tmp2, tmp5, tmp7, res0, res1, res6, res7);
- LASX_BUTTERFLY_4_W(tmp4, tmp6, tmp1, tmp3, res2, res3, res4, res5);
-
- DUP4_ARG2(__lasx_xvsrai_w, res0, 6, res1, 6, res2, 6, res3, 6,
- res0, res1, res2, res3);
- DUP4_ARG2(__lasx_xvsrai_w, res4, 6, res5, 6, res6, 6, res7, 6,
- res4, res5, res6, res7);
- DUP4_ARG2(__lasx_xvpickev_h, res1, res0, res3, res2, res5, res4, res7,
- res6, res0, res1, res2, res3);
- DUP4_ARG2(__lasx_xvpermi_d, res0, 0xd8, res1, 0xd8, res2, 0xd8, res3, 0xd8,
- res0, res1, res2, res3);
-
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, dst0, dst1, dst2, dst3);
- dst += dst_stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, dst4, dst5, dst6, dst7);
- dst -= dst_stride_4x;
- DUP4_ARG2(__lasx_xvilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
- dst0, dst1, dst2, dst3);
- DUP4_ARG2(__lasx_xvilvl_b, zero, dst4, zero, dst5, zero, dst6, zero, dst7,
- dst4, dst5, dst6, dst7);
- DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
- dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
- res0 = __lasx_xvadd_h(res0, dst0);
- res1 = __lasx_xvadd_h(res1, dst1);
- res2 = __lasx_xvadd_h(res2, dst2);
- res3 = __lasx_xvadd_h(res3, dst3);
- DUP4_ARG1(__lasx_xvclip255_h, res0, res1, res2, res3, res0, res1,
- res2, res3);
- DUP2_ARG2(__lasx_xvpickev_b, res1, res0, res3, res2, res0, res1);
- __lasx_xvstelm_d(res0, dst, 0, 0);
- __lasx_xvstelm_d(res0, dst + dst_stride, 0, 2);
- __lasx_xvstelm_d(res0, dst + dst_stride_2x, 0, 1);
- __lasx_xvstelm_d(res0, dst + dst_stride_3x, 0, 3);
- dst += dst_stride_4x;
- __lasx_xvstelm_d(res1, dst, 0, 0);
- __lasx_xvstelm_d(res1, dst + dst_stride, 0, 2);
- __lasx_xvstelm_d(res1, dst + dst_stride_2x, 0, 1);
- __lasx_xvstelm_d(res1, dst + dst_stride_3x, 0, 3);
-}
-
-void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- const int16_t dc = (src[0] + 32) >> 6;
- int32_t dst_stride_2x = dst_stride << 1;
- int32_t dst_stride_3x = dst_stride_2x + dst_stride;
- __m256i pred, out;
- __m256i src0, src1, src2, src3;
- __m256i input_dc = __lasx_xvreplgr2vr_h(dc);
-
- src[0] = 0;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, src0, src1, src2, src3);
- DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);
-
- pred = __lasx_xvpermi_q(src0, src1, 0x02);
- pred = __lasx_xvaddw_h_h_bu(input_dc, pred);
- pred = __lasx_xvclip255_h(pred);
- out = __lasx_xvpickev_b(pred, pred);
- __lasx_xvstelm_w(out, dst, 0, 0);
- __lasx_xvstelm_w(out, dst + dst_stride, 0, 1);
- __lasx_xvstelm_w(out, dst + dst_stride_2x, 0, 4);
- __lasx_xvstelm_w(out, dst + dst_stride_3x, 0, 5);
-}
-
-void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- int32_t dc_val;
- int32_t dst_stride_2x = dst_stride << 1;
- int32_t dst_stride_4x = dst_stride << 2;
- int32_t dst_stride_3x = dst_stride_2x + dst_stride;
- __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
- __m256i dc;
-
- dc_val = (src[0] + 32) >> 6;
- dc = __lasx_xvreplgr2vr_h(dc_val);
-
- src[0] = 0;
-
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, dst0, dst1, dst2, dst3);
- dst += dst_stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
- dst, dst_stride_3x, dst4, dst5, dst6, dst7);
- dst -= dst_stride_4x;
- DUP4_ARG1(__lasx_vext2xv_hu_bu, dst0, dst1, dst2, dst3,
- dst0, dst1, dst2, dst3);
- DUP4_ARG1(__lasx_vext2xv_hu_bu, dst4, dst5, dst6, dst7,
- dst4, dst5, dst6, dst7);
- DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
- dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
- dst0 = __lasx_xvadd_h(dst0, dc);
- dst1 = __lasx_xvadd_h(dst1, dc);
- dst2 = __lasx_xvadd_h(dst2, dc);
- dst3 = __lasx_xvadd_h(dst3, dc);
- DUP4_ARG1(__lasx_xvclip255_h, dst0, dst1, dst2, dst3,
- dst0, dst1, dst2, dst3);
- DUP2_ARG2(__lasx_xvpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + dst_stride, 0, 2);
- __lasx_xvstelm_d(dst0, dst + dst_stride_2x, 0, 1);
- __lasx_xvstelm_d(dst0, dst + dst_stride_3x, 0, 3);
- dst += dst_stride_4x;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst + dst_stride, 0, 2);
- __lasx_xvstelm_d(dst1, dst + dst_stride_2x, 0, 1);
- __lasx_xvstelm_d(dst1, dst + dst_stride_3x, 0, 3);
-}
-
-void ff_h264_idct_add16_lasx(uint8_t *dst,
- const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t i;
-
- for (i = 0; i < 16; i++) {
- int32_t nnz = nzc[scan8[i]];
-
- if (nnz) {
- if (nnz == 1 && ((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else
- ff_h264_idct_add_lasx(dst + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- }
-}
-
-void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t cnt;
-
- for (cnt = 0; cnt < 16; cnt += 4) {
- int32_t nnz = nzc[scan8[cnt]];
-
- if (nnz) {
- if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
- ff_h264_idct8_dc_addblk_lasx(dst + blk_offset[cnt],
- block + cnt * 16 * sizeof(pixel),
- dst_stride);
- else
- ff_h264_idct8_addblk_lasx(dst + blk_offset[cnt],
- block + cnt * 16 * sizeof(pixel),
- dst_stride);
- }
- }
-}
-
-
-void ff_h264_idct_add8_lasx(uint8_t **dst,
- const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t i;
-
- for (i = 16; i < 20; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- for (i = 32; i < 36; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
-}
-
-void ff_h264_idct_add8_422_lasx(uint8_t **dst,
- const int32_t *blk_offset,
- int16_t *block, int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t i;
-
- for (i = 16; i < 20; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- for (i = 32; i < 36; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- for (i = 20; i < 24; i++) {
- if (nzc[scan8[i + 4]])
- ff_h264_idct_add_lasx(dst[0] + blk_offset[i + 4],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i + 4],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
- for (i = 36; i < 40; i++) {
- if (nzc[scan8[i + 4]])
- ff_h264_idct_add_lasx(dst[1] + blk_offset[i + 4],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i + 4],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
-}
-
-void ff_h264_idct_add16_intra_lasx(uint8_t *dst,
- const int32_t *blk_offset,
- int16_t *block,
- int32_t dst_stride,
- const uint8_t nzc[15 * 8])
-{
- int32_t i;
-
- for (i = 0; i < 16; i++) {
- if (nzc[scan8[i]])
- ff_h264_idct_add_lasx(dst + blk_offset[i],
- block + i * 16 * sizeof(pixel), dst_stride);
- else if (((dctcoef *) block)[i * 16])
- ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
- block + i * 16 * sizeof(pixel),
- dst_stride);
- }
-}
-
-void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
- int32_t de_qval)
-{
-#define DC_DEST_STRIDE 16
-
- __m256i src0, src1, src2, src3;
- __m256i vec0, vec1, vec2, vec3;
- __m256i tmp0, tmp1, tmp2, tmp3;
- __m256i hres0, hres1, hres2, hres3;
- __m256i vres0, vres1, vres2, vres3;
- __m256i de_q_vec = __lasx_xvreplgr2vr_w(de_qval);
-
- DUP4_ARG2(__lasx_xvld, src, 0, src, 8, src, 16, src, 24,
- src0, src1, src2, src3);
- LASX_TRANSPOSE4x4_H(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
- LASX_BUTTERFLY_4_H(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
- LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
- LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3,
- hres0, hres1, hres2, hres3);
- LASX_BUTTERFLY_4_H(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
- LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
- DUP4_ARG1(__lasx_vext2xv_w_h, vres0, vres1, vres2, vres3,
- vres0, vres1, vres2, vres3);
- DUP2_ARG3(__lasx_xvpermi_q, vres1, vres0, 0x20, vres3, vres2, 0x20,
- vres0, vres1);
-
- vres0 = __lasx_xvmul_w(vres0, de_q_vec);
- vres1 = __lasx_xvmul_w(vres1, de_q_vec);
-
- vres0 = __lasx_xvsrari_w(vres0, 8);
- vres1 = __lasx_xvsrari_w(vres1, 8);
- vec0 = __lasx_xvpickev_h(vres1, vres0);
- vec0 = __lasx_xvpermi_d(vec0, 0xd8);
- __lasx_xvstelm_h(vec0, dst + 0 * DC_DEST_STRIDE, 0, 0);
- __lasx_xvstelm_h(vec0, dst + 2 * DC_DEST_STRIDE, 0, 1);
- __lasx_xvstelm_h(vec0, dst + 8 * DC_DEST_STRIDE, 0, 2);
- __lasx_xvstelm_h(vec0, dst + 10 * DC_DEST_STRIDE, 0, 3);
- __lasx_xvstelm_h(vec0, dst + 1 * DC_DEST_STRIDE, 0, 4);
- __lasx_xvstelm_h(vec0, dst + 3 * DC_DEST_STRIDE, 0, 5);
- __lasx_xvstelm_h(vec0, dst + 9 * DC_DEST_STRIDE, 0, 6);
- __lasx_xvstelm_h(vec0, dst + 11 * DC_DEST_STRIDE, 0, 7);
- __lasx_xvstelm_h(vec0, dst + 4 * DC_DEST_STRIDE, 0, 8);
- __lasx_xvstelm_h(vec0, dst + 6 * DC_DEST_STRIDE, 0, 9);
- __lasx_xvstelm_h(vec0, dst + 12 * DC_DEST_STRIDE, 0, 10);
- __lasx_xvstelm_h(vec0, dst + 14 * DC_DEST_STRIDE, 0, 11);
- __lasx_xvstelm_h(vec0, dst + 5 * DC_DEST_STRIDE, 0, 12);
- __lasx_xvstelm_h(vec0, dst + 7 * DC_DEST_STRIDE, 0, 13);
- __lasx_xvstelm_h(vec0, dst + 13 * DC_DEST_STRIDE, 0, 14);
- __lasx_xvstelm_h(vec0, dst + 15 * DC_DEST_STRIDE, 0, 15);
-
-#undef DC_DEST_STRIDE
-}
diff --git a/libavcodec/loongarch/h264idct_loongarch.c b/libavcodec/loongarch/h264idct_loongarch.c
new file mode 100644
index 0000000000..26af45503f
--- /dev/null
+++ b/libavcodec/loongarch/h264idct_loongarch.c
@@ -0,0 +1,184 @@
+/*
+ * Loongson LSX/LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ * Xiwei Gu <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_loongarch.h"
+#include "libavcodec/bit_depth_template.c"
+
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t i;
+
+ for (i = 0; i < 16; i++) {
+ int32_t nnz = nzc[scan8[i]];
+
+ if (nnz == 1 && ((dctcoef *) block)[i * 16]) {
+ ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ } else if (nnz) {
+ ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ }
+}
+
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t cnt;
+
+ for (cnt = 0; cnt < 16; cnt += 4) {
+ int32_t nnz = nzc[scan8[cnt]];
+
+ if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+ ff_h264_idct8_dc_add_8_lsx(dst + blk_offset[cnt],
+ block + cnt * 16 * sizeof(pixel),
+ dst_stride);
+ } else if (nnz) {
+ ff_h264_idct8_add_8_lsx(dst + blk_offset[cnt],
+ block + cnt * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ }
+}
+
+#if HAVE_LASX
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t cnt;
+
+ for (cnt = 0; cnt < 16; cnt += 4) {
+ int32_t nnz = nzc[scan8[cnt]];
+
+ if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+ ff_h264_idct8_dc_add_8_lasx(dst + blk_offset[cnt],
+ block + cnt * 16 * sizeof(pixel),
+ dst_stride);
+ } else if (nnz) {
+ ff_h264_idct8_add_8_lasx(dst + blk_offset[cnt],
+ block + cnt * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ }
+}
+#endif // #if HAVE_LASX
+
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t i;
+
+ for (i = 16; i < 20; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ for (i = 32; i < 36; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+}
+
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t i;
+
+ for (i = 16; i < 20; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ for (i = 20; i < 24; i++) {
+ if (nzc[scan8[i + 4]])
+ ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i + 4],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i + 4],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ for (i = 32; i < 36; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+ for (i = 36; i < 40; i++) {
+ if (nzc[scan8[i + 4]])
+ ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i + 4],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i + 4],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+}
+
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+ int16_t *block, int32_t dst_stride,
+ const uint8_t nzc[15 * 8])
+{
+ int32_t i;
+
+ for (i = 0; i < 16; i++) {
+ if (nzc[scan8[i]])
+ ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+ block + i * 16 * sizeof(pixel), dst_stride);
+ else if (((dctcoef *) block)[i * 16])
+ ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+ block + i * 16 * sizeof(pixel),
+ dst_stride);
+ }
+}
diff --git a/libavcodec/loongarch/loongson_asm.S b/libavcodec/loongarch/loongson_asm.S
new file mode 100644
index 0000000000..0a649f51c7
--- /dev/null
+++ b/libavcodec/loongarch/loongson_asm.S
@@ -0,0 +1,945 @@
+/*
+ * Loongson asm helper.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
+ * Shiyou Yin(yinshiyou-hf@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LML_VERSION_MAJOR 0
+#define LML_VERSION_MINOR 2
+#define LML_VERSION_MICRO 0
+
+/*
+ *============================================================================
+ * Macros for a specific project; set them as needed.
+ * The following LoongML macros are provided for reference.
+ *============================================================================
+ */
+#define ASM_PREF
+#define DEFAULT_ALIGN 5
+
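+/*
+ * Declare a global function symbol; the matching endfunc emits the
+ * return (jirl zero, ra, 0) and sets the symbol size.
+ */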
+.macro function name, align=DEFAULT_ALIGN
+.macro endfunc
+ jirl $r0, $r1, 0x0
+ .size ASM_PREF\name, . - ASM_PREF\name
+ .purgem endfunc
+.endm
+.text ;
+.align \align ;
+.globl ASM_PREF\name ;
+.type ASM_PREF\name, @function ;
+ASM_PREF\name: ;
+.endm
+
+/**
+ * Attention: If align is not zero, the macro will use
+ * t7 until the end of the function.
+ */
+.macro alloc_stack size, align=0
+.if \align
+ .macro clean_stack
+ add.d sp, sp, t7
+ .endm
+ addi.d sp, sp, - \size
+ andi.d t7, sp, \align - 1
+ sub.d sp, sp, t7
+ addi.d t7, t7, \size
+.else
+ .macro clean_stack
+ addi.d sp, sp, \size
+ .endm
+ addi.d sp, sp, - \size
+.endif
+.endm
+
+.macro const name, align=DEFAULT_ALIGN
+ .macro endconst
+ .size \name, . - \name
+ .purgem endconst
+ .endm
+.section .rodata
+.align \align
+\name:
+.endm
+
+/*
+ *============================================================================
+ * LoongArch register alias
+ *============================================================================
+ */
+
+#define a0 $a0
+#define a1 $a1
+#define a2 $a2
+#define a3 $a3
+#define a4 $a4
+#define a5 $a5
+#define a6 $a6
+#define a7 $a7
+
+#define t0 $t0
+#define t1 $t1
+#define t2 $t2
+#define t3 $t3
+#define t4 $t4
+#define t5 $t5
+#define t6 $t6
+#define t7 $t7
+#define t8 $t8
+
+#define s0 $s0
+#define s1 $s1
+#define s2 $s2
+#define s3 $s3
+#define s4 $s4
+#define s5 $s5
+#define s6 $s6
+#define s7 $s7
+#define s8 $s8
+
+#define zero $zero
+#define sp $sp
+#define ra $ra
+
+#define f0 $f0
+#define f1 $f1
+#define f2 $f2
+#define f3 $f3
+#define f4 $f4
+#define f5 $f5
+#define f6 $f6
+#define f7 $f7
+#define f8 $f8
+#define f9 $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#define vr0 $vr0
+#define vr1 $vr1
+#define vr2 $vr2
+#define vr3 $vr3
+#define vr4 $vr4
+#define vr5 $vr5
+#define vr6 $vr6
+#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
+
+#define xr0 $xr0
+#define xr1 $xr1
+#define xr2 $xr2
+#define xr3 $xr3
+#define xr4 $xr4
+#define xr5 $xr5
+#define xr6 $xr6
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
+
+/*
+ *============================================================================
+ * LSX/LASX synthesize instructions
+ *============================================================================
+ */
+
+/*
+ * Description : Dot product of vector elements
+ * Arguments : Inputs - vj, vk
+ * Outputs - vd
+ * Return Type - twice the width of the inputs
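+ * Details : widening multiply of the even-indexed elements followed by
+ * a widening multiply-add of the odd-indexed ones, i.e. a
+ * pairwise dot product into double-width lanes.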
+ */
+.macro vdp2.h.bu vd, vj, vk
+ vmulwev.h.bu \vd, \vj, \vk
+ vmaddwod.h.bu \vd, \vj, \vk
+.endm
+
+.macro vdp2.h.bu.b vd, vj, vk
+ vmulwev.h.bu.b \vd, \vj, \vk
+ vmaddwod.h.bu.b \vd, \vj, \vk
+.endm
+
+.macro vdp2.w.h vd, vj, vk
+ vmulwev.w.h \vd, \vj, \vk
+ vmaddwod.w.h \vd, \vj, \vk
+.endm
+
+.macro xvdp2.h.bu xd, xj, xk
+ xvmulwev.h.bu \xd, \xj, \xk
+ xvmaddwod.h.bu \xd, \xj, \xk
+.endm
+
+.macro xvdp2.h.bu.b xd, xj, xk
+ xvmulwev.h.bu.b \xd, \xj, \xk
+ xvmaddwod.h.bu.b \xd, \xj, \xk
+.endm
+
+.macro xvdp2.w.h xd, xj, xk
+ xvmulwev.w.h \xd, \xj, \xk
+ xvmaddwod.w.h \xd, \xj, \xk
+.endm
+
+/*
+ * Description : Dot product & addition of vector elements
+ * Arguments : Inputs - vj, vk
+ * Outputs - vd
+ * Return Type - twice the size of the input
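+ * Details : same pairwise widening dot product as vdp2, but the result
+ * is accumulated into vd instead of overwriting it.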
+ */
+.macro vdp2add.h.bu vd, vj, vk
+ vmaddwev.h.bu \vd, \vj, \vk
+ vmaddwod.h.bu \vd, \vj, \vk
+.endm
+
+.macro vdp2add.h.bu.b vd, vj, vk
+ vmaddwev.h.bu.b \vd, \vj, \vk
+ vmaddwod.h.bu.b \vd, \vj, \vk
+.endm
+
+.macro vdp2add.w.h vd, vj, vk
+ vmaddwev.w.h \vd, \vj, \vk
+ vmaddwod.w.h \vd, \vj, \vk
+.endm
+
+.macro xvdp2add.h.bu.b xd, xj, xk
+ xvmaddwev.h.bu.b \xd, \xj, \xk
+ xvmaddwod.h.bu.b \xd, \xj, \xk
+.endm
+
+.macro xvdp2add.w.h xd, xj, xk
+ xvmaddwev.w.h \xd, \xj, \xk
+ xvmaddwod.w.h \xd, \xj, \xk
+.endm
+
+/*
+ * Description : Clamp each element of a vector to a range
+ * clip: out = min(max(vj, vk), va)
+ * clip255: out = min(max(vj, 0), 255)
+ */
+.macro vclip.h vd, vj, vk, va
+ vmax.h \vd, \vj, \vk
+ vmin.h \vd, \vd, \va
+.endm
+
+.macro vclip255.w vd, vj
+ vmaxi.w \vd, \vj, 0
+ vsat.wu \vd, \vd, 7
+.endm
+
+.macro vclip255.h vd, vj
+ vmaxi.h \vd, \vj, 0
+ vsat.hu \vd, \vd, 7
+.endm
+
+.macro xvclip.h xd, xj, xk, xa
+ xvmax.h \xd, \xj, \xk
+ xvmin.h \xd, \xd, \xa
+.endm
+
+.macro xvclip255.h xd, xj
+ xvmaxi.h \xd, \xj, 0
+ xvsat.hu \xd, \xd, 7
+.endm
+
+.macro xvclip255.w xd, xj
+ xvmaxi.w \xd, \xj, 0
+ xvsat.wu \xd, \xd, 7
+.endm
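+
+/*
+ * Rough scalar model of vclip255 (illustration only, not part of the
+ * build): vmaxi clamps the lower bound at 0 and vsat.*u saturates to
+ * 8 significant bits, so per element
+ *
+ *     d[i] = x[i] < 0 ? 0 : (x[i] > 255 ? 255 : x[i]);
+ *
+ * vclip.h/xvclip.h clamp to an arbitrary [vk, va] range instead.
+ */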
+
+/*
+ * Description : Store one element of a vector, advancing the pointer
+ * vd : Data vector to be stored
+ * rk : Destination address (updated to rk + ra before the store)
+ * ra : Stride added to rk
+ * si : Index of the element in vd
+ */
+.macro vstelmx.b vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.b \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.h vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.h \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.w vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.w \vd, \rk, 0, \si
+.endm
+
+.macro vstelmx.d vd, rk, ra, si
+ add.d \rk, \rk, \ra
+ vstelm.d \vd, \rk, 0, \si
+.endm
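+
+/*
+ * Rough scalar model of the vstelmx.* macros (illustration only): the
+ * base pointer is advanced first, then a single element is stored, e.g.
+ * for vstelmx.h
+ *
+ *     rk += ra;
+ *     *(int16_t *)rk = vd[si];
+ */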
+
+.macro vmov vd, vj
+ vor.v \vd, \vj, \vj
+.endm
+
+.macro xmov xd, xj
+ xvor.v \xd, \xj, \xj
+.endm
+
+.macro xvstelmx.d xd, rk, ra, si
+ add.d \rk, \rk, \ra
+ xvstelm.d \xd, \rk, 0, \si
+.endm
+
+/*
+ *============================================================================
+ * LSX/LASX custom macros
+ *============================================================================
+ */
+
+/*
+ * Load 4 float, double, 128-bit (LSX) or 256-bit (LASX) elements with a stride.
+ */
+.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ fld.s \out0, \src, 0
+ fldx.s \out1, \src, \stride
+ fldx.s \out2, \src, \stride2
+ fldx.s \out3, \src, \stride3
+.endm
+
+.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ fld.d \out0, \src, 0
+ fldx.d \out1, \src, \stride
+ fldx.d \out2, \src, \stride2
+ fldx.d \out3, \src, \stride3
+.endm
+
+.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ vld \out0, \src, 0
+ vldx \out1, \src, \stride
+ vldx \out2, \src, \stride2
+ vldx \out3, \src, \stride3
+.endm
+
+.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+ xvld \out0, \src, 0
+ xvldx \out1, \src, \stride
+ xvldx \out2, \src, \stride2
+ xvldx \out3, \src, \stride3
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ vilvl.h \tmp0, \in1, \in0
+ vilvl.h \tmp1, \in3, \in2
+ vilvl.w \out0, \tmp1, \tmp0
+ vilvh.w \out2, \tmp1, \tmp0
+ vilvh.d \out1, \out0, \out0
+ vilvh.d \out3, \out0, \out2
+.endm
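+
+/*
+ * Rough C reference of the 4x4 halfword transpose above (illustration
+ * only, not part of the build); the asm reaches the same result with
+ * interleave stages, leaving each transposed row in the low 64 bits of
+ * out0..out3:
+ *
+ *     void transpose4x4_h(const int16_t in[4][4], int16_t out[4][4])
+ *     {
+ *         for (int i = 0; i < 4; i++)
+ *             for (int j = 0; j < 4; j++)
+ *                 out[i][j] = in[j][i];
+ *     }
+ */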
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4 1, 5, 9,13
+ * 5, 6, 7, 8 to 2, 6,10,14
+ * 9,10,11,12 =====> 3, 7,11,15
+ * 13,14,15,16 4, 8,12,16
+ */
+.macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+ _tmp0, _tmp1
+
+ vilvl.w \_tmp0, \_in1, \_in0
+ vilvh.w \_out1, \_in1, \_in0
+ vilvl.w \_tmp1, \_in3, \_in2
+ vilvh.w \_out3, \_in3, \_in2
+
+ vilvl.d \_out0, \_tmp1, \_tmp0
+ vilvl.d \_out2, \_out3, \_out1
+ vilvh.d \_out3, \_out3, \_out1
+ vilvh.d \_out1, \_tmp1, \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
+ tmp3, tmp4, tmp5, tmp6, tmp7
+ vilvl.h \tmp0, \in6, \in4
+ vilvl.h \tmp1, \in7, \in5
+ vilvl.h \tmp2, \in2, \in0
+ vilvl.h \tmp3, \in3, \in1
+
+ vilvl.h \tmp4, \tmp1, \tmp0
+ vilvh.h \tmp5, \tmp1, \tmp0
+ vilvl.h \tmp6, \tmp3, \tmp2
+ vilvh.h \tmp7, \tmp3, \tmp2
+
+ vilvh.h \tmp0, \in6, \in4
+ vilvh.h \tmp1, \in7, \in5
+ vilvh.h \tmp2, \in2, \in0
+ vilvh.h \tmp3, \in3, \in1
+
+ vpickev.d \out0, \tmp4, \tmp6
+ vpickod.d \out1, \tmp4, \tmp6
+ vpickev.d \out2, \tmp5, \tmp7
+ vpickod.d \out3, \tmp5, \tmp7
+
+ vilvl.h \tmp4, \tmp1, \tmp0
+ vilvh.h \tmp5, \tmp1, \tmp0
+ vilvl.h \tmp6, \tmp3, \tmp2
+ vilvh.h \tmp7, \tmp3, \tmp2
+
+ vpickev.d \out4, \tmp4, \tmp6
+ vpickod.d \out5, \tmp4, \tmp6
+ vpickev.d \out6, \tmp5, \tmp7
+ vpickod.d \out7, \tmp5, \tmp7
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7,\
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ xvilvl.b \tmp0, \in2, \in0
+ xvilvl.b \tmp1, \in3, \in1
+ xvilvl.b \tmp2, \in6, \in4
+ xvilvl.b \tmp3, \in7, \in5
+ xvilvl.b \tmp4, \in10, \in8
+ xvilvl.b \tmp5, \in11, \in9
+ xvilvl.b \tmp6, \in14, \in12
+ xvilvl.b \tmp7, \in15, \in13
+ xvilvl.b \out0, \tmp1, \tmp0
+ xvilvh.b \out1, \tmp1, \tmp0
+ xvilvl.b \out2, \tmp3, \tmp2
+ xvilvh.b \out3, \tmp3, \tmp2
+ xvilvl.b \out4, \tmp5, \tmp4
+ xvilvh.b \out5, \tmp5, \tmp4
+ xvilvl.b \out6, \tmp7, \tmp6
+ xvilvh.b \out7, \tmp7, \tmp6
+ xvilvl.w \tmp0, \out2, \out0
+ xvilvh.w \tmp2, \out2, \out0
+ xvilvl.w \tmp4, \out3, \out1
+ xvilvh.w \tmp6, \out3, \out1
+ xvilvl.w \tmp1, \out6, \out4
+ xvilvh.w \tmp3, \out6, \out4
+ xvilvl.w \tmp5, \out7, \out5
+ xvilvh.w \tmp7, \out7, \out5
+ xvilvl.d \out0, \tmp1, \tmp0
+ xvilvh.d \out1, \tmp1, \tmp0
+ xvilvl.d \out2, \tmp3, \tmp2
+ xvilvh.d \out3, \tmp3, \tmp2
+ xvilvl.d \out4, \tmp5, \tmp4
+ xvilvh.d \out5, \tmp5, \tmp4
+ xvilvl.d \out6, \tmp7, \tmp6
+ xvilvh.d \out7, \tmp7, \tmp6
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7,\
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ vilvl.b \tmp0, \in2, \in0
+ vilvl.b \tmp1, \in3, \in1
+ vilvl.b \tmp2, \in6, \in4
+ vilvl.b \tmp3, \in7, \in5
+ vilvl.b \tmp4, \in10, \in8
+ vilvl.b \tmp5, \in11, \in9
+ vilvl.b \tmp6, \in14, \in12
+ vilvl.b \tmp7, \in15, \in13
+
+ vilvl.b \out0, \tmp1, \tmp0
+ vilvh.b \out1, \tmp1, \tmp0
+ vilvl.b \out2, \tmp3, \tmp2
+ vilvh.b \out3, \tmp3, \tmp2
+ vilvl.b \out4, \tmp5, \tmp4
+ vilvh.b \out5, \tmp5, \tmp4
+ vilvl.b \out6, \tmp7, \tmp6
+ vilvh.b \out7, \tmp7, \tmp6
+ vilvl.w \tmp0, \out2, \out0
+ vilvh.w \tmp2, \out2, \out0
+ vilvl.w \tmp4, \out3, \out1
+ vilvh.w \tmp6, \out3, \out1
+ vilvl.w \tmp1, \out6, \out4
+ vilvh.w \tmp3, \out6, \out4
+ vilvl.w \tmp5, \out7, \out5
+ vilvh.w \tmp7, \out7, \out5
+ vilvl.d \out0, \tmp1, \tmp0
+ vilvh.d \out1, \tmp1, \tmp0
+ vilvl.d \out2, \tmp3, \tmp2
+ vilvh.d \out3, \tmp3, \tmp2
+ vilvl.d \out4, \tmp5, \tmp4
+ vilvh.d \out5, \tmp5, \tmp4
+ vilvl.d \out6, \tmp7, \tmp6
+ vilvh.d \out7, \tmp7, \tmp6
+.endm
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.h \tmp0, \in1, \in0
+ xvilvl.h \tmp1, \in3, \in2
+ xvilvl.w \out0, \tmp1, \tmp0
+ xvilvh.w \out2, \tmp1, \tmp0
+ xvilvh.d \out1, \out0, \out0
+ xvilvh.d \out3, \out0, \out2
+.endm
+
+/*
+ * Description : Transpose 4x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1
+ xvilvl.h \tmp0, \in2, \in0
+ xvilvl.h \tmp1, \in3, \in1
+ xvilvl.h \out2, \tmp1, \tmp0
+ xvilvh.h \out3, \tmp1, \tmp0
+
+ xvilvl.d \out0, \out2, \out2
+ xvilvh.d \out1, \out2, \out2
+ xvilvl.d \out2, \out3, \out3
+ xvilvh.d \out3, \out3, \out3
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+ xvilvl.h \tmp0, \in6, \in4
+ xvilvl.h \tmp1, \in7, \in5
+ xvilvl.h \tmp2, \in2, \in0
+ xvilvl.h \tmp3, \in3, \in1
+
+ xvilvl.h \tmp4, \tmp1, \tmp0
+ xvilvh.h \tmp5, \tmp1, \tmp0
+ xvilvl.h \tmp6, \tmp3, \tmp2
+ xvilvh.h \tmp7, \tmp3, \tmp2
+
+ xvilvh.h \tmp0, \in6, \in4
+ xvilvh.h \tmp1, \in7, \in5
+ xvilvh.h \tmp2, \in2, \in0
+ xvilvh.h \tmp3, \in3, \in1
+
+ xvpickev.d \out0, \tmp4, \tmp6
+ xvpickod.d \out1, \tmp4, \tmp6
+ xvpickev.d \out2, \tmp5, \tmp7
+ xvpickod.d \out3, \tmp5, \tmp7
+
+ xvilvl.h \tmp4, \tmp1, \tmp0
+ xvilvh.h \tmp5, \tmp1, \tmp0
+ xvilvl.h \tmp6, \tmp3, \tmp2
+ xvilvh.h \tmp7, \tmp3, \tmp2
+
+ xvpickev.d \out4, \tmp4, \tmp6
+ xvpickod.d \out5, \tmp4, \tmp6
+ xvpickev.d \out6, \tmp5, \tmp7
+ xvpickod.d \out7, \tmp5, \tmp7
+.endm
+
+/*
+ * Description : Transpose 2x4x4 block with half-word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+ tmp0, tmp1, tmp2
+ xvilvh.h \tmp1, \in0, \in1
+ xvilvl.h \out1, \in0, \in1
+ xvilvh.h \tmp0, \in2, \in3
+ xvilvl.h \out3, \in2, \in3
+
+ xvilvh.w \tmp2, \out3, \out1
+ xvilvl.w \out3, \out3, \out1
+
+ xvilvl.w \out2, \tmp0, \tmp1
+ xvilvh.w \tmp1, \tmp0, \tmp1
+
+ xvilvh.d \out0, \out2, \out3
+ xvilvl.d \out2, \out2, \out3
+ xvilvh.d \out1, \tmp1, \tmp2
+ xvilvl.d \out3, \tmp1, \tmp2
+.endm
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1, out2, out3
+ * Details :
+ * Example :
+ * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13
+ * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14
+ * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15
+ * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16
+ */
+.macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+ _tmp0, _tmp1
+
+ xvilvl.w \_tmp0, \_in1, \_in0
+ xvilvh.w \_out1, \_in1, \_in0
+ xvilvl.w \_tmp1, \_in3, \_in2
+ xvilvh.w \_out3, \_in3, \_in2
+
+ xvilvl.d \_out0, \_tmp1, \_tmp0
+ xvilvl.d \_out2, \_out3, \_out1
+ xvilvh.d \_out3, \_out3, \_out1
+ xvilvh.d \_out1, \_tmp1, \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ * _out7
+ * Example : LASX_TRANSPOSE8x8_W
+ * _in0 : 1,2,3,4,5,6,7,8
+ * _in1 : 2,2,3,4,5,6,7,8
+ * _in2 : 3,2,3,4,5,6,7,8
+ * _in3 : 4,2,3,4,5,6,7,8
+ * _in4 : 5,2,3,4,5,6,7,8
+ * _in5 : 6,2,3,4,5,6,7,8
+ * _in6 : 7,2,3,4,5,6,7,8
+ * _in7 : 8,2,3,4,5,6,7,8
+ *
+ * _out0 : 1,2,3,4,5,6,7,8
+ * _out1 : 2,2,2,2,2,2,2,2
+ * _out2 : 3,3,3,3,3,3,3,3
+ * _out3 : 4,4,4,4,4,4,4,4
+ * _out4 : 5,5,5,5,5,5,5,5
+ * _out5 : 6,6,6,6,6,6,6,6
+ * _out6 : 7,7,7,7,7,7,7,7
+ * _out7 : 8,8,8,8,8,8,8,8
+ */
+.macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,\
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7,\
+ _tmp0, _tmp1, _tmp2, _tmp3
+ xvilvl.w \_tmp0, \_in2, \_in0
+ xvilvl.w \_tmp1, \_in3, \_in1
+ xvilvh.w \_tmp2, \_in2, \_in0
+ xvilvh.w \_tmp3, \_in3, \_in1
+ xvilvl.w \_out0, \_tmp1, \_tmp0
+ xvilvh.w \_out1, \_tmp1, \_tmp0
+ xvilvl.w \_out2, \_tmp3, \_tmp2
+ xvilvh.w \_out3, \_tmp3, \_tmp2
+
+ xvilvl.w \_tmp0, \_in6, \_in4
+ xvilvl.w \_tmp1, \_in7, \_in5
+ xvilvh.w \_tmp2, \_in6, \_in4
+ xvilvh.w \_tmp3, \_in7, \_in5
+ xvilvl.w \_out4, \_tmp1, \_tmp0
+ xvilvh.w \_out5, \_tmp1, \_tmp0
+ xvilvl.w \_out6, \_tmp3, \_tmp2
+ xvilvh.w \_out7, \_tmp3, \_tmp2
+
+ xmov \_tmp0, \_out0
+ xmov \_tmp1, \_out1
+ xmov \_tmp2, \_out2
+ xmov \_tmp3, \_out3
+ xvpermi.q \_out0, \_out4, 0x02
+ xvpermi.q \_out1, \_out5, 0x02
+ xvpermi.q \_out2, \_out6, 0x02
+ xvpermi.q \_out3, \_out7, 0x02
+ xvpermi.q \_out4, \_tmp0, 0x31
+ xvpermi.q \_out5, \_tmp1, 0x31
+ xvpermi.q \_out6, \_tmp2, 0x31
+ xvpermi.q \_out7, \_tmp3, 0x31
+.endm
+
+/*
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Example : LASX_TRANSPOSE4x4_D
+ * _in0 : 1,2,3,4
+ * _in1 : 1,2,3,4
+ * _in2 : 1,2,3,4
+ * _in3 : 1,2,3,4
+ *
+ * _out0 : 1,1,1,1
+ * _out1 : 2,2,2,2
+ * _out2 : 3,3,3,3
+ * _out3 : 4,4,4,4
+ */
+.macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+ _tmp0, _tmp1
+ xvilvl.d \_tmp0, \_in1, \_in0
+ xvilvh.d \_out1, \_in1, \_in0
+ xvilvh.d \_tmp1, \_in3, \_in2
+ xvilvl.d \_out2, \_in3, \_in2
+
+ xvor.v \_out0, \_tmp0, \_tmp0
+ xvor.v \_out3, \_tmp1, \_tmp1
+
+ xvpermi.q \_out0, \_out2, 0x02
+ xvpermi.q \_out2, \_tmp0, 0x31
+ xvpermi.q \_out3, \_out1, 0x31
+ xvpermi.q \_out1, \_tmp1, 0x02
+.endm
+
+/*
+ * Description : Butterfly of 4 input vectors
+ * Arguments : Inputs - _in0, _in1, _in2, _in3
+ * Outputs - _out0, _out1, _out2, _out3
+ * Details : Butterfly operation
+ * Example : LSX_BUTTERFLY_4
+ * _out0 = _in0 + _in3;
+ * _out1 = _in1 + _in2;
+ * _out2 = _in1 - _in2;
+ * _out3 = _in0 - _in3;
+ */
+.macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ vadd.b \_out0, \_in0, \_in3
+ vadd.b \_out1, \_in1, \_in2
+ vsub.b \_out2, \_in1, \_in2
+ vsub.b \_out3, \_in0, \_in3
+.endm
+.macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ vadd.h \_out0, \_in0, \_in3
+ vadd.h \_out1, \_in1, \_in2
+ vsub.h \_out2, \_in1, \_in2
+ vsub.h \_out3, \_in0, \_in3
+.endm
+.macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ vadd.w \_out0, \_in0, \_in3
+ vadd.w \_out1, \_in1, \_in2
+ vsub.w \_out2, \_in1, \_in2
+ vsub.w \_out3, \_in0, \_in3
+.endm
+.macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ vadd.d \_out0, \_in0, \_in3
+ vadd.d \_out1, \_in1, \_in2
+ vsub.d \_out2, \_in1, \_in2
+ vsub.d \_out3, \_in0, \_in3
+.endm
+
+.macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ xvadd.b \_out0, \_in0, \_in3
+ xvadd.b \_out1, \_in1, \_in2
+ xvsub.b \_out2, \_in1, \_in2
+ xvsub.b \_out3, \_in0, \_in3
+.endm
+.macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ xvadd.h \_out0, \_in0, \_in3
+ xvadd.h \_out1, \_in1, \_in2
+ xvsub.h \_out2, \_in1, \_in2
+ xvsub.h \_out3, \_in0, \_in3
+.endm
+.macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ xvadd.w \_out0, \_in0, \_in3
+ xvadd.w \_out1, \_in1, \_in2
+ xvsub.w \_out2, \_in1, \_in2
+ xvsub.w \_out3, \_in0, \_in3
+.endm
+.macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+ xvadd.d \_out0, \_in0, \_in3
+ xvadd.d \_out1, \_in1, \_in2
+ xvsub.d \_out2, \_in1, \_in2
+ xvsub.d \_out3, \_in0, \_in3
+.endm
+
+/*
+ * Description : Butterfly of 8 input vectors
+ * Arguments : Inputs - _in0, _in1, ..., _in7
+ * Outputs - _out0, _out1, ..., _out7
+ * Details : Butterfly operation
+ * Example : LASX_BUTTERFLY_8
+ * _out0 = _in0 + _in7;
+ * _out1 = _in1 + _in6;
+ * _out2 = _in2 + _in5;
+ * _out3 = _in3 + _in4;
+ * _out4 = _in3 - _in4;
+ * _out5 = _in2 - _in5;
+ * _out6 = _in1 - _in6;
+ * _out7 = _in0 - _in7;
+ */
+.macro LSX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ vadd.b \_out0, \_in0, \_in7
+ vadd.b \_out1, \_in1, \_in6
+ vadd.b \_out2, \_in2, \_in5
+ vadd.b \_out3, \_in3, \_in4
+ vsub.b \_out4, \_in3, \_in4
+ vsub.b \_out5, \_in2, \_in5
+ vsub.b \_out6, \_in1, \_in6
+ vsub.b \_out7, \_in0, \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ vadd.h \_out0, \_in0, \_in7
+ vadd.h \_out1, \_in1, \_in6
+ vadd.h \_out2, \_in2, \_in5
+ vadd.h \_out3, \_in3, \_in4
+ vsub.h \_out4, \_in3, \_in4
+ vsub.h \_out5, \_in2, \_in5
+ vsub.h \_out6, \_in1, \_in6
+ vsub.h \_out7, \_in0, \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ vadd.w \_out0, \_in0, \_in7
+ vadd.w \_out1, \_in1, \_in6
+ vadd.w \_out2, \_in2, \_in5
+ vadd.w \_out3, \_in3, \_in4
+ vsub.w \_out4, \_in3, \_in4
+ vsub.w \_out5, \_in2, \_in5
+ vsub.w \_out6, \_in1, \_in6
+ vsub.w \_out7, \_in0, \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_D _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ vadd.d \_out0, \_in0, \_in7
+ vadd.d \_out1, \_in1, \_in6
+ vadd.d \_out2, \_in2, \_in5
+ vadd.d \_out3, \_in3, \_in4
+ vsub.d \_out4, \_in3, \_in4
+ vsub.d \_out5, \_in2, \_in5
+ vsub.d \_out6, \_in1, \_in6
+ vsub.d \_out7, \_in0, \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ xvadd.b \_out0, \_in0, \_in7
+ xvadd.b \_out1, \_in1, \_in6
+ xvadd.b \_out2, \_in2, \_in5
+ xvadd.b \_out3, \_in3, \_in4
+ xvsub.b \_out4, \_in3, \_in4
+ xvsub.b \_out5, \_in2, \_in5
+ xvsub.b \_out6, \_in1, \_in6
+ xvsub.b \_out7, \_in0, \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ xvadd.h \_out0, \_in0, \_in7
+ xvadd.h \_out1, \_in1, \_in6
+ xvadd.h \_out2, \_in2, \_in5
+ xvadd.h \_out3, \_in3, \_in4
+ xvsub.h \_out4, \_in3, \_in4
+ xvsub.h \_out5, \_in2, \_in5
+ xvsub.h \_out6, \_in1, \_in6
+ xvsub.h \_out7, \_in0, \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+ _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ xvadd.w \_out0, \_in0, \_in7
+ xvadd.w \_out1, \_in1, \_in6
+ xvadd.w \_out2, \_in2, \_in5
+ xvadd.w \_out3, \_in3, \_in4
+ xvsub.w \_out4, \_in3, \_in4
+ xvsub.w \_out5, \_in2, \_in5
+ xvsub.w \_out6, \_in1, \_in6
+ xvsub.w \_out7, \_in0, \_in7
+.endm
--
2.20.1
* [FFmpeg-devel] [PATCH v5 2/7] avcodec/la: Add LSX optimization for loop filter.
2023-05-25 7:24 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 1/7] avcodec/la: add LSX optimization for h264 idct Hao Chen
@ 2023-05-25 7:24 ` Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 3/7] avcodec/la: Add LSX optimization for h264 chroma and intrapred Hao Chen
` (4 subsequent siblings)
6 siblings, 0 replies; 17+ messages in thread
From: Hao Chen @ 2023-05-25 7:24 UTC (permalink / raw)
To: ffmpeg-devel
Replaced functions (LSX is sufficient for these functions):
ff_h264_v_lpf_chroma_8_lasx
ff_h264_h_lpf_chroma_8_lasx
ff_h264_v_lpf_chroma_intra_8_lasx
ff_h264_h_lpf_chroma_intra_8_lasx
ff_weight_h264_pixels4_8_lasx
ff_biweight_h264_pixels4_8_lasx
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 161fps
after: 199fps
---
libavcodec/loongarch/Makefile | 3 +-
libavcodec/loongarch/h264dsp.S | 1977 +++++++++++++++++
libavcodec/loongarch/h264dsp_init_loongarch.c | 37 +-
libavcodec/loongarch/h264dsp_lasx.c | 1354 +----------
libavcodec/loongarch/h264dsp_loongarch.h | 67 +-
5 files changed, 2063 insertions(+), 1375 deletions(-)
create mode 100644 libavcodec/loongarch/h264dsp.S
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 34ebbbe133..111bc23e4e 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -31,4 +31,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_mc_uni_lsx.o \
loongarch/hevc_mc_uniw_lsx.o
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
- loongarch/h264idct_loongarch.o
+ loongarch/h264idct_loongarch.o \
+ loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/h264dsp.S b/libavcodec/loongarch/h264dsp.S
new file mode 100644
index 0000000000..750fe49143
--- /dev/null
+++ b/libavcodec/loongarch/h264dsp.S
@@ -0,0 +1,1977 @@
+/*
+ * Loongson LSX/LASX optimized h264dsp
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+const vec_shuf
+.rept 2
+.byte 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
+.endr
+endconst
+
+.macro AVC_LPF_P1_OR_Q1 _in0, _in1, _in2, _in3, _in4, _in5, _out, _tmp0, _tmp1
+ vavgr.hu \_tmp0, \_in0, \_in1
+ vslli.h \_tmp1, \_in2, 1
+ vsub.h \_tmp0, \_tmp0, \_tmp1
+ vavg.h \_tmp0, \_in3, \_tmp0
+ vclip.h \_tmp0, \_tmp0, \_in4, \_in5
+ vadd.h \_out, \_in2, \_tmp0
+.endm
+
+.macro AVC_LPF_P0Q0 _in0, _in1, _in2, _in3, _in4, _in5, _out0, \
+ _out1, _tmp0, _tmp1
+ vsub.h \_tmp0, \_in0, \_in1
+ vsub.h \_tmp1, \_in2, \_in3
+ vslli.h \_tmp0, \_tmp0, 2
+ vaddi.hu \_tmp1, \_tmp1, 4
+ vadd.h \_tmp0, \_tmp0, \_tmp1
+ vsrai.h \_tmp0, \_tmp0, 3
+ vclip.h \_tmp0, \_tmp0, \_in4, \_in5
+ vadd.h \_out0, \_in1, \_tmp0
+ vsub.h \_out1, \_in0, \_tmp0
+ vclip255.h \_out0, \_out0
+ vclip255.h \_out1, \_out1
+.endm
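+
+/*
+ * Rough scalar model of the two helpers above (illustration only, not
+ * part of the build; variable names are illustrative).  This is the
+ * normal-strength H.264 filter core, cf. the C template in
+ * libavcodec/h264dsp_template.c:
+ *
+ *     // AVC_LPF_P0Q0 with (q0, p0, p1, q1, -tc, tc):
+ *     int d   = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
+ *     int p0n = av_clip_uint8(p0 + d);
+ *     int q0n = av_clip_uint8(q0 - d);
+ *
+ *     // AVC_LPF_P1_OR_Q1 with (p0, q0, p1, p2, -tc, tc); the q side is
+ *     // the symmetric call with q1/q2:
+ *     int p1n = p1 + av_clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1,
+ *                            -tc, tc);
+ */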
+
+.macro SAVE_REG
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+.endm
+
+.macro RESTORE_REG
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+.endm
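+
+/*
+ * $f24-$f31 are callee-saved in the LoongArch calling convention (only
+ * their low 64 bits, hence fst.d/fld.d is enough), so functions below
+ * that clobber vr24-vr31 wrap their bodies in SAVE_REG/RESTORE_REG.
+ */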
+
+.macro load_double _in0, _in1, _in2, _in3, _src, _str0, _str1, _str2
+ fld.d \_in0, \_src, 0
+ fldx.d \_in1, \_src, \_str0
+ fldx.d \_in2, \_src, \_str1
+ fldx.d \_in3, \_src, \_str2
+.endm
+
+.macro store_double _in0, _in1, _in2, _in3, _dst, _str0, _str1, _str2
+ fst.d \_in0, \_dst, 0
+ fstx.d \_in1, \_dst, \_str0
+ fstx.d \_in2, \_dst, \_str1
+ fstx.d \_in3, \_dst, \_str2
+.endm
+
+function ff_h264_h_lpf_luma_8_lsx
+ slli.d t0, a1, 1 //img_width_2x
+ slli.d t1, a1, 2 //img_width_4x
+ slli.d t2, a1, 3 //img_width_8x
+ SAVE_REG
+ la.local t4, vec_shuf
+ add.d t3, t0, a1 //img_width_3x
+ vldrepl.w vr0, a4, 0 //tmp_vec0
+ vld vr1, t4, 0 //tc_vec
+ vshuf.b vr1, vr0, vr0, vr1 //tc_vec
+ vslti.b vr2, vr1, 0
+ vxori.b vr2, vr2, 255
+ vandi.b vr2, vr2, 1 //bs_vec
+ vsetnez.v $fcc0, vr2
+ bceqz $fcc0, .END_LUMA_8
+ vldi vr0, 0 //zero
+ addi.d t4, a0, -4 //src
+ vslt.bu vr3, vr0, vr2 //is_bs_greater_than0
+ add.d t5, t4, t2 //src_tmp
+ vld vr4, t4, 0 //row0
+ vldx vr5, t4, a1 //row1
+ vldx vr6, t4, t0 //row2
+ vldx vr7, t4, t3 //row3
+ add.d t6, t4, t1 // src += img_width_4x
+ vld vr8, t6, 0 //row4
+ vldx vr9, t6, a1 //row5
+ vldx vr10, t6, t0 //row6
+ vldx vr11, t6, t3 //row7
+ vld vr12, t5, 0 //row8
+ vldx vr13, t5, a1 //row9
+ vldx vr14, t5, t0 //row10
+ vldx vr15, t5, t3 //row11
+ add.d t6, t5, t1 // src_tmp += img_width_4x
+ vld vr16, t6, 0 //row12
+ vldx vr17, t6, a1 //row13
+ vldx vr18, t6, t0 //row14
+ vldx vr19, t6, t3 //row15
+ LSX_TRANSPOSE16X8_B vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11, \
+ vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \
+ vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17, \
+ vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
+ //vr10: p3_org, vr11: p2_org, vr12: p1_org, vr13: p0_org
+ //vr14: q0_org, vr15: q1_org, vr16: q2_org, vr17: q3_org
+ vabsd.bu vr20, vr13, vr14 //p0_asub_q0
+ vabsd.bu vr21, vr12, vr13 //p1_asub_p0
+ vabsd.bu vr22, vr15, vr14 //q1_asub_q0
+
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+
+ vslt.bu vr6, vr20, vr4 //is_less_than_alpha
+ vslt.bu vr7, vr21, vr5 //is_less_than_beta
+ vand.v vr8, vr6, vr7 //is_less_than
+ vslt.bu vr7, vr22, vr5 //is_less_than_beta
+ vand.v vr8, vr7, vr8 //is_less_than
+ vand.v vr8, vr8, vr3 //is_less_than
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_LUMA_8
+ vneg.b vr9, vr1 //neg_tc_h
+ vsllwil.hu.bu vr18, vr1, 0 //tc_h.0
+ vexth.hu.bu vr19, vr1 //tc_h.1
+ vexth.h.b vr2, vr9 //neg_tc_h.1
+ vsllwil.h.b vr9, vr9, 0 //neg_tc_h.0
+
+ vsllwil.hu.bu vr23, vr12, 0 //p1_org_h.0
+ vexth.hu.bu vr3, vr12 //p1_org_h.1
+ vsllwil.hu.bu vr24, vr13, 0 //p0_org_h.0
+ vexth.hu.bu vr4, vr13 //p0_org_h.1
+ vsllwil.hu.bu vr25, vr14, 0 //q0_org_h.0
+ vexth.hu.bu vr6, vr14 //q0_org_h.1
+
+ vabsd.bu vr0, vr11, vr13 //p2_asub_p0
+ vslt.bu vr7, vr0, vr5
+ vand.v vr7, vr8, vr7 //is_less_than_beta
+ vsetnez.v $fcc0, vr7
+ bceqz $fcc0, .END_LUMA_BETA
+ vsllwil.hu.bu vr26, vr11, 0 //p2_org_h.0
+ vexth.hu.bu vr0, vr11 //p2_org_h.1
+ AVC_LPF_P1_OR_Q1 vr24, vr25, vr23, vr26, vr9, vr18, vr27, vr28, vr29
+ AVC_LPF_P1_OR_Q1 vr4, vr6, vr3, vr0, vr2, vr19, vr28, vr29, vr30
+ vpickev.b vr27, vr28, vr27
+ vbitsel.v vr12, vr12, vr27, vr7
+ vandi.b vr7, vr7, 1
+ vadd.b vr1, vr1, vr7
+.END_LUMA_BETA:
+ vabsd.bu vr26, vr16, vr14 //q2_asub_q0
+ vslt.bu vr7, vr26, vr5
+ vand.v vr7, vr7, vr8
+ vsllwil.hu.bu vr27, vr15, 0 //q1_org_h.0
+ vexth.hu.bu vr26, vr15 //q1_org_h.1
+ vsetnez.v $fcc0, vr7
+ bceqz $fcc0, .END_LUMA_BETA_SEC
+ vsllwil.hu.bu vr28, vr16, 0 //q2_org_h.0
+ vexth.hu.bu vr0, vr16 //q2_org_h.1
+ AVC_LPF_P1_OR_Q1 vr24, vr25, vr27, vr28, vr9, vr18, vr29, vr30, vr31
+ AVC_LPF_P1_OR_Q1 vr4, vr6, vr26, vr0, vr2, vr19, vr22, vr30, vr31
+ vpickev.b vr29, vr22, vr29
+ vbitsel.v vr15, vr15, vr29, vr7
+ vandi.b vr7, vr7, 1
+ vadd.b vr1, vr1, vr7
+.END_LUMA_BETA_SEC:
+ vneg.b vr22, vr1 //neg_thresh_h
+ vsllwil.h.b vr28, vr22, 0 //neg_thresh_h.0
+ vexth.h.b vr29, vr22 //neg_thresh_h.1
+ vsllwil.hu.bu vr18, vr1, 0 //tc_h.0
+ vexth.hu.bu vr1, vr1 //tc_h.1
+ AVC_LPF_P0Q0 vr25, vr24, vr23, vr27, vr28, vr18, vr30, vr31, vr0, vr2
+ AVC_LPF_P0Q0 vr6, vr4, vr3, vr26, vr29, vr1, vr20, vr21, vr0, vr2
+ vpickev.b vr30, vr20, vr30 //p0_h
+ vpickev.b vr31, vr21, vr31 //q0_h
+ vbitsel.v vr13, vr13, vr30, vr8 //p0_org
+ vbitsel.v vr14, vr14, vr31, vr8 //q0_org
+
+ vilvl.b vr4, vr12, vr10 // row0.0
+ vilvl.b vr5, vr16, vr14 // row0.1
+ vilvl.b vr6, vr13, vr11 // row2.0
+ vilvl.b vr7, vr17, vr15 // row2.1
+
+ vilvh.b vr8, vr12, vr10 // row1.0
+ vilvh.b vr9, vr16, vr14 // row1.1
+ vilvh.b vr10, vr13, vr11 // row3.0
+ vilvh.b vr11, vr17, vr15 // row3.1
+
+ vilvl.b vr12, vr6, vr4 // row4.0
+ vilvl.b vr13, vr7, vr5 // row4.1
+ vilvl.b vr14, vr10, vr8 // row6.0
+ vilvl.b vr15, vr11, vr9 // row6.1
+
+ vilvh.b vr16, vr6, vr4 // row5.0
+ vilvh.b vr17, vr7, vr5 // row5.1
+ vilvh.b vr18, vr10, vr8 // row7.0
+ vilvh.b vr19, vr11, vr9 // row7.1
+
+ vilvl.w vr4, vr13, vr12 // row4: 0, 4, 1, 5
+ vilvh.w vr5, vr13, vr12 // row4: 2, 6, 3, 7
+ vilvl.w vr6, vr17, vr16 // row5: 0, 4, 1, 5
+ vilvh.w vr7, vr17, vr16 // row5: 2, 6, 3, 7
+
+ vilvl.w vr8, vr15, vr14 // row6: 0, 4, 1, 5
+ vilvh.w vr9, vr15, vr14 // row6: 2, 6, 3, 7
+ vilvl.w vr10, vr19, vr18 // row7: 0, 4, 1, 5
+ vilvh.w vr11, vr19, vr18 // row7: 2, 6, 3, 7
+
+ vbsrl.v vr20, vr4, 8
+ vbsrl.v vr21, vr5, 8
+ vbsrl.v vr22, vr6, 8
+ vbsrl.v vr23, vr7, 8
+
+ vbsrl.v vr24, vr8, 8
+ vbsrl.v vr25, vr9, 8
+ vbsrl.v vr26, vr10, 8
+ vbsrl.v vr27, vr11, 8
+
+ store_double f4, f20, f5, f21, t4, a1, t0, t3
+ add.d t4, t4, t1
+ store_double f6, f22, f7, f23, t4, a1, t0, t3
+ add.d t4, t4, t1
+ store_double f8, f24, f9, f25, t4, a1, t0, t3
+ add.d t4, t4, t1
+ store_double f10, f26, f11, f27, t4, a1, t0, t3
+.END_LUMA_8:
+ RESTORE_REG
+endfunc
+
+function ff_h264_v_lpf_luma_8_lsx
+ slli.d t0, a1, 1 //img_width_2x
+ la.local t4, vec_shuf
+ vldrepl.w vr0, a4, 0 //tmp_vec0
+ vld vr1, t4, 0 //tc_vec
+ add.d t1, t0, a1 //img_width_3x
+ vshuf.b vr1, vr0, vr0, vr1 //tc_vec
+ addi.d sp, sp, -24
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ vslti.b vr2, vr1, 0
+ vxori.b vr2, vr2, 255
+ vandi.b vr2, vr2, 1 //bs_vec
+ vsetnez.v $fcc0, vr2
+ bceqz $fcc0, .END_V_LUMA_8
+ sub.d t2, a0, t1 //data - img_width_3x
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+ vldi vr0, 0 //zero
+ vld vr10, t2, 0 //p2_org
+ vldx vr11, t2, a1 //p1_org
+ vldx vr12, t2, t0 //p0_org
+ vld vr13, a0, 0 //q0_org
+ vldx vr14, a0, a1 //q1_org
+
+ vslt.bu vr0, vr0, vr2 //is_bs_greater_than0
+ vabsd.bu vr16, vr11, vr12 //p1_asub_p0
+ vabsd.bu vr15, vr12, vr13 //p0_asub_q0
+ vabsd.bu vr17, vr14, vr13 //q1_asub_q0
+
+ vslt.bu vr6, vr15, vr4 //is_less_than_alpha
+ vslt.bu vr7, vr16, vr5 //is_less_than_beta
+ vand.v vr8, vr6, vr7 //is_less_than
+ vslt.bu vr7, vr17, vr5 //is_less_than_beta
+ vand.v vr8, vr7, vr8
+ vand.v vr8, vr8, vr0 //is_less_than
+
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_V_LUMA_8
+ vldx vr15, a0, t0 //q2_org
+ vneg.b vr0, vr1 //neg_tc_h
+ vsllwil.h.b vr18, vr1, 0 //tc_h.0
+ vexth.h.b vr19, vr1 //tc_h.1
+ vsllwil.h.b vr9, vr0, 0 //neg_tc_h.0
+ vexth.h.b vr2, vr0 //neg_tc_h.1
+
+ vsllwil.hu.bu vr16, vr11, 0 //p1_org_h.0
+ vexth.hu.bu vr17, vr11 //p1_org_h.1
+ vsllwil.hu.bu vr20, vr12, 0 //p0_org_h.0
+ vexth.hu.bu vr21, vr12 //p0_org_h.1
+ vsllwil.hu.bu vr22, vr13, 0 //q0_org_h.0
+ vexth.hu.bu vr23, vr13 //q0_org_h.1
+
+ vabsd.bu vr0, vr10, vr12 //p2_asub_p0
+ vslt.bu vr7, vr0, vr5 //is_less_than_beta
+ vand.v vr7, vr7, vr8 //is_less_than_beta
+
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_V_LESS_BETA
+ vsllwil.hu.bu vr3, vr10, 0 //p2_org_h.0
+ vexth.hu.bu vr4, vr10 //p2_org_h.1
+ AVC_LPF_P1_OR_Q1 vr20, vr22, vr16, vr3, vr9, vr18, vr24, vr0, vr26
+ AVC_LPF_P1_OR_Q1 vr21, vr23, vr17, vr4, vr2, vr19, vr25, vr0, vr26
+ vpickev.b vr24, vr25, vr24
+ vbitsel.v vr24, vr11, vr24, vr7
+ addi.d t3, t2, 16
+ vstx vr24, t2, a1
+ vandi.b vr7, vr7, 1
+ vadd.b vr1, vr7, vr1
+.END_V_LESS_BETA:
+ vabsd.bu vr0, vr15, vr13 //q2_asub_q0
+ vslt.bu vr7, vr0, vr5 //is_less_than_beta
+ vand.v vr7, vr7, vr8 //is_less_than_beta
+ vsllwil.hu.bu vr3, vr14, 0 //q1_org_h.0
+ vexth.hu.bu vr4, vr14 //q1_org_h.1
+
+ vsetnez.v $fcc0, vr7
+ bceqz $fcc0, .END_V_LESS_BETA_SEC
+ vsllwil.hu.bu vr11, vr15, 0 //q2_org_h.0
+ vexth.hu.bu vr15, vr15 //q2_org_h.1
+ AVC_LPF_P1_OR_Q1 vr20, vr22, vr3, vr11, vr9, vr18, vr24, vr0, vr26
+ AVC_LPF_P1_OR_Q1 vr21, vr23, vr4, vr15, vr2, vr19, vr25, vr0, vr26
+ vpickev.b vr24, vr25, vr24
+ vbitsel.v vr24, vr14, vr24, vr7
+ vstx vr24, a0, a1
+ vandi.b vr7, vr7, 1
+ vadd.b vr1, vr1, vr7
+.END_V_LESS_BETA_SEC:
+ vneg.b vr0, vr1
+ vsllwil.h.b vr9, vr0, 0 //neg_thresh_h.0
+ vexth.h.b vr2, vr0 //neg_thresh_h.1
+ vsllwil.hu.bu vr18, vr1, 0 //tc_h.0
+ vexth.hu.bu vr19, vr1 //tc_h.1
+ AVC_LPF_P0Q0 vr22, vr20, vr16, vr3, vr9, vr18, vr11, vr15, vr0, vr26
+ AVC_LPF_P0Q0 vr23, vr21, vr17, vr4, vr2, vr19, vr10, vr14, vr0, vr26
+ vpickev.b vr11, vr10, vr11 //p0_h
+ vpickev.b vr15, vr14, vr15 //q0_h
+ vbitsel.v vr11, vr12, vr11, vr8 //p0_h
+ vbitsel.v vr15, vr13, vr15, vr8 //q0_h
+ vstx vr11, t2, t0
+ vst vr15, a0, 0
+.END_V_LUMA_8:
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ addi.d sp, sp, 24
+endfunc
+
+const chroma_shuf
+.byte 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3
+endconst
+
+function ff_h264_h_lpf_chroma_8_lsx
+ slli.d t0, a1, 1 //img_width_2x
+ slli.d t1, a1, 2 //img_width_4x
+ la.local t4, chroma_shuf
+ add.d t2, t0, a1 //img_width_3x
+ vldrepl.w vr0, a4, 0 //tmp_vec0
+ vld vr1, t4, 0 //tc_vec
+ vshuf.b vr1, vr0, vr0, vr1 //tc_vec
+ vslti.b vr2, vr1, 0
+ vxori.b vr2, vr2, 255
+ vandi.b vr2, vr2, 1 //bs_vec
+ vsetnez.v $fcc0, vr2
+ bceqz $fcc0, .END_CHROMA_8
+ vldi vr0, 0
+ addi.d t4, a0, -2
+ vslt.bu vr3, vr0, vr2 //is_bs_greater_than0
+ add.d t5, t4, t1
+ vld vr4, t4, 0 //row0
+ vldx vr5, t4, a1 //row1
+ vldx vr6, t4, t0 //row2
+ vldx vr7, t4, t2 //row3
+ vld vr8, t5, 0 //row4
+ vldx vr9, t5, a1 //row5
+ vldx vr10, t5, t0 //row6
+ vldx vr11, t5, t2 //row7
+ vilvl.b vr12, vr6, vr4 //p1_org
+ vilvl.b vr13, vr7, vr5 //p0_org
+ vilvl.b vr14, vr10, vr8 //q0_org
+ vilvl.b vr15, vr11, vr9 //q1_org
+ vilvl.b vr4, vr13, vr12 //row0
+ vilvl.b vr5, vr15, vr14 //row1
+ vilvl.w vr6, vr5, vr4 //row2
+ vilvh.w vr7, vr5, vr4 //row3
+ vilvl.d vr12, vr6, vr6 //p1_org
+ vilvh.d vr13, vr6, vr6 //p0_org
+ vilvl.d vr14, vr7, vr7 //q0_org
+ vilvh.d vr15, vr7, vr7 //q1_org
+
+ vabsd.bu vr20, vr13, vr14 //p0_asub_q0
+ vabsd.bu vr21, vr12, vr13 //p1_asub_p0
+ vabsd.bu vr22, vr15, vr14 //q1_asub_q0
+
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+
+ vslt.bu vr6, vr20, vr4 //is_less_than_alpha
+ vslt.bu vr7, vr21, vr5 //is_less_than_beta
+ vand.v vr8, vr6, vr7 //is_less_than
+ vslt.bu vr7, vr22, vr5 //is_less_than_beta
+ vand.v vr8, vr7, vr8 //is_less_than
+ vand.v vr8, vr8, vr3 //is_less_than
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_CHROMA_8
+
+ vneg.b vr9, vr1 //neg_tc_h
+ vexth.hu.bu vr3, vr12 //p1_org_h
+ vexth.hu.bu vr4, vr13 //p0_org_h.1
+ vexth.hu.bu vr5, vr14 //q0_org_h.1
+ vexth.hu.bu vr6, vr15 //q1_org_h.1
+
+ vexth.hu.bu vr18, vr1 //tc_h.1
+ vexth.h.b vr2, vr9 //neg_tc_h.1
+
+ AVC_LPF_P0Q0 vr5, vr4, vr3, vr6, vr2, vr18, vr10, vr11, vr16, vr17
+ vpickev.b vr10, vr10, vr10 //p0_h
+ vpickev.b vr11, vr11, vr11 //q0_h
+ vbitsel.v vr13, vr13, vr10, vr8
+ vbitsel.v vr14, vr14, vr11, vr8
+ vilvl.b vr15, vr14, vr13
+ addi.d t4, t4, 1
+ add.d t5, t4, a1
+ add.d t6, t4, t0
+ add.d t7, t4, t2
+ vstelm.h vr15, t4, 0, 0
+ vstelm.h vr15, t5, 0, 1
+ vstelm.h vr15, t6, 0, 2
+ vstelm.h vr15, t7, 0, 3
+ add.d t4, t4, t1
+ add.d t5, t4, a1
+ add.d t6, t4, t0
+ add.d t7, t4, t2
+ vstelm.h vr15, t4, 0, 4
+ vstelm.h vr15, t5, 0, 5
+ vstelm.h vr15, t6, 0, 6
+ vstelm.h vr15, t7, 0, 7
+.END_CHROMA_8:
+endfunc
+
+function ff_h264_v_lpf_chroma_8_lsx
+ slli.d t0, a1, 1 //img_width_2x
+ la.local t4, chroma_shuf
+ vldrepl.w vr0, a4, 0 //tmp_vec0
+ vld vr1, t4, 0 //tc_vec
+ vshuf.b vr1, vr0, vr0, vr1 //tc_vec
+ vslti.b vr2, vr1, 0
+ vxori.b vr2, vr2, 255
+ vandi.b vr2, vr2, 1 //bs_vec
+ vsetnez.v $fcc0, vr2
+ bceqz $fcc0, .END_CHROMA_V_8
+ vldi vr0, 0
+ sub.d t4, a0, t0
+ vslt.bu vr3, vr0, vr2 //is_bs_greater_than0
+ vld vr12, t4, 0 //p1_org
+ vldx vr13, t4, a1 //p0_org
+ vld vr14, a0, 0 //q0_org
+ vldx vr15, a0, a1 //q1_org
+
+ vabsd.bu vr20, vr13, vr14 //p0_asub_q0
+ vabsd.bu vr21, vr12, vr13 //p1_asub_p0
+ vabsd.bu vr22, vr15, vr14 //q1_asub_q0
+
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+
+ vslt.bu vr6, vr20, vr4 //is_less_than_alpha
+ vslt.bu vr7, vr21, vr5 //is_less_than_beta
+ vand.v vr8, vr6, vr7 //is_less_than
+ vslt.bu vr7, vr22, vr5 //is_less_than_beta
+ vand.v vr8, vr7, vr8 //is_less_than
+ vand.v vr8, vr8, vr3 //is_less_than
+ vsetnez.v $fcc0, vr8
+ bceqz $fcc0, .END_CHROMA_V_8
+
+ vneg.b vr9, vr1 //neg_tc_h
+ vsllwil.hu.bu vr3, vr12, 0 //p1_org_h
+ vsllwil.hu.bu vr4, vr13, 0 //p0_org_h.1
+ vsllwil.hu.bu vr5, vr14, 0 //q0_org_h.1
+ vsllwil.hu.bu vr6, vr15, 0 //q1_org_h.1
+
+ vexth.hu.bu vr18, vr1 //tc_h.1
+ vexth.h.b vr2, vr9 //neg_tc_h.1
+
+ AVC_LPF_P0Q0 vr5, vr4, vr3, vr6, vr2, vr18, vr10, vr11, vr16, vr17
+ vpickev.b vr10, vr10, vr10 //p0_h
+ vpickev.b vr11, vr11, vr11 //q0_h
+ vbitsel.v vr10, vr13, vr10, vr8
+ vbitsel.v vr11, vr14, vr11, vr8
+ fstx.d f10, t4, a1
+ fst.d f11, a0, 0
+.END_CHROMA_V_8:
+endfunc
+
+.macro AVC_LPF_P0P1P2_OR_Q0Q1Q2 _in0, _in1, _in2, _in3, _in4, _in5, \
+ _out0, _out1, _out2, _tmp0, _const3
+ vadd.h \_tmp0, \_in1, \_in2
+ vadd.h \_tmp0, \_tmp0, \_in3
+ vslli.h \_out2, \_in0, 1
+ vslli.h \_out0, \_tmp0, 1
+ vadd.h \_out0, \_out0, \_in4
+ vadd.h \_out1, \_in4, \_tmp0
+ vadd.h \_out0, \_out0, \_in5
+ vmadd.h \_out2, \_in4, \_const3
+ vsrar.h \_out0, \_out0, \_const3
+ vadd.h \_out2, \_out2, \_tmp0
+ vsrari.h \_out1, \_out1, 2
+ vsrar.h \_out2, \_out2, \_const3
+.endm
+
+.macro AVC_LPF_P0_OR_Q0 _in0, _in1, _in2, _out0, _tmp0
+ vslli.h \_tmp0, \_in2, 1
+ vadd.h \_out0, \_in0, \_in1
+ vadd.h \_out0, \_out0, \_tmp0
+ vsrari.h \_out0, \_out0, 2
+.endm
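+
+/*
+ * Rough scalar model of the two intra (strong-filter) helpers above
+ * (illustration only, not part of the build), written for the p side;
+ * the q side uses the same helpers with the p/q roles swapped:
+ *
+ *     // AVC_LPF_P0P1P2_OR_Q0Q1Q2 with (p3, p0, q0, p1, p2, q1, ...):
+ *     p0n = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+ *     p1n = (p2 + p1 + p0 + q0 + 2) >> 2;
+ *     p2n = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3;
+ *
+ *     // AVC_LPF_P0_OR_Q0 with (p0, q1, p1, ...):
+ *     p0n = (2 * p1 + p0 + q1 + 2) >> 2;
+ */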
+
+//LSX optimization is sufficient for this function.
+function ff_h264_h_lpf_luma_intra_8_lsx
+ slli.d t0, a1, 1 //img_width_2x
+ slli.d t1, a1, 2 //img_width_4x
+ addi.d t4, a0, -4 //src
+ SAVE_REG
+ add.d t2, t0, a1 //img_width_3x
+ add.d t5, t4, t1
+ vld vr0, t4, 0 //row0
+ vldx vr1, t4, a1 //row1
+ vldx vr2, t4, t0 //row2
+ vldx vr3, t4, t2 //row3
+ add.d t6, t5, t1
+ vld vr4, t5, 0 //row4
+ vldx vr5, t5, a1 //row5
+ vldx vr6, t5, t0 //row6
+ vldx vr7, t5, t2 //row7
+ add.d t7, t6, t1
+ vld vr8, t6, 0 //row8
+ vldx vr9, t6, a1 //row9
+ vldx vr10, t6, t0 //row10
+ vldx vr11, t6, t2 //row11
+ vld vr12, t7, 0 //row12
+ vldx vr13, t7, a1 //row13
+ vldx vr14, t7, t0 //row14
+ vldx vr15, t7, t2 //row15
+ LSX_TRANSPOSE16X8_B vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
+ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
+ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
+ // vr0: p3_org, vr1: p2_org, vr2: p1_org, vr3: p0_org
+ // vr4: q0_org, vr5: q1_org, vr6: q2_org, vr7: q3_org
+
+ vreplgr2vr.b vr16, a2 //alpha_in
+ vreplgr2vr.b vr17, a3 //beta_in
+ vabsd.bu vr10, vr3, vr4 //p0_asub_q0
+ vabsd.bu vr11, vr2, vr3 //p1_asub_p0
+ vabsd.bu vr12, vr5, vr4 //q1_asub_q0
+
+ vslt.bu vr8, vr10, vr16 //is_less_than_alpha
+ vslt.bu vr9, vr11, vr17 //is_less_than_beta
+ vand.v vr18, vr8, vr9 //is_less_than
+ vslt.bu vr9, vr12, vr17 //is_less_than_beta
+ vand.v vr18, vr18, vr9 //is_less_than
+
+ vsetnez.v $fcc0, vr18
+ bceqz $fcc0, .END_H_INTRA_8
+ vsrli.b vr16, vr16, 2 //less_alpha_shift2_add2
+ vaddi.bu vr16, vr16, 2
+ vslt.bu vr16, vr10, vr16
+ vsllwil.hu.bu vr10, vr2, 0 //p1_org_h.0
+ vexth.hu.bu vr11, vr2 //p1_org_h.1
+ vsllwil.hu.bu vr12, vr3, 0 //p0_org_h.0
+ vexth.hu.bu vr13, vr3 //p0_org_h.1
+
+ vsllwil.hu.bu vr14, vr4, 0 //q0_org_h.0
+ vexth.hu.bu vr15, vr4 //q0_org_h.1
+ vsllwil.hu.bu vr19, vr5, 0 //q1_org_h.0
+ vexth.hu.bu vr20, vr5 //q1_org_h.1
+
+ vabsd.bu vr21, vr1, vr3 //p2_asub_p0
+ vslt.bu vr9, vr21, vr17 //is_less_than_beta
+ vand.v vr9, vr9, vr16
+ vxori.b vr22, vr9, 0xff //negate_is_less_than_beta
+ vand.v vr9, vr9, vr18
+ vand.v vr22, vr22, vr18
+
+ vsetnez.v $fcc0, vr9
+ bceqz $fcc0, .END_H_INTRA_LESS_BETA
+ vsllwil.hu.bu vr23, vr1, 0 //p2_org_h.0
+ vexth.hu.bu vr24, vr1 //p2_org_h.1
+ vsllwil.hu.bu vr25, vr0, 0 //p3_org_h.0
+ vexth.hu.bu vr26, vr0 //p3_org_h.1
+ vldi vr27, 0x403
+
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr25, vr12, vr14, vr10, vr23, vr19, vr28, vr29, vr30, vr31, vr27
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr26, vr13, vr15, vr11, vr24, vr20, vr23, vr25, vr21, vr31, vr27
+ vpickev.b vr28, vr23, vr28 //p0_h
+ vpickev.b vr29, vr25, vr29 //p1_h
+ vpickev.b vr30, vr21, vr30 //p2_h
+ vbitsel.v vr3, vr3, vr28, vr9
+ vbitsel.v vr2, vr2, vr29, vr9
+ vbitsel.v vr1, vr1, vr30, vr9
+.END_H_INTRA_LESS_BETA:
+ AVC_LPF_P0_OR_Q0 vr12, vr19, vr10, vr23, vr25
+ AVC_LPF_P0_OR_Q0 vr13, vr20, vr11, vr24, vr25
+ //vr23: p0_h.0 vr24: p0_h.1
+ vpickev.b vr23, vr24, vr23
+ vbitsel.v vr3, vr3, vr23, vr22
+
+ vabsd.bu vr21, vr6, vr4 //q2_asub_q0
+ vslt.bu vr9, vr21, vr17 //is_less_than_beta
+ vand.v vr9, vr9, vr16
+ vxori.b vr22, vr9, 0xff //negate_is_less_than_beta
+ vand.v vr9, vr9, vr18
+ vand.v vr22, vr22, vr18
+
+ vsetnez.v $fcc0, vr9
+ bceqz $fcc0, .END_H_INTRA_LESS_BETA_SEC
+ vsllwil.hu.bu vr23, vr6, 0 //q2_org_h.0
+ vexth.hu.bu vr24, vr6 //q2_org_h.1
+ vsllwil.hu.bu vr25, vr7, 0 //q3_org_h.0
+ vexth.hu.bu vr26, vr7 //q3_org_h.1
+ vldi vr27, 0x403
+
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr25, vr14, vr12, vr19, vr23, vr10, vr28, vr29, vr30, vr31, vr27
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr26, vr15, vr13, vr20, vr24, vr11, vr23, vr25, vr21, vr31, vr27
+ vpickev.b vr28, vr23, vr28 //q0_h
+ vpickev.b vr29, vr25, vr29 //q1_h
+ vpickev.b vr30, vr21, vr30 //q2_h
+ vbitsel.v vr4, vr4, vr28, vr9
+ vbitsel.v vr5, vr5, vr29, vr9
+ vbitsel.v vr6, vr6, vr30, vr9
+.END_H_INTRA_LESS_BETA_SEC:
+ AVC_LPF_P0_OR_Q0 vr14, vr10, vr19, vr23, vr25
+ AVC_LPF_P0_OR_Q0 vr15, vr11, vr20, vr24, vr25
+ vpickev.b vr23, vr24, vr23
+ vbitsel.v vr4, vr4, vr23, vr22
+
+ vilvl.b vr14, vr2, vr0 // row0.0
+ vilvl.b vr15, vr6, vr4 // row0.1
+ vilvl.b vr16, vr3, vr1 // row2.0
+ vilvl.b vr17, vr7, vr5 // row2.1
+
+ vilvh.b vr18, vr2, vr0 // row1.0
+ vilvh.b vr19, vr6, vr4 // row1.1
+ vilvh.b vr20, vr3, vr1 // row3.0
+ vilvh.b vr21, vr7, vr5 // row3.1
+
+ vilvl.b vr2, vr16, vr14 // row4.0
+ vilvl.b vr3, vr17, vr15 // row4.1
+ vilvl.b vr4, vr20, vr18 // row6.0
+ vilvl.b vr5, vr21, vr19 // row6.1
+
+ vilvh.b vr6, vr16, vr14 // row5.0
+ vilvh.b vr7, vr17, vr15 // row5.1
+ vilvh.b vr8, vr20, vr18 // row7.0
+ vilvh.b vr9, vr21, vr19 // row7.1
+
+ vilvl.w vr14, vr3, vr2 // row4: 0, 4, 1, 5
+ vilvh.w vr15, vr3, vr2 // row4: 2, 6, 3, 7
+ vilvl.w vr16, vr7, vr6 // row5: 0, 4, 1, 5
+ vilvh.w vr17, vr7, vr6 // row5: 2, 6, 3, 7
+
+ vilvl.w vr18, vr5, vr4 // row6: 0, 4, 1, 5
+ vilvh.w vr19, vr5, vr4 // row6: 2, 6, 3, 7
+ vilvl.w vr20, vr9, vr8 // row7: 0, 4, 1, 5
+ vilvh.w vr21, vr9, vr8 // row7: 2, 6, 3, 7
+
+ vbsrl.v vr0, vr14, 8
+ vbsrl.v vr1, vr15, 8
+ vbsrl.v vr2, vr16, 8
+ vbsrl.v vr3, vr17, 8
+
+ vbsrl.v vr4, vr18, 8
+ vbsrl.v vr5, vr19, 8
+ vbsrl.v vr6, vr20, 8
+ vbsrl.v vr7, vr21, 8
+
+ store_double f14, f0, f15, f1, t4, a1, t0, t2
+ store_double f16, f2, f17, f3, t5, a1, t0, t2
+ store_double f18, f4, f19, f5, t6, a1, t0, t2
+ store_double f20, f6, f21, f7, t7, a1, t0, t2
+.END_H_INTRA_8:
+ RESTORE_REG
+endfunc
+
+//LSX optimization is sufficient for this function.
+function ff_h264_v_lpf_luma_intra_8_lsx
+ slli.d t0, a1, 1 //img_width_2x
+ add.d t1, t0, a1 //img_width_3x
+ SAVE_REG
+ sub.d t4, a0, t1 //src - img_width_3x
+
+ vld vr0, a0, 0 //q0_org
+ vldx vr1, a0, a1 //q1_org
+ vldx vr2, t4, a1 //p1_org
+ vldx vr3, t4, t0 //p0_org
+
+ vreplgr2vr.b vr4, a2 //alpha
+ vreplgr2vr.b vr5, a3 //beta
+
+ vabsd.bu vr6, vr3, vr0 //p0_asub_q0
+ vabsd.bu vr7, vr2, vr3 //p1_asub_p0
+ vabsd.bu vr8, vr1, vr0 //q1_asub_q0
+
+ vslt.bu vr9, vr6, vr4 //is_less_than_alpha
+ vslt.bu vr10, vr7, vr5 //is_less_than_beta
+ vand.v vr11, vr9, vr10 //is_less_than
+ vslt.bu vr10, vr8, vr5
+ vand.v vr11, vr10, vr11
+
+ vsetnez.v $fcc0, vr11
+ bceqz $fcc0, .END_V_INTRA_8
+
+ vld vr12, t4, 0 //p2_org
+ vldx vr13, a0, t0 //q2_org
+ vsrli.b vr14, vr4, 2 //is_alpha_shift2_add2
+ vsllwil.hu.bu vr15, vr2, 0 //p1_org_h.0
+ vexth.hu.bu vr16, vr2 //p1_org_h.1
+ vaddi.bu vr14, vr14, 2
+ vsllwil.hu.bu vr17, vr3, 0 //p0_org_h.0
+ vexth.hu.bu vr18, vr3 //p0_org_h.1
+ vslt.bu vr14, vr6, vr14
+ vsllwil.hu.bu vr19, vr0, 0 //q0_org_h.0
+ vexth.hu.bu vr20, vr0 //q0_org_h.1
+ vsllwil.hu.bu vr21, vr1, 0 //q1_org_h.0
+ vexth.hu.bu vr22, vr1 //q1_org_h.1
+
+ vabsd.bu vr23, vr12, vr3 //p2_asub_p0
+ vslt.bu vr10, vr23, vr5 //is_less_than_beta
+ vand.v vr10, vr10, vr14
+ vxori.b vr23, vr10, 0xff //negate_is_less_than_beta
+ vand.v vr10, vr10, vr11
+ vand.v vr23, vr23, vr11
+
+ vsetnez.v $fcc0, vr10
+ bceqz $fcc0, .END_V_INTRA_LESS_BETA
+ sub.d t5, t4, a1
+ vld vr24, t5, 0 //p3_org
+ vsllwil.hu.bu vr26, vr12, 0 //p2_org_h.0
+ vexth.hu.bu vr27, vr12 //p2_org_h.1
+ vsllwil.hu.bu vr28, vr24, 0 //p3_org_h.0
+ vexth.hu.bu vr29, vr24 //p3_org_h.1
+ vldi vr4, 0x403
+
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr28, vr17, vr19, vr15, vr26, vr21, vr25, vr30, vr31, vr24, vr4
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr29, vr18, vr20, vr16, vr27, vr22, vr6, vr7, vr8, vr24, vr4
+
+ vpickev.b vr25, vr6, vr25 //p0_h
+ vpickev.b vr30, vr7, vr30 //p1_h
+ vpickev.b vr31, vr8, vr31 //p2_h
+
+ vbitsel.v vr3, vr3, vr25, vr10
+ vbitsel.v vr2, vr2, vr30, vr10
+ vbitsel.v vr12, vr12, vr31, vr10
+
+ vstx vr2, t4, a1
+ vst vr12, t4, 0
+.END_V_INTRA_LESS_BETA:
+ AVC_LPF_P0_OR_Q0 vr17, vr21, vr15, vr24, vr30
+ AVC_LPF_P0_OR_Q0 vr18, vr22, vr16, vr25, vr30
+ vpickev.b vr24, vr25, vr24
+ vbitsel.v vr3, vr3, vr24, vr23
+ vstx vr3, t4, t0
+
+ vabsd.bu vr23, vr13, vr0 //q2_asub_q0
+ vslt.bu vr10, vr23, vr5 //is_less_than_beta
+ vand.v vr10, vr10, vr14
+ vxori.b vr23, vr10, 0xff //negate_is_less_than_beta
+ vand.v vr10, vr10, vr11
+ vand.v vr23, vr23, vr11
+
+ vsetnez.v $fcc0, vr10
+ bceqz $fcc0, .END_V_INTRA_LESS_BETA_SEC
+ vldx vr24, a0, t1 //q3_org
+
+ vsllwil.hu.bu vr26, vr13, 0 //q2_org_h.0
+ vexth.hu.bu vr27, vr13 //q2_org_h.1
+ vsllwil.hu.bu vr28, vr24, 0 //q3_org_h.0
+ vexth.hu.bu vr29, vr24 //q3_org_h.1
+ vldi vr4, 0x403
+
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr28, vr19, vr17, vr21, vr26, vr15, vr25, vr30, vr31, vr24, vr4
+ AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr29, vr20, vr18, vr22, vr27, vr16, vr6, vr7, vr8, vr24, vr4
+
+ vpickev.b vr25, vr6, vr25
+ vpickev.b vr30, vr7, vr30
+ vpickev.b vr31, vr8, vr31
+
+ vbitsel.v vr0, vr0, vr25, vr10
+ vbitsel.v vr1, vr1, vr30, vr10
+ vbitsel.v vr13, vr13, vr31, vr10
+ vstx vr1, a0, a1
+ vstx vr13, a0, t0
+.END_V_INTRA_LESS_BETA_SEC:
+ AVC_LPF_P0_OR_Q0 vr19, vr15, vr21, vr24, vr30
+ AVC_LPF_P0_OR_Q0 vr20, vr16, vr22, vr25, vr30
+ vpickev.b vr24, vr25, vr24
+ vbitsel.v vr0, vr0, vr24, vr23
+ vst vr0, a0, 0
+.END_V_INTRA_8:
+ RESTORE_REG
+endfunc
+
+function ff_h264_h_lpf_chroma_intra_8_lsx
+ addi.d t4, a0, -2
+ slli.d t0, a1, 1 //img_2x
+ slli.d t2, a1, 2 //img_4x
+ add.d t1, t0, a1 //img_3x
+
+ add.d t5, t4, t2
+ fld.s f0, t4, 0 //row0
+ fldx.s f1, t4, a1 //row1
+ fldx.s f2, t4, t0 //row2
+ fldx.s f3, t4, t1 //row3
+ fld.s f4, t5, 0 //row4
+ fldx.s f5, t5, a1 //row5
+ fldx.s f6, t5, t0 //row6
+ fldx.s f7, t5, t1 //row7
+
+ vilvl.b vr8, vr2, vr0 //p1_org
+ vilvl.b vr9, vr3, vr1 //p0_org
+ vilvl.b vr10, vr6, vr4 //q0_org
+ vilvl.b vr11, vr7, vr5 //q1_org
+
+ vilvl.b vr0, vr9, vr8
+ vilvl.b vr1, vr11, vr10
+ vilvl.w vr2, vr1, vr0
+ vilvh.w vr3, vr1, vr0
+
+ vilvl.d vr8, vr2, vr2 //p1_org
+ vilvh.d vr9, vr2, vr2 //p0_org
+ vilvl.d vr10, vr3, vr3 //q0_org
+ vilvh.d vr11, vr3, vr3 //q1_org
+
+ vreplgr2vr.b vr0, a2 //alpha
+ vreplgr2vr.b vr1, a3 //beta
+
+ vabsd.bu vr2, vr9, vr10 //p0_asub_q0
+ vabsd.bu vr3, vr8, vr9 //p1_asub_p0
+ vabsd.bu vr4, vr11, vr10 //q1_asub_q0
+
+ vslt.bu vr5, vr2, vr0 //is_less_than_alpha
+ vslt.bu vr6, vr3, vr1 //is_less_than_beta
+ vand.v vr7, vr5, vr6 //is_less_than
+ vslt.bu vr6, vr4, vr1
+ vand.v vr7, vr7, vr6
+
+ vsetnez.v $fcc0, vr7
+ bceqz $fcc0, .END_H_CHROMA_INTRA_8
+
+ vexth.hu.bu vr12, vr8 //p1_org_h
+ vexth.hu.bu vr13, vr9 //p0_org_h
+ vexth.hu.bu vr14, vr10 //q0_org_h
+ vexth.hu.bu vr15, vr11 //q1_org_h
+
+ AVC_LPF_P0_OR_Q0 vr13, vr15, vr12, vr16, vr18
+ AVC_LPF_P0_OR_Q0 vr14, vr12, vr15, vr17, vr18
+
+ vpickev.b vr18, vr16, vr16
+ vpickev.b vr19, vr17, vr17
+ vbitsel.v vr9, vr9, vr18, vr7
+ vbitsel.v vr10, vr10, vr19, vr7
+.END_H_CHROMA_INTRA_8:
+ vilvl.b vr11, vr10, vr9
+ addi.d t4, t4, 1
+ vstelm.h vr11, t4, 0, 0
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 1
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 2
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 3
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 4
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 5
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 6
+ add.d t4, t4, a1
+ vstelm.h vr11, t4, 0, 7
+endfunc
+
+function ff_h264_v_lpf_chroma_intra_8_lsx
+ slli.d t0, a1, 1 //img_width_2x
+ sub.d t2, a0, a1
+ sub.d t1, a0, t0 //data - img_width_2x
+
+ vreplgr2vr.b vr0, a2
+ vreplgr2vr.b vr1, a3
+
+ vld vr2, t1, 0 //p1_org
+ vldx vr3, t1, a1 //p0_org
+ vld vr4, a0, 0 //q0_org
+ vldx vr5, a0, a1 //q1_org
+
+ vabsd.bu vr6, vr3, vr4 //p0_asub_q0
+ vabsd.bu vr7, vr2, vr3 //p1_asub_p0
+ vabsd.bu vr8, vr5, vr4 //q1_asub_q0
+
+ vslt.bu vr9, vr6, vr0 //is_less_than_alpha
+ vslt.bu vr10, vr7, vr1 //is_less_than_beta
+ vand.v vr11, vr9, vr10 //is_less_than
+ vslt.bu vr10, vr8, vr1
+ vand.v vr11, vr10, vr11
+
+ vsetnez.v $fcc0, vr11
+ bceqz $fcc0, .END_V_CHROMA_INTRA_8
+
+ vsllwil.hu.bu vr6, vr2, 0 //p1_org_h.0
+ vsllwil.hu.bu vr8, vr3, 0 //p0_org_h.0
+ vsllwil.hu.bu vr13, vr4, 0 //q0_org_h.0
+ vsllwil.hu.bu vr15, vr5, 0 //q1_org_h.0
+
+ AVC_LPF_P0_OR_Q0 vr8, vr15, vr6, vr17, vr23
+ AVC_LPF_P0_OR_Q0 vr13, vr6, vr15, vr18, vr23
+
+ vpickev.b vr19, vr17, vr17
+ vpickev.b vr20, vr18, vr18
+ vbitsel.v vr3, vr3, vr19, vr11
+ vbitsel.v vr4, vr4, vr20, vr11
+
+ vstelm.d vr3, t2, 0, 0
+ vstelm.d vr4, a0, 0, 0
+.END_V_CHROMA_INTRA_8:
+endfunc
+
+.macro biweight_calc _in0, _in1, _in2, _in3, _reg0, _reg1, _reg2,\
+ _out0, _out1, _out2, _out3
+ vmov \_out0, \_reg0
+ vmov \_out1, \_reg0
+ vmov \_out2, \_reg0
+ vmov \_out3, \_reg0
+ vmaddwev.h.bu.b \_out0, \_in0, \_reg1
+ vmaddwev.h.bu.b \_out1, \_in1, \_reg1
+ vmaddwev.h.bu.b \_out2, \_in2, \_reg1
+ vmaddwev.h.bu.b \_out3, \_in3, \_reg1
+ vmaddwod.h.bu.b \_out0, \_in0, \_reg1
+ vmaddwod.h.bu.b \_out1, \_in1, \_reg1
+ vmaddwod.h.bu.b \_out2, \_in2, \_reg1
+ vmaddwod.h.bu.b \_out3, \_in3, \_reg1
+
+ vssran.bu.h \_out0, \_out0, \_reg2
+ vssran.bu.h \_out1, \_out1, \_reg2
+ vssran.bu.h \_out2, \_out2, \_reg2
+ vssran.bu.h \_out3, \_out3, \_reg2
+.endm
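+
+/*
+ * Rough scalar model of one biweight lane as computed above
+ * (illustration only, not part of the build); _reg0 carries the rounding
+ * term built in the function prologue and _reg2 carries log2_denom + 1:
+ *
+ *     int rnd = ((offset + 1) | 1) << log2_denom;
+ *     dst[x]  = av_clip_uint8((src[x] * weights + dst[x] * weightd + rnd)
+ *                             >> (log2_denom + 1));
+ */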
+
+.macro biweight_load_8
+ load_double f0, f1, f2, f3, a1, a2, t0, t1
+ load_double f10, f11, f12, f13, a0, a2, t0, t1
+
+ vilvl.d vr0, vr1, vr0 //src0
+ vilvl.d vr2, vr3, vr2 //src2
+ vilvl.d vr10, vr11, vr10 //dst0
+ vilvl.d vr12, vr13, vr12 //dst2
+
+ vilvl.b vr1, vr10, vr0 //vec0.0
+ vilvh.b vr3, vr10, vr0 //vec0.1
+ vilvl.b vr5, vr12, vr2 //vec1.0
+ vilvh.b vr7, vr12, vr2 //vec1.1
+.endm
+
+.macro biweight_8
+ biweight_calc vr1, vr3, vr5, vr7, vr8, vr20, vr9, vr0, vr2, vr4, vr6
+ vilvl.d vr0, vr2, vr0
+ vilvl.d vr2, vr6, vr4
+
+ vbsrl.v vr1, vr0, 8
+ vbsrl.v vr3, vr2, 8
+
+ store_double f0, f1, f2, f3, a0, a2, t0, t1
+.endm
+
+.macro biweight_load2_8
+ biweight_load_8
+ load_double f0, f2, f4, f6, t4, a2, t0, t1
+ load_double f14, f15, f16, f17, t5, a2, t0, t1
+
+ vilvl.d vr0, vr2, vr0 //src4
+ vilvl.d vr4, vr6, vr4 //src6
+ vilvl.d vr14, vr15, vr14 //dst4
+ vilvl.d vr16, vr17, vr16 //dst6
+
+ vilvl.b vr11, vr14, vr0 //vec4.0
+ vilvh.b vr13, vr14, vr0 //vec4.1
+ vilvl.b vr15, vr16, vr4 //vec6.0
+ vilvh.b vr17, vr16, vr4 //vec6.1
+.endm
+
+.macro biweight2_8
+ biweight_8
+ biweight_calc vr11, vr13, vr15, vr17, vr8, vr20, vr9, \
+ vr10, vr12, vr14, vr16
+ vilvl.d vr10, vr12, vr10
+ vilvl.d vr12, vr16, vr14
+
+ vbsrl.v vr11, vr10, 8
+ vbsrl.v vr13, vr12, 8
+
+ store_double f10, f11, f12, f13, t5, a2, t0, t1
+.endm
+
+.macro biweight_load_16
+ add.d t4, a1, t2
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+
+ add.d t5, a0, t2
+ vld vr10, a0, 0
+ vldx vr11, a0, a2
+ vldx vr12, a0, t0
+ vldx vr13, a0, t1
+ vld vr14, t5, 0
+ vldx vr15, t5, a2
+ vldx vr16, t5, t0
+ vldx vr17, t5, t1
+
+ vilvl.b vr18, vr10, vr0
+ vilvl.b vr19, vr11, vr1
+ vilvl.b vr21, vr12, vr2
+ vilvl.b vr22, vr13, vr3
+ vilvh.b vr0, vr10, vr0
+ vilvh.b vr1, vr11, vr1
+ vilvh.b vr2, vr12, vr2
+ vilvh.b vr3, vr13, vr3
+
+ vilvl.b vr10, vr14, vr4
+ vilvl.b vr11, vr15, vr5
+ vilvl.b vr12, vr16, vr6
+ vilvl.b vr13, vr17, vr7
+ vilvh.b vr14, vr14, vr4
+ vilvh.b vr15, vr15, vr5
+ vilvh.b vr16, vr16, vr6
+ vilvh.b vr17, vr17, vr7
+.endm
+
+.macro biweight_16
+ biweight_calc vr18, vr19, vr21, vr22, vr8, vr20, vr9, vr4, vr5, vr6, vr7
+ biweight_calc vr0, vr1, vr2, vr3, vr8, vr20, vr9, vr18, vr19, vr21, vr22
+ biweight_calc vr10, vr11, vr12, vr13, vr8, vr20, vr9, vr0, vr1, vr2, vr3
+ biweight_calc vr14, vr15, vr16, vr17, vr8, vr20, vr9, vr10, vr11, vr12, vr13
+
+ vilvl.d vr4, vr18, vr4
+ vilvl.d vr5, vr19, vr5
+ vilvl.d vr6, vr21, vr6
+ vilvl.d vr7, vr22, vr7
+ vilvl.d vr0, vr10, vr0
+ vilvl.d vr1, vr11, vr1
+ vilvl.d vr2, vr12, vr2
+ vilvl.d vr3, vr13, vr3
+
+ vst vr4, a0, 0
+ vstx vr5, a0, a2
+ vstx vr6, a0, t0
+ vstx vr7, a0, t1
+ vst vr0, t5, 0
+ vstx vr1, t5, a2
+ vstx vr2, t5, t0
+ vstx vr3, t5, t1
+.endm
+
+.macro biweight_func w
+function ff_biweight_h264_pixels\w\()_8_lsx
+ slli.d t0, a2, 1
+ slli.d t2, a2, 2
+ add.d t1, t0, a2
+ addi.d a7, a7, 1
+ ori a7, a7, 1
+ sll.d a7, a7, a4
+ addi.d a4, a4, 1
+
+ vreplgr2vr.b vr0, a6 //tmp0
+ vreplgr2vr.b vr1, a5 //tmp1
+ vreplgr2vr.h vr8, a7 //offset
+ vreplgr2vr.h vr9, a4 //denom
+ vilvh.b vr20, vr1, vr0 //wgt
+.endm
+
+biweight_func 8
+ addi.d t3, zero, 8
+ biweight_load_8
+ biweight_8
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS8
+ addi.d t3, zero, 16
+ add.d a1, a1, t2
+ add.d a0, a0, t2
+ biweight_load_8
+ biweight_8
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS8
+ add.d a1, a1, t2
+ add.d a0, a0, t2
+ add.d t4, a1, t2
+ add.d t5, a0, t2
+ biweight_load2_8
+ biweight2_8
+.END_BIWEIGHT_H264_PIXELS8:
+endfunc
+
+biweight_func 16
+ addi.d t6, zero, 16
+ biweight_load_16
+ biweight_16
+
+ bne a3, t6, .END_BIWEIGHT_PIXELS16
+ add.d a1, t4, t2
+ add.d a0, t5, t2
+ biweight_load_16
+ biweight_16
+.END_BIWEIGHT_PIXELS16:
+endfunc
+
+.macro biweight_calc_4 _in0, _out0
+ vmov \_out0, vr8
+ vmaddwev.h.bu.b \_out0, \_in0, vr20
+ vmaddwod.h.bu.b \_out0, \_in0, vr20
+ vssran.bu.h \_out0, \_out0, vr9
+.endm
+
+//LSX optimization is sufficient for this function.
+biweight_func 4
+ addi.d t3, zero, 4
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fld.s f10, a0, 0
+ fldx.s f11, a0, a2
+ vilvl.w vr2, vr1, vr0
+ vilvl.w vr12, vr11, vr10
+ vilvl.b vr0, vr12, vr2
+
+ biweight_calc_4 vr0, vr1
+ vbsrl.v vr2, vr1, 4
+ fst.s f1, a0, 0
+ fstx.s f2, a0, a2
+
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS4
+ addi.d t3, zero, 8
+ fldx.s f0, a1, t0
+ fldx.s f1, a1, t1
+ fldx.s f10, a0, t0
+ fldx.s f11, a0, t1
+ vilvl.w vr2, vr1, vr0
+ vilvl.w vr12, vr11, vr10
+ vilvl.b vr0, vr12, vr2
+
+ biweight_calc_4 vr0, vr1
+ vbsrl.v vr2, vr1, 4
+ fstx.s f1, a0, t0
+ fstx.s f2, a0, t1
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS4
+ add.d a1, a1, t2
+ add.d a0, a0, t2
+ fld.s f0, a1, 0
+ fldx.s f1, a1, a2
+ fldx.s f2, a1, t0
+ fldx.s f3, a1, t1
+ fld.s f10, a0, 0
+ fldx.s f11, a0, a2
+ fldx.s f12, a0, t0
+ fldx.s f13, a0, t1
+ vilvl.w vr4, vr1, vr0
+ vilvl.w vr5, vr3, vr2
+ vilvl.w vr14, vr11, vr10
+ vilvl.w vr15, vr13, vr12
+
+ vilvl.b vr0, vr14, vr4
+ vilvl.b vr10, vr15, vr5
+
+ vmov vr1, vr8
+ vmov vr11, vr8
+ vmaddwev.h.bu.b vr1, vr0, vr20
+ vmaddwev.h.bu.b vr11, vr10, vr20
+ vmaddwod.h.bu.b vr1, vr0, vr20
+ vmaddwod.h.bu.b vr11, vr10, vr20
+
+ vssran.bu.h vr0, vr1, vr9 //vec0
+ vssran.bu.h vr10, vr11, vr9 //vec0
+ vbsrl.v vr2, vr0, 4
+ vbsrl.v vr12, vr10, 4
+
+ fst.s f0, a0, 0
+ fstx.s f2, a0, a2
+ fstx.s f10, a0, t0
+ fstx.s f12, a0, t1
+.END_BIWEIGHT_H264_PIXELS4:
+endfunc
+
+.macro biweight_func_lasx w
+function ff_biweight_h264_pixels\w\()_8_lasx
+ slli.d t0, a2, 1
+ slli.d t2, a2, 2
+ add.d t1, t0, a2
+ addi.d a7, a7, 1
+ ori a7, a7, 1
+ sll.d a7, a7, a4
+ addi.d a4, a4, 1
+
+ xvreplgr2vr.b xr0, a6 //tmp0
+ xvreplgr2vr.b xr1, a5 //tmp1
+ xvreplgr2vr.h xr8, a7 //offset
+ xvreplgr2vr.h xr9, a4 //denom
+ xvilvh.b xr20, xr1, xr0 //wgt
+.endm
+
+.macro biweight_calc_lasx _in0, _in1, _reg0, _reg1, _reg2, _out0, _out1
+ xmov \_out0, \_reg0
+ xmov \_out1, \_reg0
+ xvmaddwev.h.bu.b \_out0, \_in0, \_reg1
+ xvmaddwev.h.bu.b \_out1, \_in1, \_reg1
+ xvmaddwod.h.bu.b \_out0, \_in0, \_reg1
+ xvmaddwod.h.bu.b \_out1, \_in1, \_reg1
+
+ xvssran.bu.h \_out0, \_out0, \_reg2
+ xvssran.bu.h \_out1, \_out1, \_reg2
+.endm
+
+.macro biweight_load_lasx_8
+ load_double f0, f1, f2, f3, a1, a2, t0, t1
+ load_double f10, f11, f12, f13, a0, a2, t0, t1
+
+ vilvl.d vr0, vr1, vr0 //src0
+ vilvl.d vr2, vr3, vr2 //src2
+ vilvl.d vr10, vr11, vr10 //dst0
+ vilvl.d vr12, vr13, vr12 //dst2
+
+ xvpermi.q xr2, xr0, 0x20
+ xvpermi.q xr12, xr10, 0x20
+
+ xvilvl.b xr0, xr12, xr2
+ xvilvh.b xr1, xr12, xr2
+.endm
+
+.macro biweight_lasx_8
+ biweight_calc_lasx xr0, xr1, xr8, xr20, xr9, xr2, xr3
+ xvilvl.d xr0, xr3, xr2
+ xvpermi.d xr2, xr0, 0x4E
+ vbsrl.v vr1, vr0, 8
+ vbsrl.v vr3, vr2, 8
+
+ store_double f0, f1, f2, f3, a0, a2, t0, t1
+.endm
+
+biweight_func_lasx 8
+ addi.d t3, zero, 8
+ biweight_load_lasx_8
+ biweight_lasx_8
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS8_LASX
+ addi.d t3, zero, 16
+ add.d a1, a1, t2
+ add.d a0, a0, t2
+ biweight_load_lasx_8
+ biweight_lasx_8
+ blt a3, t3, .END_BIWEIGHT_H264_PIXELS8_LASX
+ add.d a1, a1, t2
+ add.d a0, a0, t2
+ add.d t4, a1, t2
+ add.d t5, a0, t2
+ biweight_load_lasx_8
+ load_double f4, f5, f6, f7, t4, a2, t0, t1
+ load_double f14, f15, f16, f17, t5, a2, t0, t1
+ vilvl.d vr4, vr5, vr4 //src4
+ vilvl.d vr6, vr7, vr6 //src6
+ vilvl.d vr14, vr15, vr14 //dst4
+ vilvl.d vr16, vr17, vr16 //dst6
+ xvpermi.q xr6, xr4, 0x20
+ xvpermi.q xr16, xr14, 0x20
+ xvilvl.b xr10, xr16, xr6
+ xvilvh.b xr11, xr16, xr6
+ biweight_lasx_8
+ biweight_calc_lasx xr10, xr11, xr8, xr20, xr9, xr12, xr13
+ xvilvl.d xr10, xr13, xr12
+ xvpermi.d xr12, xr10, 0x4E
+ vbsrl.v vr11, vr10, 8
+ vbsrl.v vr13, vr12, 8
+ store_double f10, f11, f12, f13, t5, a2, t0, t1
+.END_BIWEIGHT_H264_PIXELS8_LASX:
+endfunc
+
+.macro biweight_load_lasx_16
+ add.d t4, a1, t2
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+
+ add.d t5, a0, t2
+ vld vr10, a0, 0
+ vldx vr11, a0, a2
+ vldx vr12, a0, t0
+ vldx vr13, a0, t1
+ vld vr14, t5, 0
+ vldx vr15, t5, a2
+ vldx vr16, t5, t0
+ vldx vr17, t5, t1
+
+ xvpermi.q xr1, xr0, 0x20
+ xvpermi.q xr3, xr2, 0x20
+ xvpermi.q xr5, xr4, 0x20
+ xvpermi.q xr7, xr6, 0x20
+
+ xvpermi.q xr11, xr10, 0x20
+ xvpermi.q xr13, xr12, 0x20
+ xvpermi.q xr15, xr14, 0x20
+ xvpermi.q xr17, xr16, 0x20
+
+ xvilvl.b xr0, xr11, xr1 //vec0
+ xvilvl.b xr2, xr13, xr3 //vec2
+ xvilvl.b xr4, xr15, xr5 //vec4
+ xvilvl.b xr6, xr17, xr7 //vec6
+
+ xvilvh.b xr10, xr11, xr1 //vec1
+ xvilvh.b xr12, xr13, xr3 //vec3
+ xvilvh.b xr14, xr15, xr5 //vec5
+ xvilvh.b xr16, xr17, xr7 //vec7
+.endm
+
+.macro biweight_lasx_16
+ biweight_calc_lasx xr0, xr2, xr8, xr20, xr9, xr1, xr3
+ biweight_calc_lasx xr4, xr6, xr8, xr20, xr9, xr5, xr7
+ biweight_calc_lasx xr10, xr12, xr8, xr20, xr9, xr11, xr13
+ biweight_calc_lasx xr14, xr16, xr8, xr20, xr9, xr15, xr17
+ xvilvl.d xr0, xr11, xr1
+ xvilvl.d xr2, xr13, xr3
+ xvilvl.d xr4, xr15, xr5
+ xvilvl.d xr6, xr17, xr7
+
+ xvpermi.d xr1, xr0, 0x4E
+ xvpermi.d xr3, xr2, 0x4E
+ xvpermi.d xr5, xr4, 0x4E
+ xvpermi.d xr7, xr6, 0x4E
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ vstx vr2, a0, t0
+ vstx vr3, a0, t1
+ vst vr4, t5, 0
+ vstx vr5, t5, a2
+ vstx vr6, t5, t0
+ vstx vr7, t5, t1
+.endm
+
+biweight_func_lasx 16
+ addi.d t6, zero, 16
+ biweight_load_lasx_16
+ biweight_lasx_16
+ bne a3, t6, .END_BIWEIGHT_PIXELS16_LASX
+ add.d a1, t4, t2
+ add.d a0, t5, t2
+ biweight_load_lasx_16
+ biweight_lasx_16
+.END_BIWEIGHT_PIXELS16_LASX:
+endfunc
+
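+//Scalar reference for the one-directional weight kernels (a rough sketch;
+//a3 = log2_denom, a4 = weight, a5 = offset as in the C prototype):
+//    block[x] = av_clip_uint8((block[x] * weight + (offset << log2_denom) +
+//                              (1 << (log2_denom - 1))) >> log2_denom);
+//The rounding term comes from vssrarn.bu.h and is only added when
+//log2_denom > 0.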
+.macro weight_func w
+function ff_weight_h264_pixels\w\()_8_lsx
+ slli.d t0, a1, 1
+ slli.d t2, a1, 2
+ add.d t1, t0, a1
+
+ sll.d a5, a5, a3
+ vreplgr2vr.h vr20, a4 //weight
+ vreplgr2vr.h vr8, a5 //offset
+ vreplgr2vr.h vr9, a3 //log2_denom
+.endm
+
+.macro weight_load_16
+ add.d t4, a0, t2
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vldx vr2, a0, t0
+ vldx vr3, a0, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a1
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+.endm
+
+.macro weight_extend_16
+ vilvl.b vr10, vr23, vr0
+ vilvl.b vr11, vr23, vr1
+ vilvl.b vr12, vr23, vr2
+ vilvl.b vr13, vr23, vr3
+ vilvl.b vr14, vr23, vr4
+ vilvl.b vr15, vr23, vr5
+ vilvl.b vr16, vr23, vr6
+ vilvl.b vr17, vr23, vr7
+
+ vilvh.b vr18, vr23, vr0
+ vilvh.b vr19, vr23, vr1
+ vilvh.b vr21, vr23, vr2
+ vilvh.b vr22, vr23, vr3
+ vilvh.b vr0, vr23, vr4
+ vilvh.b vr1, vr23, vr5
+ vilvh.b vr2, vr23, vr6
+ vilvh.b vr3, vr23, vr7
+.endm
+
+.macro weight_calc _in0, _in1, _in2, _in3, _reg0, _reg1, _reg2, \
+ _out0, _out1, _out2, _out3
+ vmul.h \_in0, \_in0, \_reg1
+ vmul.h \_in1, \_in1, \_reg1
+ vmul.h \_in2, \_in2, \_reg1
+ vmul.h \_in3, \_in3, \_reg1
+ vsadd.h \_out0, \_reg0, \_in0
+ vsadd.h \_out1, \_reg0, \_in1
+ vsadd.h \_out2, \_reg0, \_in2
+ vsadd.h \_out3, \_reg0, \_in3
+ vssrarn.bu.h \_out0, \_out0, \_reg2
+ vssrarn.bu.h \_out1, \_out1, \_reg2
+ vssrarn.bu.h \_out2, \_out2, \_reg2
+ vssrarn.bu.h \_out3, \_out3, \_reg2
+.endm
+
+.macro weight_16
+ weight_calc vr10, vr11, vr12, vr13, vr8, vr20, vr9, vr10, vr11, vr12, vr13
+ weight_calc vr14, vr15, vr16, vr17, vr8, vr20, vr9, vr14, vr15, vr16, vr17
+ weight_calc vr18, vr19, vr21, vr22, vr8, vr20, vr9, vr4, vr5, vr6, vr7
+ weight_calc vr0, vr1, vr2, vr3, vr8, vr20, vr9, vr0, vr1, vr2, vr3
+
+ vilvl.d vr10, vr4, vr10
+ vilvl.d vr11, vr5, vr11
+ vilvl.d vr12, vr6, vr12
+ vilvl.d vr13, vr7, vr13
+ vilvl.d vr14, vr0, vr14
+ vilvl.d vr15, vr1, vr15
+ vilvl.d vr16, vr2, vr16
+ vilvl.d vr17, vr3, vr17
+
+ vst vr10, a0, 0
+ vstx vr11, a0, a1
+ vstx vr12, a0, t0
+ vstx vr13, a0, t1
+ vst vr14, t4, 0
+ vstx vr15, t4, a1
+ vstx vr16, t4, t0
+ vstx vr17, t4, t1
+.endm
+
+weight_func 16
+ vldi vr23, 0
+ addi.d t3, zero, 16
+ weight_load_16
+ weight_extend_16
+ weight_16
+ bne a2, t3, .END_WEIGHT_H264_PIXELS16_8
+ add.d a0, t4, t2
+ weight_load_16
+ weight_extend_16
+ weight_16
+.END_WEIGHT_H264_PIXELS16_8:
+endfunc
+
+.macro weight_load_8
+ load_double f0, f1, f2, f3, a0, a1, t0, t1
+.endm
+
+.macro weight_extend_8
+ vilvl.b vr10, vr21, vr0
+ vilvl.b vr11, vr21, vr1
+ vilvl.b vr12, vr21, vr2
+ vilvl.b vr13, vr21, vr3
+.endm
+
+.macro weight_8
+ weight_calc vr10, vr11, vr12, vr13, vr8, vr20, vr9, vr0, vr1, vr2, vr3
+ store_double f0, f1, f2, f3, a0, a1, t0, t1
+.endm
+
+weight_func 8
+ vldi vr21, 0
+ addi.d t3, zero, 8
+ weight_load_8
+ weight_extend_8
+ weight_8
+ blt a2, t3, .END_WEIGHT_H264_PIXELS8
+ add.d a0, a0, t2
+ addi.d t3, zero, 16
+ weight_load_8
+ weight_extend_8
+ weight_8
+ blt a2, t3, .END_WEIGHT_H264_PIXELS8
+ add.d a0, a0, t2
+ add.d t4, a0, t2
+ weight_load_8
+ load_double f4, f5, f6, f7, t4, a1, t0, t1
+ weight_extend_8
+ vilvl.b vr14, vr21, vr4
+ vilvl.b vr15, vr21, vr5
+ vilvl.b vr16, vr21, vr6
+ vilvl.b vr17, vr21, vr7
+ weight_8
+ weight_calc vr14, vr15, vr16, vr17, vr8, vr20, vr9, vr4, vr5, vr6, vr7
+ store_double f4, f5, f6, f7, t4, a1, t0, t1
+.END_WEIGHT_H264_PIXELS8:
+endfunc
+
+.macro weight_func_lasx w
+function ff_weight_h264_pixels\w\()_8_lasx
+ slli.d t0, a1, 1
+ slli.d t2, a1, 2
+ add.d t1, t0, a1
+
+ sll.d a5, a5, a3
+ xvreplgr2vr.h xr20, a4 //weight
+ xvreplgr2vr.h xr8, a5 //offset
+ xvreplgr2vr.h xr9, a3 //log2_denom
+.endm
+
+.macro weight_calc_lasx _in0, _in1, _reg0, _reg1, _reg2, _out0, _out1
+ xvmul.h \_out0, \_in0, \_reg1
+ xvmul.h \_out1, \_in1, \_reg1
+ xvsadd.h \_out0, \_reg0, \_out0
+ xvsadd.h \_out1, \_reg0, \_out1
+ xvssrarn.bu.h \_out0, \_out0, \_reg2
+ xvssrarn.bu.h \_out1, \_out1, \_reg2
+.endm
+
+.macro weight_load_lasx_8
+ load_double f0, f1, f2, f3, a0, a1, t0, t1
+ vilvl.d vr4, vr1, vr0
+ vilvl.d vr5, vr3, vr2
+ vext2xv.hu.bu xr6, xr4
+ vext2xv.hu.bu xr7, xr5
+.endm
+
+.macro weight_lasx_8
+ weight_calc_lasx xr6, xr7, xr8, xr20, xr9, xr1, xr3
+ xvpermi.d xr2, xr1, 0x2
+ xvpermi.d xr4, xr3, 0x2
+ store_double f1, f2, f3, f4, a0, a1, t0, t1
+.endm
+
+weight_func_lasx 8
+ addi.d t3, zero, 8
+ weight_load_lasx_8
+ weight_lasx_8
+ blt a2, t3, .END_WEIGHT_H264_PIXELS8_LASX
+ add.d a0, a0, t2
+ addi.d t3, zero, 16
+ weight_load_lasx_8
+ weight_lasx_8
+ blt a2, t3, .END_WEIGHT_H264_PIXELS8_LASX
+ add.d a0, a0, t2
+ add.d t4, a0, t2
+ weight_load_lasx_8
+ load_double f14, f15, f16, f17, t4, a1, t0, t1
+ vilvl.d vr4, vr15, vr14
+ vilvl.d vr5, vr17, vr16
+ vext2xv.hu.bu xr10, xr4
+ vext2xv.hu.bu xr11, xr5
+ weight_lasx_8
+ weight_calc_lasx xr10, xr11, xr8, xr20, xr9, xr4, xr6
+ xvpermi.d xr5, xr4, 0x2
+ xvpermi.d xr7, xr6, 0x2
+ store_double f4, f5, f6, f7, t4, a1, t0, t1
+.END_WEIGHT_H264_PIXELS8_LASX:
+endfunc
+
+.macro weight_load_lasx_16
+ add.d t4, a0, t2
+ vld vr0, a0, 0
+ vldx vr1, a0, a1
+ vldx vr2, a0, t0
+ vldx vr3, a0, t1
+ vld vr4, t4, 0
+ vldx vr5, t4, a1
+ vldx vr6, t4, t0
+ vldx vr7, t4, t1
+
+ vext2xv.hu.bu xr0, xr0
+ vext2xv.hu.bu xr1, xr1
+ vext2xv.hu.bu xr2, xr2
+ vext2xv.hu.bu xr3, xr3
+ vext2xv.hu.bu xr4, xr4
+ vext2xv.hu.bu xr5, xr5
+ vext2xv.hu.bu xr6, xr6
+ vext2xv.hu.bu xr7, xr7
+.endm
+
+.macro weight_lasx_16
+ weight_calc_lasx xr0, xr1, xr8, xr20, xr9, xr10, xr11
+ weight_calc_lasx xr2, xr3, xr8, xr20, xr9, xr12, xr13
+ weight_calc_lasx xr4, xr5, xr8, xr20, xr9, xr14, xr15
+ weight_calc_lasx xr6, xr7, xr8, xr20, xr9, xr16, xr17
+ xvpermi.d xr10, xr10, 0xD8
+ xvpermi.d xr11, xr11, 0xD8
+ xvpermi.d xr12, xr12, 0xD8
+ xvpermi.d xr13, xr13, 0xD8
+ xvpermi.d xr14, xr14, 0xD8
+ xvpermi.d xr15, xr15, 0xD8
+ xvpermi.d xr16, xr16, 0xD8
+ xvpermi.d xr17, xr17, 0xD8
+
+ vst vr10, a0, 0
+ vstx vr11, a0, a1
+ vstx vr12, a0, t0
+ vstx vr13, a0, t1
+ vst vr14, t4, 0
+ vstx vr15, t4, a1
+ vstx vr16, t4, t0
+ vstx vr17, t4, t1
+.endm
+
+weight_func_lasx 16
+ addi.d t3, zero, 16
+ weight_load_lasx_16
+ weight_lasx_16
+ bne a2, t3, .END_WEIGHT_H264_PIXELS16_8_LASX
+ add.d a0, t4, t2
+ weight_load_lasx_16
+ weight_lasx_16
+.END_WEIGHT_H264_PIXELS16_8_LASX:
+endfunc
+
+//LSX optimization is sufficient for this function.
+function ff_weight_h264_pixels4_8_lsx
+ add.d t0, a0, a1
+ addi.d t3, zero, 4
+
+ sll.d a5, a5, a3
+ vreplgr2vr.h vr20, a4 //weight
+ vreplgr2vr.h vr8, a5 //offset
+ vreplgr2vr.h vr9, a3 //log2_denom
+ vldi vr21, 0
+
+ fld.s f0, a0, 0
+ fldx.s f1, a0, a1
+ vilvl.w vr4, vr1, vr0
+ vilvl.b vr5, vr21, vr4
+ vmul.h vr10, vr5, vr20
+ vsadd.h vr0, vr8, vr10
+ vssrarn.bu.h vr0, vr0, vr9
+
+ fst.s f0, a0, 0
+ vstelm.w vr0, t0, 0, 1
+ blt a2, t3, .END_WEIGHT_H264_PIXELS4
+ add.d a0, t0, a1
+ addi.d t3, zero, 8
+ fld.s f0, a0, 0
+ fldx.s f1, a0, a1
+ add.d t0, a0, a1
+ vilvl.w vr4, vr1, vr0
+ vilvl.b vr5, vr21, vr4
+
+ vmul.h vr10, vr5, vr20
+ vsadd.h vr0, vr8, vr10
+ vssrarn.bu.h vr0, vr0, vr9
+
+ fst.s f0, a0, 0
+ vstelm.w vr0, t0, 0, 1
+ blt a2, t3, .END_WEIGHT_H264_PIXELS4
+ add.d a0, t0, a1
+ add.d t0, a0, a1
+ add.d t1, t0, a1
+ add.d t2, t1, a1
+
+ fld.s f0, a0, 0
+ fld.s f1, t0, 0
+ fld.s f2, t1, 0
+ fld.s f3, t2, 0
+
+ vilvl.w vr4, vr1, vr0
+ vilvl.w vr5, vr3, vr2
+ vilvl.b vr6, vr21, vr4
+ vilvl.b vr7, vr21, vr5
+
+ vmul.h vr10, vr6, vr20
+ vmul.h vr11, vr7, vr20
+ vsadd.h vr0, vr8, vr10
+ vsadd.h vr1, vr8, vr11
+ vssrarn.bu.h vr10, vr0, vr9
+ vssrarn.bu.h vr11, vr1, vr9
+
+ fst.s f10, a0, 0
+ vstelm.w vr10, t0, 0, 1
+ fst.s f11, t1, 0
+ vstelm.w vr11, t2, 0, 1
+.END_WEIGHT_H264_PIXELS4:
+endfunc
+
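+//Rough scalar equivalent of the two add_pixels helpers below (w = 4 or 8):
+//each adds the int16_t residual to the destination bytes with wrap-around
+//(vpickev.b keeps only the low byte, no clipping) and then zeroes the
+//residual buffer, as the *_clear DSP entries require:
+//    dst[y * stride + x] = (uint8_t)(dst[y * stride + x] + block[y * w + x]);
+//    memset(block, 0, w * w * sizeof(int16_t));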
+function ff_h264_add_pixels4_8_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ vld vr0, a1, 0
+ vld vr1, a1, 16
+ vldi vr2, 0
+ fld.s f3, a0, 0
+ fldx.s f4, a0, a2
+ fldx.s f5, a0, t0
+ fldx.s f6, a0, t1
+ vilvl.w vr7, vr4, vr3
+ vilvl.w vr8, vr6, vr5
+ vilvl.b vr9, vr2, vr7
+ vilvl.b vr10, vr2, vr8
+ vadd.h vr11, vr0, vr9
+ vadd.h vr12, vr1, vr10
+ vpickev.b vr0, vr12, vr11
+ vbsrl.v vr3, vr0, 4
+ vbsrl.v vr4, vr0, 8
+ vbsrl.v vr5, vr0, 12
+ fst.s f0, a0, 0
+ fstx.s f3, a0, a2
+ fstx.s f4, a0, t0
+ fstx.s f5, a0, t1
+ vst vr2, a1, 0
+ vst vr2, a1, 16
+endfunc
+
+function ff_h264_add_pixels8_8_lsx
+ slli.d t0, a2, 1
+ slli.d t2, a2, 2
+ add.d t1, t0, a2
+ add.d t3, a0, t2
+ vldi vr0, 0
+ vld vr1, a1, 0
+ vld vr2, a1, 16
+ vld vr3, a1, 32
+ vld vr4, a1, 48
+ vld vr5, a1, 64
+ vld vr6, a1, 80
+ vld vr7, a1, 96
+ vld vr8, a1, 112
+ load_double f10, f11, f12, f13, a0, a2, t0, t1
+ load_double f14, f15, f16, f17, t3, a2, t0, t1
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr0, vr11
+ vilvl.b vr12, vr0, vr12
+ vilvl.b vr13, vr0, vr13
+ vilvl.b vr14, vr0, vr14
+ vilvl.b vr15, vr0, vr15
+ vilvl.b vr16, vr0, vr16
+ vilvl.b vr17, vr0, vr17
+ vadd.h vr1, vr1, vr10
+ vadd.h vr2, vr2, vr11
+ vadd.h vr3, vr3, vr12
+ vadd.h vr4, vr4, vr13
+ vadd.h vr5, vr5, vr14
+ vadd.h vr6, vr6, vr15
+ vadd.h vr7, vr7, vr16
+ vadd.h vr8, vr8, vr17
+ vpickev.b vr10, vr2, vr1
+ vpickev.b vr12, vr4, vr3
+ vpickev.b vr14, vr6, vr5
+ vpickev.b vr16, vr8, vr7
+ vbsrl.v vr11, vr10, 8
+ vbsrl.v vr13, vr12, 8
+ vbsrl.v vr15, vr14, 8
+ vbsrl.v vr17, vr16, 8
+ vst vr0, a1, 0
+ vst vr0, a1, 16
+ vst vr0, a1, 32
+ vst vr0, a1, 48
+ vst vr0, a1, 64
+ vst vr0, a1, 80
+ vst vr0, a1, 96
+ vst vr0, a1, 112
+ store_double f10, f11, f12, f13, a0, a2, t0, t1
+ store_double f14, f15, f16, f17, t3, a2, t0, t1
+endfunc
+
+const cnst_value
+.byte 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2
+.byte 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1
+endconst
+
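+//Rough outline of the boundary-strength rule implemented below: for each 4x4
+//edge, bS is 2 when either neighbouring block has non-zero coefficients,
+//otherwise 1 when the reference pictures differ or an MV component differs by
+//4 or more (2 or more for the vertical component in field mode, which is what
+//the interleaved 6,2 / 3,1 byte constants in cnst_value encode), else 0.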
+function ff_h264_loop_filter_strength_lsx
+ vldi vr0, 0
+ ldptr.w t0, sp, 0 //mask_mv1
+ ldptr.w t1, sp, 8 //field
+ beqz t1, .FIELD
+ la.local t2, cnst_value
+ vld vr1, t2, 0
+ vld vr2, t2, 16
+ b .END_FIELD
+.FIELD:
+ vldi vr1, 0x06
+ vldi vr2, 0x03
+.END_FIELD:
+ vldi vr3, 0x01
+ slli.d a6, a6, 3 //step <<= 3
+ slli.d a5, a5, 3 //edges <<= 3
+ move t3, zero
+ slli.d t4, a6, 2
+ move t5, a2
+ move t6, a3
+ move t7, a1
+ move t8, a0
+ slli.d t0, t0, 3
+.ITERATION_FIR:
+ bge t3, a5, .END_ITERATION_FIR
+ vand.v vr20, vr20, vr0
+ and t2, t0, t3
+ bnez t2, .MASK_MV_FIR
+ beqz a4, .BIDIR_FIR
+ vld vr4, t5, 4
+ vld vr5, t5, 44
+ vld vr6, t5, 12
+ vld vr7, t5, 52
+ vilvl.w vr4, vr5, vr4
+ vilvl.w vr6, vr6, vr6
+ vilvl.w vr7, vr7, vr7
+ vshuf4i.h vr5, vr4, 0x4e
+ vsub.b vr6, vr6, vr4
+ vsub.b vr7, vr7, vr5
+ vor.v vr6, vr6, vr7
+ vld vr10, t6, 16
+ vld vr11, t6, 48
+ vld vr12, t6, 208
+ vld vr8, t6, 176
+ vsub.h vr13, vr10, vr11
+ vsub.h vr14, vr10, vr12
+ vsub.h vr15, vr8, vr11
+ vsub.h vr16, vr8, vr12
+ vssrarni.b.h vr14, vr13, 0
+ vssrarni.b.h vr16, vr15, 0
+ vadd.b vr14, vr2, vr14
+ vadd.b vr16, vr2, vr16
+ vssub.bu vr14, vr14, vr1
+ vssub.bu vr16, vr16, vr1
+ vssrarni.b.h vr14, vr14, 0
+ vssrarni.b.h vr16, vr16, 0
+ vor.v vr20, vr6, vr14
+ vshuf4i.h vr16, vr16, 0x4e
+ vor.v vr20, vr20, vr16
+ vshuf4i.h vr21, vr20, 0x4e
+ vmin.bu vr20, vr20, vr21
+ b .MASK_MV_FIR
+.BIDIR_FIR:
+ vld vr4, t5, 4
+ vld vr5, t5, 12
+ vld vr10, t6, 16
+ vld vr11, t6, 48
+ vsub.h vr12, vr11, vr10
+ vssrarni.b.h vr12, vr12, 0
+ vadd.b vr13, vr12, vr2
+ vssub.bu vr14, vr13, vr1
+ vsat.h vr15, vr14, 7
+ vpickev.b vr20, vr15, vr15
+ vsub.b vr6, vr5, vr4
+ vor.v vr20, vr20, vr6
+.MASK_MV_FIR:
+ vld vr4, t7, 12
+ vld vr5, t7, 4
+ vor.v vr6, vr4, vr5
+ vmin.bu vr6, vr6, vr3
+ vmin.bu vr20, vr20, vr3
+ vslli.h vr6, vr6, 1
+ vmax.bu vr6, vr20, vr6
+ vilvl.b vr7, vr0, vr6
+ add.d t3, t3, a6
+ fst.d f7, t8, 32
+ add.d t5, t5, a6
+ add.d t6, t6, t4
+ add.d t7, t7, a6
+ add.d t8, t8, a6
+ b .ITERATION_FIR
+.END_ITERATION_FIR:
+ move t3, zero
+ addi.d a5, zero, 32
+ vldi vr21, 0xff
+ move t5, a2
+ move t6, a3
+ move t7, a1
+ move t8, a0
+ slli.d a7, a7, 3
+.ITERATION_SEC:
+ bge t3, a5, .END_ITERATION_SEC
+ vand.v vr20, vr20, vr21
+ and t2, a7, t3
+ bnez t2, .MASK_MV_SEC
+ beqz a4, .BIDIR_SEC
+ vld vr4, t5, 11
+ vld vr5, t5, 51
+ vld vr6, t5, 12
+ vld vr7, t5, 52
+ vilvl.w vr4, vr5, vr4
+ vilvl.w vr6, vr6, vr6
+ vilvl.w vr7, vr7, vr7
+ vshuf4i.h vr5, vr4, 0x4e
+ vsub.b vr6, vr6, vr4
+ vsub.b vr7, vr7, vr5
+ vor.v vr6, vr6, vr7
+ vld vr10, t6, 44
+ vld vr11, t6, 48
+ vld vr12, t6, 208
+ vld vr8, t6, 204
+ vsub.h vr13, vr10, vr11
+ vsub.h vr14, vr10, vr12
+ vsub.h vr15, vr8, vr11
+ vsub.h vr16, vr8, vr12
+ vssrarni.b.h vr14, vr13, 0
+ vssrarni.b.h vr16, vr15, 0
+ vadd.b vr14, vr2, vr14
+ vadd.b vr16, vr2, vr16
+ vssub.bu vr14, vr14, vr1
+ vssub.bu vr16, vr16, vr1
+ vssrarni.b.h vr14, vr14, 0
+ vssrarni.b.h vr16, vr16, 0
+ vor.v vr20, vr6, vr14
+ vshuf4i.h vr16, vr16, 0x4e
+ vor.v vr20, vr20, vr16
+ vshuf4i.h vr22, vr20, 0x4e
+ vmin.bu vr20, vr20, vr22
+ b .MASK_MV_SEC
+.BIDIR_SEC:
+ vld vr4, t5, 11
+ vld vr5, t5, 12
+ vld vr10, t6, 44
+ vld vr11, t6, 48
+ vsub.h vr12, vr11, vr10
+ vssrarni.b.h vr12, vr12, 0
+ vadd.b vr13, vr12, vr2
+ vssub.bu vr14, vr13, vr1
+ vssrarni.b.h vr14, vr14, 0
+ vsub.b vr6, vr5, vr4
+ vor.v vr20, vr14, vr6
+.MASK_MV_SEC:
+ vld vr4, t7, 12
+ vld vr5, t7, 11
+ vor.v vr6, vr4, vr5
+ vmin.bu vr6, vr6, vr3
+ vmin.bu vr20, vr20, vr3
+ vslli.h vr6, vr6, 1
+ vmax.bu vr6, vr20, vr6
+ vilvl.b vr7, vr0, vr6
+ addi.d t3, t3, 8
+ fst.d f7, t8, 0
+ addi.d t5, t5, 8
+ addi.d t6, t6, 32
+ addi.d t7, t7, 8
+ addi.d t8, t8, 8
+ b .ITERATION_SEC
+.END_ITERATION_SEC:
+ vld vr4, a0, 0
+ vld vr5, a0, 16
+ vilvh.d vr6, vr4, vr4
+ vilvh.d vr7, vr5, vr5
+ LSX_TRANSPOSE4x4_H vr4, vr6, vr5, vr7, vr6, vr7, vr8, vr9, vr10, vr11
+ vilvl.d vr4, vr7, vr6
+ vilvl.d vr5, vr9, vr8
+ vst vr4, a0, 0
+ vst vr5, a0, 16
+endfunc
diff --git a/libavcodec/loongarch/h264dsp_init_loongarch.c b/libavcodec/loongarch/h264dsp_init_loongarch.c
index cb07deb398..b70fe696d2 100644
--- a/libavcodec/loongarch/h264dsp_init_loongarch.c
+++ b/libavcodec/loongarch/h264dsp_init_loongarch.c
@@ -29,21 +29,44 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
int cpu_flags = av_get_cpu_flags();
if (have_lsx(cpu_flags)) {
+ if (chroma_format_idc <= 1)
+ c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lsx;
if (bit_depth == 8) {
c->h264_idct_add = ff_h264_idct_add_8_lsx;
c->h264_idct8_add = ff_h264_idct8_add_8_lsx;
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_lsx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
- if (chroma_format_idc <= 1)
+ if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
- else
+ c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_8_lsx;
+ c->h264_h_loop_filter_chroma_intra = ff_h264_h_lpf_chroma_intra_8_lsx;
+ } else
c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;
c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
+
+ c->h264_add_pixels4_clear = ff_h264_add_pixels4_8_lsx;
+ c->h264_add_pixels8_clear = ff_h264_add_pixels8_8_lsx;
+ c->h264_v_loop_filter_luma = ff_h264_v_lpf_luma_8_lsx;
+ c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_8_lsx;
+ c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_8_lsx;
+ c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_8_lsx;
+ c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_8_lsx;
+
+ c->h264_v_loop_filter_chroma_intra = ff_h264_v_lpf_chroma_intra_8_lsx;
+
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_lsx;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lsx;
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lsx;
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_lsx;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_lsx;
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_lsx;
+ c->h264_idct8_add = ff_h264_idct8_add_8_lsx;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
}
}
#if HAVE_LASX
@@ -57,23 +80,13 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma = ff_h264_h_lpf_luma_8_lasx;
c->h264_v_loop_filter_luma_intra = ff_h264_v_lpf_luma_intra_8_lasx;
c->h264_h_loop_filter_luma_intra = ff_h264_h_lpf_luma_intra_8_lasx;
- c->h264_v_loop_filter_chroma = ff_h264_v_lpf_chroma_8_lasx;
-
- if (chroma_format_idc <= 1)
- c->h264_h_loop_filter_chroma = ff_h264_h_lpf_chroma_8_lasx;
- c->h264_v_loop_filter_chroma_intra = ff_h264_v_lpf_chroma_intra_8_lasx;
-
- if (chroma_format_idc <= 1)
- c->h264_h_loop_filter_chroma_intra = ff_h264_h_lpf_chroma_intra_8_lasx;
/* Weighted MC */
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_8_lasx;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_8_lasx;
- c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels4_8_lasx;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_8_lasx;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
- c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lasx;
c->h264_idct8_add = ff_h264_idct8_add_8_lasx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
diff --git a/libavcodec/loongarch/h264dsp_lasx.c b/libavcodec/loongarch/h264dsp_lasx.c
index 7b2b8ff0f0..5205cc849f 100644
--- a/libavcodec/loongarch/h264dsp_lasx.c
+++ b/libavcodec/loongarch/h264dsp_lasx.c
@@ -67,10 +67,10 @@
void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
int alpha_in, int beta_in, int8_t *tc)
{
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_4x = img_width << 2;
- ptrdiff_t img_width_8x = img_width << 3;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
+ int img_width_2x = img_width << 1;
+ int img_width_4x = img_width << 2;
+ int img_width_8x = img_width << 3;
+ int img_width_3x = img_width_2x + img_width;
__m256i tmp_vec0, bs_vec;
__m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
0x0101010100000000, 0x0303030302020202};
@@ -244,8 +244,8 @@ void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
int alpha_in, int beta_in, int8_t *tc)
{
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_3x = img_width + img_width_2x;
+ int img_width_2x = img_width << 1;
+ int img_width_3x = img_width + img_width_2x;
__m256i tmp_vec0, bs_vec;
__m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
0x0101010100000000, 0x0303030302020202};
@@ -363,184 +363,6 @@ void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
}
}
-void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
- int alpha_in, int beta_in, int8_t *tc)
-{
- __m256i tmp_vec0, bs_vec;
- __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
- __m256i zero = __lasx_xvldi(0);
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_4x = img_width << 2;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
-
- tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
- tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
- bs_vec = __lasx_xvslti_b(tc_vec, 0);
- bs_vec = __lasx_xvxori_b(bs_vec, 255);
- bs_vec = __lasx_xvandi_b(bs_vec, 1);
- bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);
-
- if (__lasx_xbnz_v(bs_vec)) {
- uint8_t *src = data - 2;
- __m256i p1_org, p0_org, q0_org, q1_org;
- __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
- __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
- __m256i is_bs_greater_than0;
-
- is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
-
- {
- __m256i row0, row1, row2, row3, row4, row5, row6, row7;
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
- src, img_width_3x, row0, row1, row2, row3);
- src += img_width_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
- src, img_width_3x, row4, row5, row6, row7);
- src -= img_width_4x;
- /* LASX_TRANSPOSE8x4_B */
- DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
- row7, row5, p1_org, p0_org, q0_org, q1_org);
- row0 = __lasx_xvilvl_b(p0_org, p1_org);
- row1 = __lasx_xvilvl_b(q1_org, q0_org);
- row3 = __lasx_xvilvh_w(row1, row0);
- row2 = __lasx_xvilvl_w(row1, row0);
- p1_org = __lasx_xvpermi_d(row2, 0x00);
- p0_org = __lasx_xvpermi_d(row2, 0x55);
- q0_org = __lasx_xvpermi_d(row3, 0x00);
- q1_org = __lasx_xvpermi_d(row3, 0x55);
- }
-
- p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
- p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
- q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
- alpha = __lasx_xvreplgr2vr_b(alpha_in);
- beta = __lasx_xvreplgr2vr_b(beta_in);
-
- is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
- is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
- is_less_than = is_less_than_alpha & is_less_than_beta;
- is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
- is_less_than = is_less_than_beta & is_less_than;
- is_less_than = is_less_than & is_bs_greater_than0;
-
- if (__lasx_xbnz_v(is_less_than)) {
- __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
- p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
- p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
- q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
- q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
- {
- __m256i tc_h, neg_thresh_h, p0_h, q0_h;
-
- neg_thresh_h = __lasx_xvneg_b(tc_vec);
- neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
- tc_h = __lasx_vext2xv_hu_bu(tc_vec);
-
- AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
- neg_thresh_h, tc_h, p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
- p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
- p0_h, q0_h);
- p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
- q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
- }
-
- p0_org = __lasx_xvilvl_b(q0_org, p0_org);
- src = data - 1;
- __lasx_xvstelm_h(p0_org, src, 0, 0);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 1);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 2);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 3);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 4);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 5);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 6);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 7);
- }
- }
-}
-
-void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
- int alpha_in, int beta_in, int8_t *tc)
-{
- int img_width_2x = img_width << 1;
- __m256i tmp_vec0, bs_vec;
- __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
- __m256i zero = __lasx_xvldi(0);
-
- tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
- tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
- bs_vec = __lasx_xvslti_b(tc_vec, 0);
- bs_vec = __lasx_xvxori_b(bs_vec, 255);
- bs_vec = __lasx_xvandi_b(bs_vec, 1);
- bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);
-
- if (__lasx_xbnz_v(bs_vec)) {
- __m256i p1_org, p0_org, q0_org, q1_org;
- __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
- __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
- __m256i is_bs_greater_than0;
-
- alpha = __lasx_xvreplgr2vr_b(alpha_in);
- beta = __lasx_xvreplgr2vr_b(beta_in);
-
- DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
- p1_org, p0_org);
- DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
-
- is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
- p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
- p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
- q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
- is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
- is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
- is_less_than = is_less_than_alpha & is_less_than_beta;
- is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
- is_less_than = is_less_than_beta & is_less_than;
- is_less_than = is_less_than & is_bs_greater_than0;
-
- if (__lasx_xbnz_v(is_less_than)) {
- __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
- p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
- p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
- q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
- q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
- {
- __m256i neg_thresh_h, tc_h, p0_h, q0_h;
-
- neg_thresh_h = __lasx_xvneg_b(tc_vec);
- neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
- tc_h = __lasx_vext2xv_hu_bu(tc_vec);
-
- AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
- neg_thresh_h, tc_h, p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
- p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
- p0_h, q0_h);
- p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
- q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
- __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
- __lasx_xvstelm_d(q0_h, data, 0, 0);
- }
- }
- }
-}
-
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
q3_or_p3_org_in, p1_or_q1_org_in, \
p2_or_q2_org_in, q1_or_p1_org_in, \
@@ -584,9 +406,9 @@ void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
int alpha_in, int beta_in)
{
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_4x = img_width << 2;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
+ int img_width_2x = img_width << 1;
+ int img_width_4x = img_width << 2;
+ int img_width_3x = img_width_2x + img_width;
uint8_t *src = data - 4;
__m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
__m256i is_less_than, is_less_than_beta, is_less_than_alpha;
@@ -760,8 +582,8 @@ void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
int alpha_in, int beta_in)
{
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
+ int img_width_2x = img_width << 1;
+ int img_width_3x = img_width_2x + img_width;
uint8_t *src = data - img_width_2x;
__m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
__m256i is_less_than, is_less_than_beta, is_less_than_alpha;
@@ -877,1160 +699,6 @@ void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
}
}
-void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
- int alpha_in, int beta_in)
-{
- uint8_t *src = data - 2;
- ptrdiff_t img_width_2x = img_width << 1;
- ptrdiff_t img_width_4x = img_width << 2;
- ptrdiff_t img_width_3x = img_width_2x + img_width;
- __m256i p1_org, p0_org, q0_org, q1_org;
- __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
- __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
-
- {
- __m256i row0, row1, row2, row3, row4, row5, row6, row7;
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
- img_width_3x, row0, row1, row2, row3);
- src += img_width_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
- img_width_3x, row4, row5, row6, row7);
-
- /* LASX_TRANSPOSE8x4_B */
- DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
- p1_org, p0_org, q0_org, q1_org);
- row0 = __lasx_xvilvl_b(p0_org, p1_org);
- row1 = __lasx_xvilvl_b(q1_org, q0_org);
- row3 = __lasx_xvilvh_w(row1, row0);
- row2 = __lasx_xvilvl_w(row1, row0);
- p1_org = __lasx_xvpermi_d(row2, 0x00);
- p0_org = __lasx_xvpermi_d(row2, 0x55);
- q0_org = __lasx_xvpermi_d(row3, 0x00);
- q1_org = __lasx_xvpermi_d(row3, 0x55);
- }
-
- alpha = __lasx_xvreplgr2vr_b(alpha_in);
- beta = __lasx_xvreplgr2vr_b(beta_in);
-
- p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
- p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
- q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
- is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
- is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
- is_less_than = is_less_than_alpha & is_less_than_beta;
- is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
- is_less_than = is_less_than_beta & is_less_than;
-
- if (__lasx_xbnz_v(is_less_than)) {
- __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
- p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
- p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
- q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
- q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
- AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
- AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
- DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
- p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
- q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
- }
- p0_org = __lasx_xvilvl_b(q0_org, p0_org);
- src = data - 1;
- __lasx_xvstelm_h(p0_org, src, 0, 0);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 1);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 2);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 3);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 4);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 5);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 6);
- src += img_width;
- __lasx_xvstelm_h(p0_org, src, 0, 7);
-}
-
-void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
- int alpha_in, int beta_in)
-{
- ptrdiff_t img_width_2x = img_width << 1;
- __m256i p1_org, p0_org, q0_org, q1_org;
- __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
- __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
-
- alpha = __lasx_xvreplgr2vr_b(alpha_in);
- beta = __lasx_xvreplgr2vr_b(beta_in);
-
- p1_org = __lasx_xvldx(data, -img_width_2x);
- p0_org = __lasx_xvldx(data, -img_width);
- DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
-
- p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
- p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
- q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
-
- is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
- is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
- is_less_than = is_less_than_alpha & is_less_than_beta;
- is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
- is_less_than = is_less_than_beta & is_less_than;
-
- if (__lasx_xbnz_v(is_less_than)) {
- __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
-
- p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
- p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
- q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
- q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
-
- AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
- AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
- DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
- DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
- p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
- q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
- __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
- __lasx_xvstelm_d(q0_h, data, 0, 0);
- }
-}
-
-void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
- int log2_denom, int weight_dst,
- int weight_src, int offset_in)
-{
- __m256i wgt;
- __m256i src0, src1, src2, src3;
- __m256i dst0, dst1, dst2, dst3;
- __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m256i denom, offset;
- int stride_2x = stride << 1;
- int stride_4x = stride << 2;
- int stride_3x = stride_2x + stride;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src += stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
- 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
- dst -= stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
- 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
-
- DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
- src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
- dst0, dst1, dst2, dst3);
- DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec0, vec2, vec4, vec6);
- DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec1, vec3, vec5, vec7);
-
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
- offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
-
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- tmp2 = __lasx_xvsra_h(tmp2, denom);
- tmp3 = __lasx_xvsra_h(tmp3, denom);
- tmp4 = __lasx_xvsra_h(tmp4, denom);
- tmp5 = __lasx_xvsra_h(tmp5, denom);
- tmp6 = __lasx_xvsra_h(tmp6, denom);
- tmp7 = __lasx_xvsra_h(tmp7, denom);
-
- DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
- tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
- dst0, dst1, dst2, dst3);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst0, dst, 0, 2);
- __lasx_xvstelm_d(dst0, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst1, dst, 0, 2);
- __lasx_xvstelm_d(dst1, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst2, dst, 0, 0);
- __lasx_xvstelm_d(dst2, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst2, dst, 0, 2);
- __lasx_xvstelm_d(dst2, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst3, dst, 0, 0);
- __lasx_xvstelm_d(dst3, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst3, dst, 0, 2);
- __lasx_xvstelm_d(dst3, dst, 8, 3);
- dst += stride;
-
- if (16 == height) {
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src += stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
- tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
- dst -= stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
- tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
-
- DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
- src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
- dst0, dst1, dst2, dst3);
- DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec0, vec2, vec4, vec6);
- DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec1, vec3, vec5, vec7);
-
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
- offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
-
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- tmp2 = __lasx_xvsra_h(tmp2, denom);
- tmp3 = __lasx_xvsra_h(tmp3, denom);
- tmp4 = __lasx_xvsra_h(tmp4, denom);
- tmp5 = __lasx_xvsra_h(tmp5, denom);
- tmp6 = __lasx_xvsra_h(tmp6, denom);
- tmp7 = __lasx_xvsra_h(tmp7, denom);
-
- DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
- tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
- tmp6, dst0, dst1, dst2, dst3);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst0, dst, 0, 2);
- __lasx_xvstelm_d(dst0, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst1, dst, 0, 2);
- __lasx_xvstelm_d(dst1, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst2, dst, 0, 0);
- __lasx_xvstelm_d(dst2, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst2, dst, 0, 2);
- __lasx_xvstelm_d(dst2, dst, 8, 3);
- dst += stride;
- __lasx_xvstelm_d(dst3, dst, 0, 0);
- __lasx_xvstelm_d(dst3, dst, 8, 1);
- dst += stride;
- __lasx_xvstelm_d(dst3, dst, 0, 2);
- __lasx_xvstelm_d(dst3, dst, 8, 3);
- }
-}
-
-static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0, vec1;
- __m256i src0, dst0;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
- vec0 = __lasx_xvilvl_b(dst0, src0);
- vec1 = __lasx_xvilvh_b(dst0, src0);
- DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- tmp0, tmp1);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
- dst0 = __lasx_xvpickev_b(tmp1, tmp0);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0, vec1, vec2, vec3;
- __m256i src0, src1, dst0, dst1;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
- uint8_t* dst_tmp = dst;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- tmp0 = __lasx_xvld(dst_tmp, 0);
- DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
- tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
- dst_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
- DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
- src0, src1, dst0, dst1);
- DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
- DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- tmp2 = __lasx_xvsra_h(tmp2, denom);
- tmp3 = __lasx_xvsra_h(tmp3, denom);
- DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
- tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
-}
-
-static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
- uint8_t* dst_tmp = dst;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
- dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
- DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
- src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
- dst0, dst1, dst2, dst3);
- DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec0, vec2, vec4, vec6);
- DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
- dst3, src3, vec1, vec3, vec5, vec7);
- DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG3(__lasx_xvdp2add_h_b,offset, wgt, vec4, offset, wgt, vec5,
- offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- tmp2 = __lasx_xvsra_h(tmp2, denom);
- tmp3 = __lasx_xvsra_h(tmp3, denom);
- tmp4 = __lasx_xvsra_h(tmp4, denom);
- tmp5 = __lasx_xvsra_h(tmp5, denom);
- tmp6 = __lasx_xvsra_h(tmp6, denom);
- tmp7 = __lasx_xvsra_h(tmp7, denom);
- DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
- tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
- dst0, dst1, dst2, dst3)
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(dst1, dst, 0, 0);
- __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(dst2, dst, 0, 0);
- __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(dst3, dst, 0, 0);
- __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
-}
-
-void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
- int log2_denom, int weight_dst,
- int weight_src, int offset)
-{
- if (4 == height) {
- avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
- offset);
- } else if (8 == height) {
- avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
- offset);
- } else {
- avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src, weight_dst,
- offset);
- }
-}
-
-static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0;
- __m256i src0, dst0;
- __m256i tmp0, tmp1, denom, offset;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
- src0 = __lasx_xvilvl_w(tmp1, tmp0);
- DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
- dst0 = __lasx_xvilvl_w(tmp1, tmp0);
- DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
- vec0 = __lasx_xvilvl_b(dst0, src0);
- tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp0 = __lasx_xvclip255_h(tmp0);
- tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
- __lasx_xvstelm_w(tmp0, dst, 0, 0);
- __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
-}
-
-static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0;
- __m256i src0, dst0;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
- src0 = __lasx_xvilvl_w(tmp1, tmp0);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
- dst0 = __lasx_xvilvl_w(tmp1, tmp0);
- DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
- vec0 = __lasx_xvilvl_b(dst0, src0);
- dst0 = __lasx_xvilvh_b(dst0, src0);
- vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
- tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp0 = __lasx_xvclip255_h(tmp0);
- tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
- __lasx_xvstelm_w(tmp0, dst, 0, 0);
- __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
- __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
- __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
-}
-
-static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t weight_dst, int32_t offset_in)
-{
- __m256i wgt, vec0, vec1;
- __m256i src0, dst0;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
- offset_in += ((weight_src + weight_dst) << 7);
- log2_denom += 1;
-
- tmp0 = __lasx_xvreplgr2vr_b(weight_src);
- tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
- wgt = __lasx_xvilvh_b(tmp1, tmp0);
- offset = __lasx_xvreplgr2vr_h(offset_in);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
- tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
- dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
- dst -= stride_4x;
- DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
- tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
- vec0 = __lasx_xvilvl_b(dst0, src0);
- vec1 = __lasx_xvilvh_b(dst0, src0);
- DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
- tmp0, tmp1);
- tmp0 = __lasx_xvsra_h(tmp0, denom);
- tmp1 = __lasx_xvsra_h(tmp1, denom);
- DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
- tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
- __lasx_xvstelm_w(tmp0, dst, 0, 0);
- __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
- __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_w(tmp0, dst, 0, 4);
- __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
- __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
- __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
-}
-
-void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
- int log2_denom, int weight_dst,
- int weight_src, int offset)
-{
- if (2 == height) {
- avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
- weight_dst, offset);
- } else if (4 == height) {
- avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
- weight_dst, offset);
- } else {
- avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
- weight_dst, offset);
- }
-}
-
-void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
- int height, int log2_denom,
- int weight_src, int offset_in)
-{
- uint32_t offset_val;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i zero = __lasx_xvldi(0);
- __m256i src0, src1, src2, src3;
- __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m256i wgt, denom, offset;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src -= stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
- 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
- zero, src3, src0_l, src1_l, src2_l, src3_l);
- DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
- zero, src3, src0_h, src1_h, src2_h, src3_h);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src1_l = __lasx_xvmul_h(wgt, src1_l);
- src1_h = __lasx_xvmul_h(wgt, src1_h);
- src2_l = __lasx_xvmul_h(wgt, src2_l);
- src2_h = __lasx_xvmul_h(wgt, src2_h);
- src3_l = __lasx_xvmul_h(wgt, src3_l);
- src3_h = __lasx_xvmul_h(wgt, src3_h);
- DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
- src1_h, offset, src0_l, src0_h, src1_l, src1_h);
- DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
- src3_h, offset, src2_l, src2_h, src3_l, src3_h);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src1_l = __lasx_xvmaxi_h(src1_l, 0);
- src1_h = __lasx_xvmaxi_h(src1_h, 0);
- src2_l = __lasx_xvmaxi_h(src2_l, 0);
- src2_h = __lasx_xvmaxi_h(src2_h, 0);
- src3_l = __lasx_xvmaxi_h(src3_l, 0);
- src3_h = __lasx_xvmaxi_h(src3_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
- src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
- src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
- src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
- src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
- src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
- __lasx_xvstelm_d(src0_l, src, 0, 0);
- __lasx_xvstelm_d(src0_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src0_l, src, 0, 2);
- __lasx_xvstelm_d(src0_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src1_l, src, 0, 0);
- __lasx_xvstelm_d(src1_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src1_l, src, 0, 2);
- __lasx_xvstelm_d(src1_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src2_l, src, 0, 0);
- __lasx_xvstelm_d(src2_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src2_l, src, 0, 2);
- __lasx_xvstelm_d(src2_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src3_l, src, 0, 0);
- __lasx_xvstelm_d(src3_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src3_l, src, 0, 2);
- __lasx_xvstelm_d(src3_h, src, 8, 2);
- src += stride;
-
- if (16 == height) {
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src -= stride_4x;
- DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
- tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
- DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
- zero, src3, src0_l, src1_l, src2_l, src3_l);
- DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
- zero, src3, src0_h, src1_h, src2_h, src3_h);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src1_l = __lasx_xvmul_h(wgt, src1_l);
- src1_h = __lasx_xvmul_h(wgt, src1_h);
- src2_l = __lasx_xvmul_h(wgt, src2_l);
- src2_h = __lasx_xvmul_h(wgt, src2_h);
- src3_l = __lasx_xvmul_h(wgt, src3_l);
- src3_h = __lasx_xvmul_h(wgt, src3_h);
- DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
- offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
- DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
- offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src1_l = __lasx_xvmaxi_h(src1_l, 0);
- src1_h = __lasx_xvmaxi_h(src1_h, 0);
- src2_l = __lasx_xvmaxi_h(src2_l, 0);
- src2_h = __lasx_xvmaxi_h(src2_h, 0);
- src3_l = __lasx_xvmaxi_h(src3_l, 0);
- src3_h = __lasx_xvmaxi_h(src3_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
- src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
- src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
- src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
- src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
- src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
- __lasx_xvstelm_d(src0_l, src, 0, 0);
- __lasx_xvstelm_d(src0_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src0_l, src, 0, 2);
- __lasx_xvstelm_d(src0_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src1_l, src, 0, 0);
- __lasx_xvstelm_d(src1_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src1_l, src, 0, 2);
- __lasx_xvstelm_d(src1_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src2_l, src, 0, 0);
- __lasx_xvstelm_d(src2_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src2_l, src, 0, 2);
- __lasx_xvstelm_d(src2_h, src, 8, 2);
- src += stride;
- __lasx_xvstelm_d(src3_l, src, 0, 0);
- __lasx_xvstelm_d(src3_h, src, 8, 0);
- src += stride;
- __lasx_xvstelm_d(src3_l, src, 0, 2);
- __lasx_xvstelm_d(src3_h, src, 8, 2);
- }
-}
-
-static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t offset_in)
-{
- uint32_t offset_val;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i wgt, zero = __lasx_xvldi(0);
- __m256i src0, src0_h, src0_l;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- src0_l = __lasx_xvilvl_b(zero, src0);
- src0_h = __lasx_xvilvh_b(zero, src0);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src0_l = __lasx_xvsadd_h(src0_l, offset);
- src0_h = __lasx_xvsadd_h(src0_h, offset);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
-
- src0 = __lasx_xvpickev_d(src0_h, src0_l);
- __lasx_xvstelm_d(src0, src, 0, 0);
- __lasx_xvstelm_d(src0, src + stride, 0, 1);
- __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
-}
-
-static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
- int32_t src_weight, int32_t offset_in)
-{
- __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
- uint32_t offset_val;
- uint8_t* src_tmp = src;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(src_weight);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
- DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src1_l = __lasx_xvmul_h(wgt, src1_l);
- src1_h = __lasx_xvmul_h(wgt, src1_h);
- DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
- src1_h, offset, src0_l, src0_h, src1_l, src1_h);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src1_l = __lasx_xvmaxi_h(src1_l, 0);
- src1_h = __lasx_xvmaxi_h(src1_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
- src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
-
- DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
- __lasx_xvstelm_d(src0, src, 0, 0);
- __lasx_xvstelm_d(src0, src + stride, 0, 1);
- __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
- src += stride_4x;
- __lasx_xvstelm_d(src1, src, 0, 0);
- __lasx_xvstelm_d(src1, src + stride, 0, 1);
- __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
-}
-
-static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t src_weight,
- int32_t offset_in)
-{
- __m256i src0, src1, src2, src3;
- __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
- __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
- __m256i zero = __lasx_xvldi(0);
- uint32_t offset_val;
- uint8_t* src_tmp = src;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(src_weight);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src_tmp += stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
- src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
-
- DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
- src0_l, src1_l, src2_l, src3_l);
- DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
- src0_h, src1_h, src2_h, src3_h);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src1_l = __lasx_xvmul_h(wgt, src1_l);
- src1_h = __lasx_xvmul_h(wgt, src1_h);
- src2_l = __lasx_xvmul_h(wgt, src2_l);
- src2_h = __lasx_xvmul_h(wgt, src2_h);
- src3_l = __lasx_xvmul_h(wgt, src3_l);
- src3_h = __lasx_xvmul_h(wgt, src3_h);
-
- DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
- src1_h, offset, src0_l, src0_h, src1_l, src1_h);
- DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
- src3_h, offset, src2_l, src2_h, src3_l, src3_h);
-
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src1_l = __lasx_xvmaxi_h(src1_l, 0);
- src1_h = __lasx_xvmaxi_h(src1_h, 0);
- src2_l = __lasx_xvmaxi_h(src2_l, 0);
- src2_h = __lasx_xvmaxi_h(src2_h, 0);
- src3_l = __lasx_xvmaxi_h(src3_l, 0);
- src3_h = __lasx_xvmaxi_h(src3_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
- src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
- src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
- src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
- src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
- src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
- DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
- src3_h, src3_l, src0, src1, src2, src3);
-
- __lasx_xvstelm_d(src0, src, 0, 0);
- __lasx_xvstelm_d(src0, src + stride, 0, 1);
- __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
- src += stride_4x;
- __lasx_xvstelm_d(src1, src, 0, 0);
- __lasx_xvstelm_d(src1, src + stride, 0, 1);
- __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
- src += stride_4x;
- __lasx_xvstelm_d(src2, src, 0, 0);
- __lasx_xvstelm_d(src2, src + stride, 0, 1);
- __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
- src += stride_4x;
- __lasx_xvstelm_d(src3, src, 0, 0);
- __lasx_xvstelm_d(src3, src + stride, 0, 1);
- __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
- __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
-}
-
-void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
- int height, int log2_denom,
- int weight_src, int offset)
-{
- if (4 == height) {
- avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
- } else if (8 == height) {
- avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
- } else {
- avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
- }
-}
-
-static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t offset_in)
-{
- uint32_t offset_val;
- __m256i wgt, zero = __lasx_xvldi(0);
- __m256i src0, tmp0, tmp1, denom, offset;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
- src0 = __lasx_xvilvl_w(tmp1, tmp0);
- src0 = __lasx_xvilvl_b(zero, src0);
- src0 = __lasx_xvmul_h(wgt, src0);
- src0 = __lasx_xvsadd_h(src0, offset);
- src0 = __lasx_xvmaxi_h(src0, 0);
- src0 = __lasx_xvssrlrn_bu_h(src0, denom);
- __lasx_xvstelm_w(src0, src, 0, 0);
- __lasx_xvstelm_w(src0, src + stride, 0, 1);
-}
-
-static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t offset_in)
-{
- __m256i wgt;
- __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
- uint32_t offset_val;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
- src0 = __lasx_xvilvl_w(tmp1, tmp0);
- src0 = __lasx_vext2xv_hu_bu(src0);
- src0 = __lasx_xvmul_h(wgt, src0);
- src0 = __lasx_xvsadd_h(src0, offset);
- src0 = __lasx_xvmaxi_h(src0, 0);
- src0 = __lasx_xvssrlrn_bu_h(src0, denom);
- __lasx_xvstelm_w(src0, src, 0, 0);
- __lasx_xvstelm_w(src0, src + stride, 0, 1);
- __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
- __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
-}
-
-static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
- int32_t log2_denom, int32_t weight_src,
- int32_t offset_in)
-{
- __m256i src0, src0_h, src0_l;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
- __m256i wgt, zero = __lasx_xvldi(0);
- uint32_t offset_val;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_4x = stride << 2;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- offset_val = (unsigned) offset_in << log2_denom;
-
- wgt = __lasx_xvreplgr2vr_h(weight_src);
- offset = __lasx_xvreplgr2vr_h(offset_val);
- denom = __lasx_xvreplgr2vr_h(log2_denom);
-
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp0, tmp1, tmp2, tmp3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
- src, stride_3x, tmp4, tmp5, tmp6, tmp7);
- src -= stride_4x;
- DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
- tmp5, tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
- src0_l = __lasx_xvilvl_b(zero, src0);
- src0_h = __lasx_xvilvh_b(zero, src0);
- src0_l = __lasx_xvmul_h(wgt, src0_l);
- src0_h = __lasx_xvmul_h(wgt, src0_h);
- src0_l = __lasx_xvsadd_h(src0_l, offset);
- src0_h = __lasx_xvsadd_h(src0_h, offset);
- src0_l = __lasx_xvmaxi_h(src0_l, 0);
- src0_h = __lasx_xvmaxi_h(src0_h, 0);
- src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
- src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
- __lasx_xvstelm_w(src0_l, src, 0, 0);
- __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
- __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
- __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
- src += stride_4x;
- __lasx_xvstelm_w(src0_l, src, 0, 4);
- __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
- __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
- __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
-}
-
-void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
- int height, int log2_denom,
- int weight_src, int offset)
-{
- if (2 == height) {
- avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
- } else if (4 == height) {
- avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
- } else {
- avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
- }
-}
-
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
__m256i src0, dst0, dst1, dst2, dst3, zero;
diff --git a/libavcodec/loongarch/h264dsp_loongarch.h b/libavcodec/loongarch/h264dsp_loongarch.h
index 28dca2b537..e17522dfe0 100644
--- a/libavcodec/loongarch/h264dsp_loongarch.h
+++ b/libavcodec/loongarch/h264dsp_loongarch.h
@@ -47,6 +47,50 @@ void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
int16_t *block, int32_t dst_stride,
const uint8_t nzc[15 * 8]);
+void ff_h264_h_lpf_luma_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_luma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_v_lpf_luma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_h_lpf_chroma_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_chroma_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_h_lpf_chroma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_h264_v_lpf_chroma_intra_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int alpha, int beta);
+void ff_biweight_h264_pixels16_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int height,
+ int log2_denom, int weight_dst,
+ int weight_src, int offset_in);
+void ff_biweight_h264_pixels8_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int height,
+ int log2_denom, int weight_dst,
+ int weight_src, int offset);
+void ff_biweight_h264_pixels4_8_lsx(uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int height,
+ int log2_denom, int weight_dst,
+ int weight_src, int offset);
+void ff_weight_h264_pixels16_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int height, int log2_denom,
+ int weight_src, int offset_in);
+void ff_weight_h264_pixels8_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int height, int log2_denom,
+ int weight_src, int offset);
+void ff_weight_h264_pixels4_8_lsx(uint8_t *src, ptrdiff_t stride,
+ int height, int log2_denom,
+ int weight_src, int offset);
+void ff_h264_add_pixels4_8_lsx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_add_pixels8_8_lsx(uint8_t *_dst, int16_t *_src, int stride);
+void ff_h264_loop_filter_strength_lsx(int16_t bS[2][4][4], uint8_t nnz[40],
+ int8_t ref[2][40], int16_t mv[2][40][2],
+ int bidir, int edges, int step,
+ int mask_mv0, int mask_mv1, int field);
+
#if HAVE_LASX
void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
@@ -56,24 +100,12 @@ void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
int alpha, int beta);
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
int alpha, int beta);
-void ff_h264_h_lpf_chroma_8_lasx(uint8_t *src, ptrdiff_t stride,
- int alpha, int beta, int8_t *tc0);
-void ff_h264_v_lpf_chroma_8_lasx(uint8_t *src, ptrdiff_t stride,
- int alpha, int beta, int8_t *tc0);
-void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
- int alpha, int beta);
-void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *src, ptrdiff_t stride,
- int alpha, int beta);
-void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
+void ff_biweight_h264_pixels16_8_lasx(unsigned char *dst, unsigned char *src,
+ long int stride, int height,
int log2_denom, int weight_dst,
int weight_src, int offset_in);
-void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
- int log2_denom, int weight_dst,
- int weight_src, int offset);
-void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
- ptrdiff_t stride, int height,
+void ff_biweight_h264_pixels8_8_lasx(unsigned char *dst, unsigned char *src,
+ long int stride, int height,
int log2_denom, int weight_dst,
int weight_src, int offset);
void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
@@ -82,9 +114,6 @@ void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
int height, int log2_denom,
int weight_src, int offset);
-void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
- int height, int log2_denom,
- int weight_src, int offset);
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
--
2.20.1
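For reference, the per-pixel math that the ff_weight_h264_pixels*_8_lsx and
ff_biweight_h264_pixels*_8_lsx routines declared above vectorize is small. A
minimal scalar sketch of the 8-bit unidirectional case, following the LASX
intrinsics removed in this patch (the helper name below is illustrative, not
part of the patch):

    static inline uint8_t weight_pixel_8(uint8_t px, int log2_denom,
                                         int weight, int offset)
    {
        /* offset is pre-scaled, as in the removed code:
         * offset_val = (unsigned) offset_in << log2_denom */
        int v = px * weight + (offset << log2_denom);
        if (v < 0)                                   /* xvmaxi_h(.., 0) */
            v = 0;
        /* rounding right shift with unsigned saturation (xvssrlrn_bu_h) */
        v = (v + ((1 << log2_denom) >> 1)) >> log2_denom;
        return v > 255 ? 255 : v;
    }

The biweight variants combine two weighted samples (dst and src) plus a
biased offset before a similar rounding shift and clamp.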
* [FFmpeg-devel] [PATCH v5 3/7] avcodec/la: Add LSX optimization for h264 chroma and intrapred.
2023-05-25 7:24 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 1/7] avcodec/la: add LSX optimization for h264 idct Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 2/7] avcodec/la: Add LSX optimization for loop filter Hao Chen
@ 2023-05-25 7:24 ` Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 4/7] avcodec/la: Add LSX optimization for h264 qpel Hao Chen
` (3 subsequent siblings)
6 siblings, 0 replies; 17+ messages in thread
From: Hao Chen @ 2023-05-25 7:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lu Wang
From: Lu Wang <wanglu@loongson.cn>
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 199fps
after: 214fps
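The new h264chroma.S added below implements the usual eighth-pel bilinear
chroma filter with weights A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and
D = x*y (the A..E comments in the assembly). A scalar sketch of the put path
for a w x h block, assuming 8-bit input (the function below is illustrative
only; the avg variants additionally average the result with the bytes already
in dst, and the assembly takes a cheaper two-tap path when x or y is zero,
using E = B + C):

    static void put_chroma_wxh(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int w, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y), B = x * (8 - y);
        const int C = (8 - x) * y,       D = x * y;

        for (int j = 0; j < h; j++) {
            for (int i = 0; i < w; i++)
                dst[i] = (A * src[i]          + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] +
                          32) >> 6;            /* vsrarni.b.h ..., 6 */
            dst += stride;
            src += stride;
        }
    }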
---
libavcodec/loongarch/Makefile | 4 +-
.../loongarch/h264_intrapred_init_loongarch.c | 18 +-
libavcodec/loongarch/h264_intrapred_lasx.c | 121 --
...pred_lasx.h => h264_intrapred_loongarch.h} | 12 +-
libavcodec/loongarch/h264chroma.S | 966 +++++++++++++
.../loongarch/h264chroma_init_loongarch.c | 10 +-
libavcodec/loongarch/h264chroma_lasx.c | 1280 -----------------
libavcodec/loongarch/h264chroma_lasx.h | 36 -
libavcodec/loongarch/h264chroma_loongarch.h | 41 +
libavcodec/loongarch/h264intrapred.S | 299 ++++
10 files changed, 1342 insertions(+), 1445 deletions(-)
delete mode 100644 libavcodec/loongarch/h264_intrapred_lasx.c
rename libavcodec/loongarch/{h264_intrapred_lasx.h => h264_intrapred_loongarch.h} (70%)
create mode 100644 libavcodec/loongarch/h264chroma.S
delete mode 100644 libavcodec/loongarch/h264chroma_lasx.c
delete mode 100644 libavcodec/loongarch/h264chroma_lasx.h
create mode 100644 libavcodec/loongarch/h264chroma_loongarch.h
create mode 100644 libavcodec/loongarch/h264intrapred.S
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 111bc23e4e..a563055161 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -9,11 +9,9 @@ OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_init_loongarch.o
OBJS-$(CONFIG_IDCTDSP) += loongarch/idctdsp_init_loongarch.o
OBJS-$(CONFIG_VIDEODSP) += loongarch/videodsp_init.o
OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
-LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
loongarch/h264_deblock_lasx.o
-LASX-OBJS-$(CONFIG_H264PRED) += loongarch/h264_intrapred_lasx.o
LASX-OBJS-$(CONFIG_VC1_DECODER) += loongarch/vc1dsp_lasx.o
LASX-OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_lasx.o
LASX-OBJS-$(CONFIG_IDCTDSP) += loongarch/simple_idct_lasx.o \
@@ -33,3 +31,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
loongarch/h264idct_loongarch.o \
loongarch/h264dsp.o
+LSX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma.o
+LSX-OBJS-$(CONFIG_H264PRED) += loongarch/h264intrapred.o
diff --git a/libavcodec/loongarch/h264_intrapred_init_loongarch.c b/libavcodec/loongarch/h264_intrapred_init_loongarch.c
index 12620bd842..c415fa30da 100644
--- a/libavcodec/loongarch/h264_intrapred_init_loongarch.c
+++ b/libavcodec/loongarch/h264_intrapred_init_loongarch.c
@@ -21,7 +21,7 @@
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264pred.h"
-#include "h264_intrapred_lasx.h"
+#include "h264_intrapred_loongarch.h"
av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
const int bit_depth,
@@ -30,6 +30,22 @@ av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
int cpu_flags = av_get_cpu_flags();
if (bit_depth == 8) {
+ if (have_lsx(cpu_flags)) {
+ if (chroma_format_idc <= 1) {
+ }
+ if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
+ } else {
+ if (chroma_format_idc <= 1) {
+ }
+ if (codec_id == AV_CODEC_ID_SVQ3) {
+ h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_svq3_8_lsx;
+ } else if (codec_id == AV_CODEC_ID_RV40) {
+ h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_rv40_8_lsx;
+ } else {
+ h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_h264_8_lsx;
+ }
+ }
+ }
if (have_lasx(cpu_flags)) {
if (chroma_format_idc <= 1) {
}
diff --git a/libavcodec/loongarch/h264_intrapred_lasx.c b/libavcodec/loongarch/h264_intrapred_lasx.c
deleted file mode 100644
index c38cd611b8..0000000000
--- a/libavcodec/loongarch/h264_intrapred_lasx.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
- * Contributed by Hao Chen <chenhao@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264_intrapred_lasx.h"
-
-#define PRED16X16_PLANE \
- ptrdiff_t stride_1, stride_2, stride_3, stride_4, stride_5, stride_6; \
- ptrdiff_t stride_8, stride_15; \
- int32_t res0, res1, res2, res3, cnt; \
- uint8_t *src0, *src1; \
- __m256i reg0, reg1, reg2, reg3, reg4; \
- __m256i tmp0, tmp1, tmp2, tmp3; \
- __m256i shuff = {0x0B040A0509060807, 0x0F000E010D020C03, 0, 0}; \
- __m256i mult = {0x0004000300020001, 0x0008000700060005, 0, 0}; \
- __m256i int_mult1 = {0x0000000100000000, 0x0000000300000002, \
- 0x0000000500000004, 0x0000000700000006}; \
- \
- stride_1 = -stride; \
- stride_2 = stride << 1; \
- stride_3 = stride_2 + stride; \
- stride_4 = stride_2 << 1; \
- stride_5 = stride_4 + stride; \
- stride_6 = stride_3 << 1; \
- stride_8 = stride_4 << 1; \
- stride_15 = (stride_8 << 1) - stride; \
- src0 = src - 1; \
- src1 = src0 + stride_8; \
- \
- reg0 = __lasx_xvldx(src0, -stride); \
- reg1 = __lasx_xvldx(src, (8 - stride)); \
- reg0 = __lasx_xvilvl_d(reg1, reg0); \
- reg0 = __lasx_xvshuf_b(reg0, reg0, shuff); \
- reg0 = __lasx_xvhsubw_hu_bu(reg0, reg0); \
- reg0 = __lasx_xvmul_h(reg0, mult); \
- res1 = (src1[0] - src0[stride_6]) + \
- 2 * (src1[stride] - src0[stride_5]) + \
- 3 * (src1[stride_2] - src0[stride_4]) + \
- 4 * (src1[stride_3] - src0[stride_3]) + \
- 5 * (src1[stride_4] - src0[stride_2]) + \
- 6 * (src1[stride_5] - src0[stride]) + \
- 7 * (src1[stride_6] - src0[0]) + \
- 8 * (src0[stride_15] - src0[stride_1]); \
- reg0 = __lasx_xvhaddw_w_h(reg0, reg0); \
- reg0 = __lasx_xvhaddw_d_w(reg0, reg0); \
- reg0 = __lasx_xvhaddw_q_d(reg0, reg0); \
- res0 = __lasx_xvpickve2gr_w(reg0, 0); \
-
-#define PRED16X16_PLANE_END \
- res2 = (src0[stride_15] + src[15 - stride] + 1) << 4; \
- res3 = 7 * (res0 + res1); \
- res2 -= res3; \
- reg0 = __lasx_xvreplgr2vr_w(res0); \
- reg1 = __lasx_xvreplgr2vr_w(res1); \
- reg2 = __lasx_xvreplgr2vr_w(res2); \
- reg3 = __lasx_xvmul_w(reg0, int_mult1); \
- reg4 = __lasx_xvslli_w(reg0, 3); \
- reg4 = __lasx_xvadd_w(reg4, reg3); \
- for (cnt = 8; cnt--;) { \
- tmp0 = __lasx_xvadd_w(reg2, reg3); \
- tmp1 = __lasx_xvadd_w(reg2, reg4); \
- tmp0 = __lasx_xvssrani_hu_w(tmp1, tmp0, 5); \
- tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); \
- reg2 = __lasx_xvadd_w(reg2, reg1); \
- tmp2 = __lasx_xvadd_w(reg2, reg3); \
- tmp3 = __lasx_xvadd_w(reg2, reg4); \
- tmp1 = __lasx_xvssrani_hu_w(tmp3, tmp2, 5); \
- tmp1 = __lasx_xvpermi_d(tmp1, 0xD8); \
- tmp0 = __lasx_xvssrani_bu_h(tmp1, tmp0, 0); \
- reg2 = __lasx_xvadd_w(reg2, reg1); \
- __lasx_xvstelm_d(tmp0, src, 0, 0); \
- __lasx_xvstelm_d(tmp0, src, 8, 2); \
- src += stride; \
- __lasx_xvstelm_d(tmp0, src, 0, 1); \
- __lasx_xvstelm_d(tmp0, src, 8, 3); \
- src += stride; \
- }
-
-
-void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
- PRED16X16_PLANE
- res0 = (5 * res0 + 32) >> 6;
- res1 = (5 * res1 + 32) >> 6;
- PRED16X16_PLANE_END
-}
-
-void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
- PRED16X16_PLANE
- res0 = (res0 + (res0 >> 2)) >> 4;
- res1 = (res1 + (res1 >> 2)) >> 4;
- PRED16X16_PLANE_END
-}
-
-void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
-{
- PRED16X16_PLANE
- cnt = (5 * (res0/4)) / 16;
- res0 = (5 * (res1/4)) / 16;
- res1 = cnt;
- PRED16X16_PLANE_END
-}
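For readers tracking the behaviour being moved to assembly: after the
per-codec scaling of the two gradients (res0/res1 above), the 16x16 fill done
by PRED16X16_PLANE_END is equivalent to the following scalar loop (a sketch of
the removed code, not a new API):

    /* res2 already folds in the +16 rounding term:
     * res2 = 16 * (left[15] + top[15] + 1) - 7 * (res0 + res1) */
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            src[y * stride + x] =
                av_clip_uint8((res2 + x * res0 + y * res1) >> 5);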
diff --git a/libavcodec/loongarch/h264_intrapred_lasx.h b/libavcodec/loongarch/h264_intrapred_loongarch.h
similarity index 70%
rename from libavcodec/loongarch/h264_intrapred_lasx.h
rename to libavcodec/loongarch/h264_intrapred_loongarch.h
index 0c2653300c..39be87ee9f 100644
--- a/libavcodec/loongarch/h264_intrapred_lasx.h
+++ b/libavcodec/loongarch/h264_intrapred_loongarch.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Hao Chen <chenhao@loongson.cn>
*
* This file is part of FFmpeg.
@@ -19,13 +19,17 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
-#define AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
+#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
#include "libavcodec/avcodec.h"
+void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride);
+void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride);
+
void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride);
-#endif // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
+#endif // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264chroma.S b/libavcodec/loongarch/h264chroma.S
new file mode 100644
index 0000000000..353b8d004b
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma.S
@@ -0,0 +1,966 @@
+/*
+ * Loongson LSX/LASX optimized h264chroma
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_put_h264_chroma_mc8_lsx
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A
+ mul.d t4, a4, t2 // B
+ mul.d t5, t1, a5 // C
+ mul.d t6, a4, a5 // D
+ add.d t0, t4, t5 // E
+ vreplgr2vr.b vr0, t3
+ vreplgr2vr.b vr1, t4
+ vreplgr2vr.b vr2, t5
+ vreplgr2vr.b vr3, t6
+ vreplgr2vr.b vr4, t0
+ slli.d t2, a2, 1
+ add.d t3, t2, a2
+ slli.d t4, a2, 2
+
+ bge zero, t6, .ENDLOOP_D
+ move t1, a3
+ vilvl.b vr9, vr1, vr0
+ vilvl.b vr10, vr3, vr2
+.LOOP_D:
+ vld vr5, a1, 0
+ vld vr6, a1, 1
+ add.d a1, a1, a2
+ vld vr7, a1, 0
+ vld vr8, a1, 1
+ vilvl.b vr11, vr6, vr5
+ vilvl.b vr12, vr8, vr7
+ vmulwev.h.bu vr13, vr9, vr11
+ vmaddwod.h.bu vr13, vr9, vr11
+ vmulwev.h.bu vr14, vr10, vr12
+ vmaddwod.h.bu vr14, vr10, vr12
+ vadd.h vr13, vr13, vr14
+ vsrarni.b.h vr13, vr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vld vr6, a1, 1
+ vilvl.b vr11, vr8, vr7
+ vilvl.b vr12, vr6, vr5
+ vmulwev.h.bu vr13, vr9, vr11
+ vmaddwod.h.bu vr13, vr9, vr11
+ vmulwev.h.bu vr14, vr10, vr12
+ vmaddwod.h.bu vr14, vr10, vr12
+ vadd.h vr13, vr13, vr14
+ vsrarni.b.h vr13, vr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr7, a1, 0
+ vld vr8, a1, 1
+ vilvl.b vr11, vr6, vr5
+ vilvl.b vr12, vr8, vr7
+ vmulwev.h.bu vr13, vr9, vr11
+ vmaddwod.h.bu vr13, vr9, vr11
+ vmulwev.h.bu vr14, vr10, vr12
+ vmaddwod.h.bu vr14, vr10, vr12
+ vadd.h vr13, vr13, vr14
+ vsrarni.b.h vr13, vr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vld vr6, a1, 1
+ vilvl.b vr11, vr8, vr7
+ vilvl.b vr12, vr6, vr5
+ vmulwev.h.bu vr13, vr9, vr11
+ vmaddwod.h.bu vr13, vr9, vr11
+ vmulwev.h.bu vr14, vr10, vr12
+ vmaddwod.h.bu vr14, vr10, vr12
+ vadd.h vr13, vr13, vr14
+ vsrarni.b.h vr13, vr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP_D
+ b .ENDLOOP
+.ENDLOOP_D:
+
+ bge zero, t0, .ENDLOOP_E
+ move t1, a3
+ li.d t7, 1
+ slt t8, zero, t5
+ maskeqz t5, a2, t8
+ masknez t7, t7, t8
+ or t7, t7, t5
+ vilvl.b vr7, vr4, vr0
+.LOOP_E:
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP_E
+ b .ENDLOOP
+.ENDLOOP_E:
+
+ move t1, a3
+.LOOP:
+ vld vr5, a1, 0
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, a2
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, t2
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, t3
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, t4
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP
+.ENDLOOP:
+endfunc
+
+/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_avg_h264_chroma_mc8_lsx
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A
+ mul.d t4, a4, t2 // B
+ mul.d t5, t1, a5 // C
+ mul.d t6, a4, a5 // D
+ add.d t0, t4, t5 // E
+ vreplgr2vr.b vr0, t3
+ vreplgr2vr.b vr1, t4
+ vreplgr2vr.b vr2, t5
+ vreplgr2vr.b vr3, t6
+ vreplgr2vr.b vr4, t0
+ slli.d t2, a2, 1
+ add.d t3, t2, a2
+ slli.d t4, a2, 2
+
+ bge zero, t6, .ENDLOOPD
+ move t1, a3
+ vilvl.b vr9, vr1, vr0
+ vilvl.b vr10, vr3, vr2
+.LOOPD:
+ vld vr5, a1, 0
+ vld vr6, a1, 1
+ add.d a1, a1, a2
+ vld vr7, a1, 0
+ vld vr8, a1, 1
+ vld vr11, a0, 0
+ vilvl.b vr12, vr6, vr5
+ vilvl.b vr13, vr8, vr7
+ vmulwev.h.bu vr14, vr9, vr12
+ vmaddwod.h.bu vr14, vr9, vr12
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ vadd.h vr14, vr14, vr15
+ vsrari.h vr14, vr14, 6
+ vsllwil.hu.bu vr11, vr11, 0
+ vadd.h vr11, vr14, vr11
+ vsrarni.b.h vr11, vr11, 1
+ vstelm.d vr11, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vld vr6, a1, 1
+ vld vr11, a0, 0
+ vilvl.b vr12, vr8, vr7
+ vilvl.b vr13, vr6, vr5
+ vmulwev.h.bu vr14, vr9, vr12
+ vmaddwod.h.bu vr14, vr9, vr12
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ vadd.h vr14, vr14, vr15
+ vsrari.h vr14, vr14, 6
+ vsllwil.hu.bu vr11, vr11, 0
+ vadd.h vr11, vr14, vr11
+ vsrarni.b.h vr11, vr11, 1
+ vstelm.d vr11, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr7, a1, 0
+ vld vr8, a1, 1
+ vld vr11, a0, 0
+ vilvl.b vr12, vr6, vr5
+ vilvl.b vr13, vr8, vr7
+ vmulwev.h.bu vr14, vr9, vr12
+ vmaddwod.h.bu vr14, vr9, vr12
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ vadd.h vr14, vr14, vr15
+ vsrari.h vr14, vr14, 6
+ vsllwil.hu.bu vr11, vr11, 0
+ vadd.h vr11, vr14, vr11
+ vsrarni.b.h vr11, vr11, 1
+ vstelm.d vr11, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vld vr6, a1, 1
+ vld vr11, a0, 0
+ vilvl.b vr12, vr8, vr7
+ vilvl.b vr13, vr6, vr5
+ vmulwev.h.bu vr14, vr9, vr12
+ vmaddwod.h.bu vr14, vr9, vr12
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ vadd.h vr14, vr14, vr15
+ vsrari.h vr14, vr14, 6
+ vsllwil.hu.bu vr11, vr11, 0
+ vadd.h vr11, vr14, vr11
+ vsrarni.b.h vr11, vr11, 1
+ vstelm.d vr11, a0, 0, 0
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPD
+ b .ENDLOOPELSE
+.ENDLOOPD:
+
+ bge zero, t0, .ENDLOOPE
+ move t1, a3
+ li.d t7, 1
+ slt t8, zero, t5
+ maskeqz t5, a2, t8
+ masknez t7, t7, t8
+ or t7, t7, t5
+ vilvl.b vr7, vr4, vr0
+.LOOPE:
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vld vr8, a0, 0
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vld vr8, a0, 0
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vld vr8, a0, 0
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vld vr8, a0, 0
+ vilvl.b vr5, vr6, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPE
+ b .ENDLOOPELSE
+.ENDLOOPE:
+
+ move t1, a3
+.LOOPELSE:
+ vld vr5, a1, 0
+ vld vr8, a0, 0
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vilvl.h vr6, vr7, vr6
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, a2
+ vld vr8, a0, 0
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vilvl.h vr6, vr7, vr6
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, t2
+ vld vr8, a0, 0
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vilvl.h vr6, vr7, vr6
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vldx vr5, a1, t3
+ vld vr8, a0, 0
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vilvl.h vr6, vr7, vr6
+ vsrari.h vr6, vr6, 6
+ vsllwil.hu.bu vr8, vr8, 0
+ vadd.h vr8, vr6, vr8
+ vsrarni.b.h vr8, vr8, 1
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ add.d a1, a1, t4
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPELSE
+.ENDLOOPELSE:
+endfunc
+
+/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_put_h264_chroma_mc4_lsx
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A
+ mul.d t4, a4, t2 // B
+ mul.d t5, t1, a5 // C
+ mul.d t6, a4, a5 // D
+ add.d t0, t4, t5 // E
+ slli.d t8, a2, 1
+ vreplgr2vr.b vr0, t3
+ vreplgr2vr.b vr1, t4
+ vreplgr2vr.b vr2, t5
+ vreplgr2vr.b vr3, t6
+ vreplgr2vr.b vr4, t0
+
+ bge zero, t6, .ENDPUT_D
+ move t1, a3
+ vilvl.b vr9, vr1, vr0
+ vilvl.b vr10, vr3, vr2
+.PUT_D:
+ vld vr5, a1, 0
+ vld vr6, a1, 1
+ add.d a1, a1, a2
+ vld vr7, a1, 0
+ vld vr8, a1, 1
+ add.d a1, a1, a2
+ vld vr11, a1, 0
+ vld vr12, a1, 1
+ vilvl.b vr5, vr6, vr5
+ vilvl.b vr7, vr8, vr7
+ vilvl.b vr13, vr12, vr11
+ vilvl.d vr5, vr7, vr5
+ vilvl.d vr13, vr13, vr7
+ vmulwev.h.bu vr14, vr9, vr5
+ vmaddwod.h.bu vr14, vr9, vr5
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ vadd.h vr14, vr14, vr15
+ vsrarni.b.h vr14, vr14, 6
+ vstelm.w vr14, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr14, a0, 0, 1
+ add.d a0, a0, a2
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT_D
+ b .ENDPUT
+.ENDPUT_D:
+
+ bge zero, t0, .ENDPUT_E
+ move t1, a3
+ li.d t7, 1
+ slt t8, zero, t5
+ maskeqz t5, a2, t8
+ masknez t7, t7, t8
+ or t7, t7, t5
+ vilvl.b vr7, vr4, vr0
+.PUT_E:
+ vld vr5, a1, 0
+ vldx vr6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ add.d a1, a1, a2
+ vld vr8, a1, 0
+ vldx vr9, a1, t7
+ vilvl.b vr8, vr9, vr8
+ vilvl.d vr5, vr8, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.w vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr6, a0, 0, 1
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT_E
+ b .ENDPUT
+.ENDPUT_E:
+
+ move t1, a3
+.PUT:
+ vld vr5, a1, 0
+ vldx vr8, a1, a2
+ vilvl.w vr5, vr8, vr5
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vsrarni.b.h vr7, vr7, 6
+ vilvl.b vr6, vr7, vr6
+ vstelm.w vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr6, a0, 0, 1
+ add.d a0, a0, a2
+ add.d a1, a1, t8
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT
+.ENDPUT:
+endfunc
+
+/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_put_h264_chroma_mc8_lasx
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A
+ mul.d t4, a4, t2 // B
+ mul.d t5, t1, a5 // C
+ mul.d t6, a4, a5 // D
+ add.d t0, t4, t5 // E
+ xvreplgr2vr.b xr0, t3
+ xvreplgr2vr.b xr1, t4
+ xvreplgr2vr.b xr2, t5
+ xvreplgr2vr.b xr3, t6
+ xvreplgr2vr.b xr4, t0
+ slli.d t2, a2, 1
+ add.d t3, t2, a2
+ slli.d t4, a2, 2
+
+ bge zero, t6, .ENDLOOP_DA
+ move t1, a3
+ xvilvl.b xr9, xr1, xr0
+ xvilvl.b xr10, xr3, xr2
+.LOOP_DA:
+ fld.d f5, a1, 0
+ fld.d f6, a1, 1
+ add.d a1, a1, a2
+ fld.d f7, a1, 0
+ fld.d f8, a1, 1
+ add.d a1, a1, a2
+ fld.d f13, a1, 0
+ fld.d f14, a1, 1
+ add.d a1, a1, a2
+ fld.d f15, a1, 0
+ fld.d f16, a1, 1
+ add.d a1, a1, a2
+ fld.d f17, a1, 0
+ fld.d f18, a1, 1
+ vilvl.b vr11, vr6, vr5
+ vilvl.b vr12, vr8, vr7
+ vilvl.b vr14, vr14, vr13
+ vilvl.b vr15, vr16, vr15
+ vilvl.b vr16, vr18, vr17
+ xvpermi.q xr11, xr12, 0x02
+ xvpermi.q xr12, xr14, 0x02
+ xvpermi.q xr14, xr15, 0x02
+ xvpermi.q xr15, xr16, 0x02
+
+ xvmulwev.h.bu xr19, xr9, xr11
+ xvmaddwod.h.bu xr19, xr9, xr11
+ xvmulwev.h.bu xr20, xr10, xr12
+ xvmaddwod.h.bu xr20, xr10, xr12
+ xvadd.h xr21, xr19, xr20
+ xvsrarni.b.h xr21, xr21, 6
+ vstelm.d vr21, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr21, a0, 0, 2
+ add.d a0, a0, a2
+ xvmulwev.h.bu xr13, xr9, xr14
+ xvmaddwod.h.bu xr13, xr9, xr14
+ xvmulwev.h.bu xr14, xr10, xr15
+ xvmaddwod.h.bu xr14, xr10, xr15
+ xvadd.h xr13, xr13, xr14
+ xvsrarni.b.h xr13, xr13, 6
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr13, a0, 0, 2
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP_DA
+ b .ENDLOOPA
+.ENDLOOP_DA:
+
+ bge zero, t0, .ENDLOOP_EA
+ move t1, a3
+ li.d t7, 1
+ slt t8, zero, t5
+ maskeqz t5, a2, t8
+ masknez t7, t7, t8
+ or t7, t7, t5
+ xvilvl.b xr7, xr4, xr0
+.LOOP_EA:
+ fld.d f5, a1, 0
+ fldx.d f6, a1, t7
+ add.d a1, a1, a2
+ fld.d f9, a1, 0
+ fldx.d f10, a1, t7
+ add.d a1, a1, a2
+ fld.d f11, a1, 0
+ fldx.d f12, a1, t7
+ add.d a1, a1, a2
+ fld.d f13, a1, 0
+ fldx.d f14, a1, t7
+ vilvl.b vr5, vr6, vr5
+ vilvl.b vr9, vr10, vr9
+ vilvl.b vr11, vr12, vr11
+ vilvl.b vr13, vr14, vr13
+ xvpermi.q xr5, xr9, 0x02
+ xvpermi.q xr11, xr13, 0x02
+
+ xvmulwev.h.bu xr8, xr7, xr5
+ xvmaddwod.h.bu xr8, xr7, xr5
+ xvmulwev.h.bu xr6, xr7, xr11
+ xvmaddwod.h.bu xr6, xr7, xr11
+ xvsrarni.b.h xr8, xr8, 6
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr8, a0, 0, 2
+ add.d a0, a0, a2
+ xvsrarni.b.h xr6, xr6, 6
+ vstelm.d vr6, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr6, a0, 0, 2
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOP_EA
+ b .ENDLOOPA
+.ENDLOOP_EA:
+
+ move t1, a3
+.LOOPA:
+ fld.d f5, a1, 0
+ fldx.d f6, a1, a2
+ fldx.d f7, a1, t2
+ fldx.d f8, a1, t3
+ vilvl.d vr5, vr6, vr5
+ vilvl.d vr7, vr8, vr7
+ xvpermi.q xr5, xr7, 0x02
+ xvmulwev.h.bu xr6, xr0, xr5
+ xvmulwod.h.bu xr7, xr0, xr5
+ xvilvl.h xr8, xr7, xr6
+ xvilvh.h xr9, xr7, xr6
+ xvsrarni.b.h xr9, xr8, 6
+ vstelm.d vr9, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr9, a0, 0, 1
+ add.d a0, a0, a2
+ xvstelm.d xr9, a0, 0, 2
+ add.d a0, a0, a2
+ xvstelm.d xr9, a0, 0, 3
+ add.d a0, a0, a2
+ add.d a1, a1, t4
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPA
+.ENDLOOPA:
+endfunc
+
+/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_avg_h264_chroma_mc8_lasx
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A
+ mul.d t4, a4, t2 // B
+ mul.d t5, t1, a5 // C
+ mul.d t6, a4, a5 // D
+ add.d t0, t4, t5 // E
+ xvreplgr2vr.b xr0, t3
+ xvreplgr2vr.b xr1, t4
+ xvreplgr2vr.b xr2, t5
+ xvreplgr2vr.b xr3, t6
+ xvreplgr2vr.b xr4, t0
+ slli.d t2, a2, 1
+ add.d t3, t2, a2
+ slli.d t4, a2, 2
+
+ bge zero, t6, .ENDLOOPDA
+ move t1, a3
+ xvilvl.b xr9, xr1, xr0
+ xvilvl.b xr10, xr3, xr2
+.LOOPDA:
+ fld.d f5, a1, 0
+ fld.d f6, a1, 1
+ add.d a1, a1, a2
+ fld.d f7, a1, 0
+ fld.d f8, a1, 1
+ add.d a1, a1, a2
+ fld.d f11, a1, 0
+ fld.d f12, a1, 1
+ add.d a1, a1, a2
+ fld.d f13, a1, 0
+ fld.d f14, a1, 1
+ add.d a1, a1, a2
+ fld.d f15, a1, 0
+ fld.d f16, a1, 1
+ fld.d f17, a0, 0
+ fldx.d f18, a0, a2
+ fldx.d f19, a0, t2
+ fldx.d f20, a0, t3
+ vilvl.b vr5, vr6, vr5
+ vilvl.b vr7, vr8, vr7
+ vilvl.b vr11, vr12, vr11
+ vilvl.b vr13, vr14, vr13
+ vilvl.b vr16, vr16, vr15
+ xvpermi.q xr5, xr7, 0x02
+ xvpermi.q xr7, xr11, 0x02
+ xvpermi.q xr11, xr13, 0x02
+ xvpermi.q xr13, xr16, 0x02
+ xvpermi.q xr17, xr18, 0x02
+ xvpermi.q xr19, xr20, 0x02
+
+ xvmulwev.h.bu xr14, xr9, xr5
+ xvmaddwod.h.bu xr14, xr9, xr5
+ xvmulwev.h.bu xr15, xr10, xr7
+ xvmaddwod.h.bu xr15, xr10, xr7
+ xvadd.h xr14, xr14, xr15
+ xvsrari.h xr14, xr14, 6
+ xvsllwil.hu.bu xr17, xr17, 0
+ xvadd.h xr20, xr14, xr17
+ xvsrarni.b.h xr20, xr20, 1
+ xvstelm.d xr20, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr20, a0, 0, 2
+ add.d a0, a0, a2
+ xvmulwev.h.bu xr14, xr9, xr11
+ xvmaddwod.h.bu xr14, xr9, xr11
+ xvmulwev.h.bu xr15, xr10, xr13
+ xvmaddwod.h.bu xr15, xr10, xr13
+ xvadd.h xr14, xr14, xr15
+ xvsrari.h xr14, xr14, 6
+ xvsllwil.hu.bu xr19, xr19, 0
+ xvadd.h xr21, xr14, xr19
+ xvsrarni.b.h xr21, xr21, 1
+ xvstelm.d xr21, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr21, a0, 0, 2
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPDA
+ b .ENDLOOPELSEA
+.ENDLOOPDA:
+
+ bge zero, t0, .ENDLOOPEA
+ move t1, a3
+ li.d t7, 1
+ slt t8, zero, t5
+ maskeqz t5, a2, t8
+ masknez t7, t7, t8
+ or t7, t7, t5
+ xvilvl.b xr7, xr4, xr0
+.LOOPEA:
+ fld.d f5, a1, 0
+ fldx.d f6, a1, t7
+ add.d a1, a1, a2
+ fld.d f8, a1, 0
+ fldx.d f9, a1, t7
+ add.d a1, a1, a2
+ fld.d f10, a1, 0
+ fldx.d f11, a1, t7
+ add.d a1, a1, a2
+ fld.d f12, a1, 0
+ fldx.d f13, a1, t7
+ add.d a1, a1, a2
+ fld.d f14, a0, 0
+ fldx.d f15, a0, a2
+ fldx.d f16, a0, t2
+ fldx.d f17, a0, t3
+ vilvl.b vr5, vr6, vr5
+ vilvl.b vr8, vr9, vr8
+ vilvl.b vr10, vr11, vr10
+ vilvl.b vr12, vr13, vr12
+ xvpermi.q xr5, xr8, 0x02
+ xvpermi.q xr10, xr12, 0x02
+ xvpermi.q xr14, xr15, 0x02
+ xvpermi.q xr16, xr17, 0x02
+
+ xvmulwev.h.bu xr6, xr7, xr5
+ xvmaddwod.h.bu xr6, xr7, xr5
+ xvsrari.h xr6, xr6, 6
+ xvsllwil.hu.bu xr14, xr14, 0
+ xvadd.h xr8, xr6, xr14
+ xvsrarni.b.h xr8, xr8, 1
+ xvstelm.d xr8, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr8, a0, 0, 2
+ add.d a0, a0, a2
+ xvmulwev.h.bu xr6, xr7, xr10
+ xvmaddwod.h.bu xr6, xr7, xr10
+ xvsrari.h xr6, xr6, 6
+ xvsllwil.hu.bu xr16, xr16, 0
+ xvadd.h xr8, xr6, xr16
+ xvsrarni.b.h xr8, xr8, 1
+ xvstelm.d xr8, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr8, a0, 0, 2
+ add.d a0, a0, a2
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPEA
+ b .ENDLOOPELSEA
+.ENDLOOPEA:
+
+ move t1, a3
+.LOOPELSEA:
+ fld.d f5, a1, 0
+ fldx.d f6, a1, a2
+ fldx.d f7, a1, t2
+ fldx.d f8, a1, t3
+ fld.d f9, a0, 0
+ fldx.d f10, a0, a2
+ fldx.d f11, a0, t2
+ fldx.d f12, a0, t3
+ xvpermi.q xr5, xr6, 0x02
+ xvpermi.q xr7, xr8, 0x02
+ xvpermi.q xr9, xr10, 0x02
+ xvpermi.q xr11, xr12, 0x02
+
+ xvmulwev.h.bu xr12, xr0, xr5
+ xvmulwod.h.bu xr13, xr0, xr5
+ xvilvl.h xr12, xr13, xr12
+ xvsrari.h xr12, xr12, 6
+ xvsllwil.hu.bu xr9, xr9, 0
+ xvadd.h xr9, xr12, xr9
+ xvsrarni.b.h xr9, xr9, 1
+ xvstelm.d xr9, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr9, a0, 0, 2
+ add.d a0, a0, a2
+ xvmulwev.h.bu xr12, xr0, xr7
+ xvmulwod.h.bu xr13, xr0, xr7
+ xvilvl.h xr12, xr13, xr12
+ xvsrari.h xr12, xr12, 6
+ xvsllwil.hu.bu xr11, xr11, 0
+ xvadd.h xr13, xr12, xr11
+ xvsrarni.b.h xr13, xr13, 1
+ xvstelm.d xr13, a0, 0, 0
+ add.d a0, a0, a2
+ xvstelm.d xr13, a0, 0, 2
+ add.d a0, a0, a2
+ add.d a1, a1, t4
+
+ addi.d t1, t1, -4
+ blt zero, t1, .LOOPELSEA
+.ENDLOOPELSEA:
+endfunc
+
+/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y) */
+function ff_put_h264_chroma_mc4_lasx
+ li.d t8, 8
+ sub.d t1, t8, a4 // 8-x
+ sub.d t2, t8, a5 // 8-y
+ mul.d t3, t1, t2 // A
+ mul.d t4, a4, t2 // B
+ mul.d t5, t1, a5 // C
+ mul.d t6, a4, a5 // D
+ add.d t0, t4, t5 // E
+ slli.d t8, a2, 1
+ vreplgr2vr.b vr0, t3
+ vreplgr2vr.b vr1, t4
+ vreplgr2vr.b vr2, t5
+ vreplgr2vr.b vr3, t6
+ vreplgr2vr.b vr4, t0
+
+ bge zero, t6, .ENDPUT_DA
+ move t1, a3
+ vilvl.b vr9, vr1, vr0
+ vilvl.b vr10, vr3, vr2
+.PUT_DA:
+ fld.d f5, a1, 0
+ fld.d f6, a1, 1
+ add.d a1, a1, a2
+ fld.d f7, a1, 0
+ fld.d f8, a1, 1
+ add.d a1, a1, a2
+ fld.d f11, a1, 0
+ fld.d f12, a1, 1
+ vilvl.b vr5, vr6, vr5
+ vilvl.b vr7, vr8, vr7
+ vilvl.b vr13, vr12, vr11
+ vilvl.d vr5, vr7, vr5
+ vilvl.d vr13, vr13, vr7
+ vmulwev.h.bu vr14, vr9, vr5
+ vmaddwod.h.bu vr14, vr9, vr5
+ vmulwev.h.bu vr15, vr10, vr13
+ vmaddwod.h.bu vr15, vr10, vr13
+ xvadd.h xr14, xr14, xr15
+ vsrarni.b.h vr16, vr14, 6
+ vstelm.w vr16, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr16, a0, 0, 1
+ add.d a0, a0, a2
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT_DA
+ b .ENDPUTA
+.ENDPUT_DA:
+
+ bge zero, t0, .ENDPUT_EA
+ move t1, a3
+ li.d t7, 1
+ slt t8, zero, t5
+ maskeqz t5, a2, t8
+ masknez t7, t7, t8
+ or t7, t7, t5
+ vilvl.b vr7, vr4, vr0
+.PUT_EA:
+ fld.d f5, a1, 0
+ fldx.d f6, a1, t7
+ vilvl.b vr5, vr6, vr5
+ add.d a1, a1, a2
+ fld.d f8, a1, 0
+ fldx.d f9, a1, t7
+ vilvl.b vr8, vr9, vr8
+ vilvl.d vr5, vr8, vr5
+ vmulwev.h.bu vr6, vr7, vr5
+ vmaddwod.h.bu vr6, vr7, vr5
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.w vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr6, a0, 0, 1
+ add.d a0, a0, a2
+ add.d a1, a1, a2
+ addi.d t1, t1, -2
+ blt zero, t1, .PUT_EA
+ b .ENDPUTA
+.ENDPUT_EA:
+
+ move t1, a3
+.PUTA:
+ fld.d f5, a1, 0
+ fldx.d f8, a1, a2
+ vilvl.w vr5, vr8, vr5
+ vmulwev.h.bu vr6, vr0, vr5
+ vmulwod.h.bu vr7, vr0, vr5
+ vilvl.h vr6, vr7, vr6
+ vsrarni.b.h vr6, vr6, 6
+ vstelm.w vr6, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.w vr6, a0, 0, 1
+ add.d a0, a0, a2
+ add.d a1, a1, t8
+ addi.d t1, t1, -2
+ blt zero, t1, .PUTA
+.ENDPUTA:
+endfunc
diff --git a/libavcodec/loongarch/h264chroma_init_loongarch.c b/libavcodec/loongarch/h264chroma_init_loongarch.c
index 0ca24ecc47..40a957aad3 100644
--- a/libavcodec/loongarch/h264chroma_init_loongarch.c
+++ b/libavcodec/loongarch/h264chroma_init_loongarch.c
@@ -19,7 +19,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "h264chroma_lasx.h"
+#include "h264chroma_loongarch.h"
#include "libavutil/attributes.h"
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264chroma.h"
@@ -27,6 +27,14 @@
av_cold void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_lsx(cpu_flags)) {
+ if (bit_depth <= 8) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lsx;
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lsx;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lsx;
+ }
+ }
+
if (have_lasx(cpu_flags)) {
if (bit_depth <= 8) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lasx;
diff --git a/libavcodec/loongarch/h264chroma_lasx.c b/libavcodec/loongarch/h264chroma_lasx.c
deleted file mode 100644
index 1c0e002bdf..0000000000
--- a/libavcodec/loongarch/h264chroma_lasx.c
+++ /dev/null
@@ -1,1280 +0,0 @@
-/*
- * Loongson LASX optimized h264chroma
- *
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "h264chroma_lasx.h"
-#include "libavutil/attributes.h"
-#include "libavutil/avassert.h"
-#include "libavutil/loongarch/loongson_intrinsics.h"
-
-static const uint8_t chroma_mask_arr[64] = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
- 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
-};
-
-static av_always_inline void avc_chroma_hv_8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coef_hor0,
- uint32_t coef_hor1, uint32_t coef_ver0,
- uint32_t coef_ver1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride_2x << 1;
- __m256i src0, src1, src2, src3, src4, out;
- __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
- src0 = __lasx_xvshuf_b(src0, src0, mask);
- DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
- res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
- res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
- res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
- res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
- res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
- res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
- res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
- out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hv_8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coef_hor0,
- uint32_t coef_hor1, uint32_t coef_ver0,
- uint32_t coef_ver1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i out0, out1;
- __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
- __m256i res_vt0, res_vt1, res_vt2, res_vt3;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
- src8, src7, 0x20, src1, src3, src5, src7);
- src0 = __lasx_xvshuf_b(src0, src0, mask);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
- src7, mask, src1, src3, src5, src7);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
- coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
- res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
- res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
- res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
- res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
- res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
- res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
- res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
- res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
- res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
- DUP4_ARG3(__lasx_xvmadd_h, res_vt0, res_hz0, coeff_vt_vec1, res_vt1, res_hz1, coeff_vt_vec1,
- res_vt2, res_hz2, coeff_vt_vec1, res_vt3, res_hz3, coeff_vt_vec1,
- res_vt0, res_vt1, res_vt2, res_vt3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, out0, out1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i src0, src1, src2, src3, out;
- __m256i res0, res1;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src1, src2);
- src3 = __lasx_xvldx(src, stride_3x);
- DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
- DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-
-}
-
-static av_always_inline void avc_chroma_hz_8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7;
- __m256i out0, out1;
- __m256i res0, res1, res2, res3;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src5, src6);
- src7 = __lasx_xvldx(src, stride_3x);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
- src7, src6, 0x20, src0, src2, src4, src6);
- DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, mask,
- src6, src6, mask, src0, src2, src4, src6);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
- coeff_vec, res0, res1, res2, res3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_nonmult_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1, int32_t height)
-{
- uint32_t row;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i src0, src1, src2, src3, out;
- __m256i res0, res1;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- mask = __lasx_xvld(chroma_mask_arr, 0);
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
-
- for (row = height >> 2; row--;) {
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src0, src1, src2, src3);
- src += stride_4x;
- DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
- DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
- dst += stride_4x;
- }
-
- if ((height & 3)) {
- src0 = __lasx_xvld(src, 0);
- src1 = __lasx_xvldx(src, stride);
- src1 = __lasx_xvpermi_q(src1, src0, 0x20);
- src0 = __lasx_xvshuf_b(src1, src1, mask);
- res0 = __lasx_xvdp2_h_bu(src0, coeff_vec);
- out = __lasx_xvssrarni_bu_h(res0, res0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- dst += stride;
- __lasx_xvstelm_d(out, dst, 0, 2);
- }
-}
-
-static av_always_inline void avc_chroma_vt_8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i src0, src1, src2, src3, src4, out;
- __m256i res0, res1;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- src += stride;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src1, src2, src3, src4);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
- src4, src3, 0x20, src0, src1, src2, src3);
- DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i out0, out1;
- __m256i res0, res1, res2, res3;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- src += stride;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
- src4, src3, 0x20, src0, src1, src2, src3);
- DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
- src8, src7, 0x20, src4, src5, src6, src7);
- DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
- src0, src2, src4, src6);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec,
- src6, coeff_vec, res0, res1, res2, res3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void copy_width8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride)
-{
- uint64_t tmp[8];
- ptrdiff_t stride_2, stride_3, stride_4;
- __asm__ volatile (
- "slli.d %[stride_2], %[stride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[stride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "ld.d %[tmp0], %[src], 0x0 \n\t"
- "ldx.d %[tmp1], %[src], %[stride] \n\t"
- "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
- "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "ld.d %[tmp4], %[src], 0x0 \n\t"
- "ldx.d %[tmp5], %[src], %[stride] \n\t"
- "ldx.d %[tmp6], %[src], %[stride_2] \n\t"
- "ldx.d %[tmp7], %[src], %[stride_3] \n\t"
-
- "st.d %[tmp0], %[dst], 0x0 \n\t"
- "stx.d %[tmp1], %[dst], %[stride] \n\t"
- "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
- "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
- "add.d %[dst], %[dst], %[stride_4] \n\t"
- "st.d %[tmp4], %[dst], 0x0 \n\t"
- "stx.d %[tmp5], %[dst], %[stride] \n\t"
- "stx.d %[tmp6], %[dst], %[stride_2] \n\t"
- "stx.d %[tmp7], %[dst], %[stride_3] \n\t"
- : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
- [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
- [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
- [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]),
- [dst]"+&r"(dst), [src]"+&r"(src),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
- [stride_4]"=&r"(stride_4)
- : [stride]"r"(stride)
- : "memory"
- );
-}
-
-static av_always_inline void copy_width8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride)
-{
- uint64_t tmp[4];
- ptrdiff_t stride_2, stride_3;
- __asm__ volatile (
- "slli.d %[stride_2], %[stride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[stride] \n\t"
- "ld.d %[tmp0], %[src], 0x0 \n\t"
- "ldx.d %[tmp1], %[src], %[stride] \n\t"
- "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
- "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
-
- "st.d %[tmp0], %[dst], 0x0 \n\t"
- "stx.d %[tmp1], %[dst], %[stride] \n\t"
- "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
- "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
- : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
- [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3)
- : [stride]"r"(stride), [dst]"r"(dst), [src]"r"(src)
- : "memory"
- );
-}
-
-static void avc_chroma_hv_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1,
- int32_t height)
-{
- if (4 == height) {
- avc_chroma_hv_8x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- } else if (8 == height) {
- avc_chroma_hv_8x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- }
-}
-
-static void avc_chroma_hv_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1)
-{
- ptrdiff_t stride_2 = stride << 1;
- __m256i src0, src1, src2;
- __m256i res_hz, res_vt;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
- __m256i coeff_vt_vec = __lasx_xvpermi_q(coeff_vt_vec1, coeff_vt_vec0, 0x02);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
- DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src0, src1);
- src0 = __lasx_xvpermi_q(src0, src1, 0x02);
- res_hz = __lasx_xvdp2_h_bu(src0, coeff_hz_vec);
- res_vt = __lasx_xvmul_h(res_hz, coeff_vt_vec);
- res_hz = __lasx_xvpermi_q(res_hz, res_vt, 0x01);
- res_vt = __lasx_xvadd_h(res_hz, res_vt);
- res_vt = __lasx_xvssrarni_bu_h(res_vt, res_vt, 6);
- __lasx_xvstelm_w(res_vt, dst, 0, 0);
- __lasx_xvstelm_w(res_vt, dst + stride, 0, 1);
-}
-
-static void avc_chroma_hv_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4;
- __m256i res_hz0, res_hz1, res_vt0, res_vt1;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
- src4, src3, mask, src0, src1, src2, src3);
- DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src0, src1);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
- DUP2_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
- res_hz0 = __lasx_xvadd_h(res_vt0, res_vt1);
- res_hz0 = __lasx_xvssrarni_bu_h(res_hz0, res_hz0, 6);
- __lasx_xvstelm_w(res_hz0, dst, 0, 0);
- __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
- __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_hv_4x8_lasx(const uint8_t *src, uint8_t * dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i res_hz0, res_hz1, res_hz2, res_hz3;
- __m256i res_vt0, res_vt1, res_vt2, res_vt3;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- src += stride_4;
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
- src4, src3, mask, src0, src1, src2, src3);
- DUP4_ARG3(__lasx_xvshuf_b, src5, src4, mask, src6, src5, mask, src7, src6, mask,
- src8, src7, mask, src4, src5, src6, src7);
- DUP4_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src4, src6, 0x02,
- src5, src7, 0x02, src0, src1, src4, src5);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src4, coeff_hz_vec,
- src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
- DUP4_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
- coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
- DUP2_ARG2(__lasx_xvadd_h, res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2);
- res_hz0 = __lasx_xvssrarni_bu_h(res_vt2, res_vt0, 6);
- __lasx_xvstelm_w(res_hz0, dst, 0, 0);
- __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
- __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
- dst += stride_4;
- __lasx_xvstelm_w(res_hz0, dst, 0, 2);
- __lasx_xvstelm_w(res_hz0, dst + stride, 0, 3);
- __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 6);
- __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_hv_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coef_hor0, uint32_t coef_hor1,
- uint32_t coef_ver0, uint32_t coef_ver1,
- int32_t height)
-{
- if (8 == height) {
- avc_chroma_hv_4x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- } else if (4 == height) {
- avc_chroma_hv_4x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- } else if (2 == height) {
- avc_chroma_hv_4x2_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
- coef_ver1);
- }
-}
-
-static void avc_chroma_hz_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- __m256i src0, src1;
- __m256i res, mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- src1 = __lasx_xvldx(src, stride);
- src0 = __lasx_xvshuf_b(src1, src0, mask);
- res = __lasx_xvdp2_h_bu(src0, coeff_vec);
- res = __lasx_xvslli_h(res, 3);
- res = __lasx_xvssrarni_bu_h(res, res, 6);
- __lasx_xvstelm_w(res, dst, 0, 0);
- __lasx_xvstelm_w(res, dst + stride, 0, 1);
-}
-
-static void avc_chroma_hz_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- __m256i src0, src1, src2, src3;
- __m256i res, mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
- src3 = __lasx_xvldx(src, stride_3);
- DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src0, src2);
- src0 = __lasx_xvpermi_q(src0, src2, 0x02);
- res = __lasx_xvdp2_h_bu(src0, coeff_vec);
- res = __lasx_xvslli_h(res, 3);
- res = __lasx_xvssrarni_bu_h(res, res, 6);
- __lasx_xvstelm_w(res, dst, 0, 0);
- __lasx_xvstelm_w(res, dst + stride, 0, 1);
- __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_hz_4x8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7;
- __m256i res0, res1, mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- src += stride_4;
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src5, src6);
- src7 = __lasx_xvldx(src, stride_3);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
- src7, src6, mask, src0, src2, src4, src6);
- DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src4, src6, 0x02, src0, src4);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src4, coeff_vec, res0, res1);
- res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_w(res0, dst, 0, 0);
- __lasx_xvstelm_w(res0, dst + stride, 0, 1);
- __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
- dst += stride_4;
- __lasx_xvstelm_w(res0, dst, 0, 2);
- __lasx_xvstelm_w(res0, dst + stride, 0, 3);
- __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
- __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_hz_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1,
- int32_t height)
-{
- if (8 == height) {
- avc_chroma_hz_4x8_lasx(src, dst, stride, coeff0, coeff1);
- } else if (4 == height) {
- avc_chroma_hz_4x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (2 == height) {
- avc_chroma_hz_4x2_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void avc_chroma_hz_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1,
- int32_t height)
-{
- if (4 == height) {
- avc_chroma_hz_8x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (8 == height) {
- avc_chroma_hz_8x8_lasx(src, dst, stride, coeff0, coeff1);
- } else {
- avc_chroma_hz_nonmult_lasx(src, dst, stride, coeff0, coeff1, height);
- }
-}
-
-static void avc_chroma_vt_4x2_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- __m256i src0, src1, src2;
- __m256i tmp0, tmp1;
- __m256i res;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- src0 = __lasx_xvld(src, 0);
- DUP2_ARG2(__lasx_xvldx, src, stride, src, stride << 1, src1, src2);
- DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, tmp0, tmp1);
- tmp0 = __lasx_xvilvl_d(tmp1, tmp0);
- res = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
- res = __lasx_xvslli_h(res, 3);
- res = __lasx_xvssrarni_bu_h(res, res, 6);
- __lasx_xvstelm_w(res, dst, 0, 0);
- __lasx_xvstelm_w(res, dst + stride, 0, 1);
-}
-
-static void avc_chroma_vt_4x4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4;
- __m256i tmp0, tmp1, tmp2, tmp3;
- __m256i res;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- src0 = __lasx_xvld(src, 0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
- tmp0, tmp1, tmp2, tmp3);
- DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
- tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
- res = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
- res = __lasx_xvslli_h(res, 3);
- res = __lasx_xvssrarni_bu_h(res, res, 6);
- __lasx_xvstelm_w(res, dst, 0, 0);
- __lasx_xvstelm_w(res, dst + stride, 0, 1);
- __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
-}
-
-static void avc_chroma_vt_4x8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1)
-{
- ptrdiff_t stride_2 = stride << 1;
- ptrdiff_t stride_3 = stride_2 + stride;
- ptrdiff_t stride_4 = stride_2 << 1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m256i res0, res1;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src1, src2, src3, src4);
- src += stride_4;
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
- src, stride_4, src5, src6, src7, src8);
- DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
- tmp0, tmp1, tmp2, tmp3);
- DUP4_ARG2(__lasx_xvilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
- tmp4, tmp5, tmp6, tmp7);
- DUP4_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
- tmp0, tmp2, tmp4, tmp6);
- tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
- tmp4 = __lasx_xvpermi_q(tmp4, tmp6, 0x02);
- DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, coeff_vec, tmp4, coeff_vec, res0, res1);
- res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
- __lasx_xvstelm_w(res0, dst, 0, 0);
- __lasx_xvstelm_w(res0, dst + stride, 0, 1);
- __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
- __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
- dst += stride_4;
- __lasx_xvstelm_w(res0, dst, 0, 2);
- __lasx_xvstelm_w(res0, dst + stride, 0, 3);
- __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
- __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
-}
-
-static void avc_chroma_vt_4w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1,
- int32_t height)
-{
- if (8 == height) {
- avc_chroma_vt_4x8_lasx(src, dst, stride, coeff0, coeff1);
- } else if (4 == height) {
- avc_chroma_vt_4x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (2 == height) {
- avc_chroma_vt_4x2_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void avc_chroma_vt_8w_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- uint32_t coeff0, uint32_t coeff1,
- int32_t height)
-{
- if (4 == height) {
- avc_chroma_vt_8x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (8 == height) {
- avc_chroma_vt_8x8_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void copy_width4_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t height)
-{
- uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
-
- if (8 == height) {
- ptrdiff_t stride_2, stride_3, stride_4;
-
- __asm__ volatile (
- "slli.d %[stride_2], %[stride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[stride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "ld.wu %[tp0], %[src], 0 \n\t"
- "ldx.wu %[tp1], %[src], %[stride] \n\t"
- "ldx.wu %[tp2], %[src], %[stride_2] \n\t"
- "ldx.wu %[tp3], %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "ld.wu %[tp4], %[src], 0 \n\t"
- "ldx.wu %[tp5], %[src], %[stride] \n\t"
- "ldx.wu %[tp6], %[src], %[stride_2] \n\t"
- "ldx.wu %[tp7], %[src], %[stride_3] \n\t"
- "st.w %[tp0], %[dst], 0 \n\t"
- "stx.w %[tp1], %[dst], %[stride] \n\t"
- "stx.w %[tp2], %[dst], %[stride_2] \n\t"
- "stx.w %[tp3], %[dst], %[stride_3] \n\t"
- "add.d %[dst], %[dst], %[stride_4] \n\t"
- "st.w %[tp4], %[dst], 0 \n\t"
- "stx.w %[tp5], %[dst], %[stride] \n\t"
- "stx.w %[tp6], %[dst], %[stride_2] \n\t"
- "stx.w %[tp7], %[dst], %[stride_3] \n\t"
- : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), [stride_4]"+&r"(stride_4),
- [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
- [tp2]"+&r"(tp2), [tp3]"+&r"(tp3), [tp4]"+&r"(tp4), [tp5]"+&r"(tp5),
- [tp6]"+&r"(tp6), [tp7]"+&r"(tp7)
- : [stride]"r"(stride)
- : "memory"
- );
- } else if (4 == height) {
- ptrdiff_t stride_2, stride_3;
-
- __asm__ volatile (
- "slli.d %[stride_2], %[stride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[stride] \n\t"
- "ld.wu %[tp0], %[src], 0 \n\t"
- "ldx.wu %[tp1], %[src], %[stride] \n\t"
- "ldx.wu %[tp2], %[src], %[stride_2] \n\t"
- "ldx.wu %[tp3], %[src], %[stride_3] \n\t"
- "st.w %[tp0], %[dst], 0 \n\t"
- "stx.w %[tp1], %[dst], %[stride] \n\t"
- "stx.w %[tp2], %[dst], %[stride_2] \n\t"
- "stx.w %[tp3], %[dst], %[stride_3] \n\t"
- : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3),
- [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
- [tp2]"+&r"(tp2), [tp3]"+&r"(tp3)
- : [stride]"r"(stride)
- : "memory"
- );
- } else if (2 == height) {
- __asm__ volatile (
- "ld.wu %[tp0], %[src], 0 \n\t"
- "ldx.wu %[tp1], %[src], %[stride] \n\t"
- "st.w %[tp0], %[dst], 0 \n\t"
- "stx.w %[tp1], %[dst], %[stride] \n\t"
- : [tp0]"+&r"(tp0), [tp1]"+&r"(tp1)
- : [src]"r"(src), [dst]"r"(dst), [stride]"r"(stride)
- : "memory"
- );
- }
-}
-
-static void copy_width8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t height)
-{
- if (8 == height) {
- copy_width8x8_lasx(src, dst, stride);
- } else if (4 == height) {
- copy_width8x4_lasx(src, dst, stride);
- }
-}
-
-void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int height, int x, int y)
-{
- av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
- if(x && y) {
- avc_chroma_hv_4w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
- } else if (x) {
- avc_chroma_hz_4w_lasx(src, dst, stride, x, (8 - x), height);
- } else if (y) {
- avc_chroma_vt_4w_lasx(src, dst, stride, y, (8 - y), height);
- } else {
- copy_width4_lasx(src, dst, stride, height);
- }
-}
-
-void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int height, int x, int y)
-{
- av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
- if (!(x || y)) {
- copy_width8_lasx(src, dst, stride, height);
- } else if (x && y) {
- avc_chroma_hv_8w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
- } else if (x) {
- avc_chroma_hz_8w_lasx(src, dst, stride, x, (8 - x), height);
- } else {
- avc_chroma_vt_8w_lasx(src, dst, stride, y, (8 - y), height);
- }
-}
-
-static av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
- uint32_t coef_hor1, uint32_t coef_ver0,
- uint32_t coef_ver1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3;
- __m256i src0, src1, src2, src3, src4, out;
- __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
- src0 = __lasx_xvshuf_b(src0, src0, mask);
- DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
- res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
- res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
- res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
- res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
- res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
- res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
- res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
- out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out = __lasx_xvavgr_bu(out, tp0);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
- uint32_t coef_hor1, uint32_t coef_ver0,
- uint32_t coef_ver1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3, dst0, dst1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i out0, out1;
- __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
- __m256i res_vt0, res_vt1, res_vt2, res_vt3;
- __m256i mask;
- __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
- __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
- __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
- __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
- __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
-
- DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
- src += stride;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
- src8, src7, 0x20, src1, src3, src5, src7);
- src0 = __lasx_xvshuf_b(src0, src0, mask);
- DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
- src7, mask, src1, src3, src5, src7);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
- coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
- res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
- res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
- res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
- res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
- res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
- res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
- res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
- res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
- res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
- res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
- res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
- res_vt2 = __lasx_xvmadd_h(res_vt2, res_hz2, coeff_vt_vec1);
- res_vt3 = __lasx_xvmadd_h(res_vt3, res_hz3, coeff_vt_vec1);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6,
- out0, out1);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- dst -= stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out0 = __lasx_xvavgr_bu(out0, dst0);
- out1 = __lasx_xvavgr_bu(out1, dst1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_and_aver_dst_8x4_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- __m256i tp0, tp1, tp2, tp3;
- __m256i src0, src1, src2, src3, out;
- __m256i res0, res1;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- mask = __lasx_xvld(chroma_mask_arr, 0);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src0, src1, src2, src3);
- DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
- DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out = __lasx_xvavgr_bu(out, tp0);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3, dst0, dst1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7;
- __m256i out0, out1;
- __m256i res0, res1, res2, res3;
- __m256i mask;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- mask = __lasx_xvld(chroma_mask_arr, 0);
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src0, src1, src2, src3);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src4, src5, src6, src7);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
- src7, src6, 0x20, src0, src2, src4, src6);
- DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4,
- mask, src6, src6, mask, src0, src2, src4, src6);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
- coeff_vec, res0, res1, res2, res3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- dst -= stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out0 = __lasx_xvavgr_bu(out0, dst0);
- out1 = __lasx_xvavgr_bu(out1, dst1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_and_aver_dst_8x4_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3;
- __m256i src0, src1, src2, src3, src4, out;
- __m256i res0, res1;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
- src1, src2, src3, src4);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
- src4, src3, 0x20, src0, src1, src2, src3);
- DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
- DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
- out = __lasx_xvssrarni_bu_h(res1, res0, 6);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out = __lasx_xvavgr_bu(out, tp0);
- __lasx_xvstelm_d(out, dst, 0, 0);
- __lasx_xvstelm_d(out, dst + stride, 0, 2);
- __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(const uint8_t *src,
- uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1)
-{
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
- __m256i tp0, tp1, tp2, tp3, dst0, dst1;
- __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
- __m256i out0, out1;
- __m256i res0, res1, res2, res3;
- __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
- __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
- __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
-
- coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
- src0 = __lasx_xvld(src, 0);
- src += stride;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src1, src2, src3, src4);
- src += stride_4x;
- DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
- src5, src6, src7, src8);
- DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
- src4, src3, 0x20, src0, src1, src2, src3);
- DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
- src8, src7, 0x20, src4, src5, src6, src7);
- DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
- src0, src2, src4, src6);
- DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
- coeff_vec, res0, res1, res2, res3);
- DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- dst += stride_4x;
- DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
- tp0, tp1, tp2, tp3);
- dst -= stride_4x;
- DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
- dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
- out0 = __lasx_xvavgr_bu(out0, dst0);
- out1 = __lasx_xvavgr_bu(out1, dst1);
- __lasx_xvstelm_d(out0, dst, 0, 0);
- __lasx_xvstelm_d(out0, dst + stride, 0, 2);
- __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
- dst += stride_4x;
- __lasx_xvstelm_d(out1, dst, 0, 0);
- __lasx_xvstelm_d(out1, dst + stride, 0, 2);
- __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
- __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avg_width8x8_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride)
-{
- __m256i src0, src1, src2, src3;
- __m256i dst0, dst1, dst2, dst3;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
- ptrdiff_t stride_4x = stride << 2;
-
- src0 = __lasx_xvldrepl_d(src, 0);
- src1 = __lasx_xvldrepl_d(src + stride, 0);
- src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
- src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
- dst0 = __lasx_xvldrepl_d(dst, 0);
- dst1 = __lasx_xvldrepl_d(dst + stride, 0);
- dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
- dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
- src0 = __lasx_xvpackev_d(src1,src0);
- src2 = __lasx_xvpackev_d(src3,src2);
- src0 = __lasx_xvpermi_q(src0, src2, 0x02);
- dst0 = __lasx_xvpackev_d(dst1,dst0);
- dst2 = __lasx_xvpackev_d(dst3,dst2);
- dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
- dst0 = __lasx_xvavgr_bu(src0, dst0);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-
- src += stride_4x;
- dst += stride_4x;
- src0 = __lasx_xvldrepl_d(src, 0);
- src1 = __lasx_xvldrepl_d(src + stride, 0);
- src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
- src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
- dst0 = __lasx_xvldrepl_d(dst, 0);
- dst1 = __lasx_xvldrepl_d(dst + stride, 0);
- dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
- dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
- src0 = __lasx_xvpackev_d(src1,src0);
- src2 = __lasx_xvpackev_d(src3,src2);
- src0 = __lasx_xvpermi_q(src0, src2, 0x02);
- dst0 = __lasx_xvpackev_d(dst1,dst0);
- dst2 = __lasx_xvpackev_d(dst3,dst2);
- dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
- dst0 = __lasx_xvavgr_bu(src0, dst0);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static av_always_inline void avg_width8x4_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride)
-{
- __m256i src0, src1, src2, src3;
- __m256i dst0, dst1, dst2, dst3;
- ptrdiff_t stride_2x = stride << 1;
- ptrdiff_t stride_3x = stride_2x + stride;
-
- src0 = __lasx_xvldrepl_d(src, 0);
- src1 = __lasx_xvldrepl_d(src + stride, 0);
- src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
- src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
- dst0 = __lasx_xvldrepl_d(dst, 0);
- dst1 = __lasx_xvldrepl_d(dst + stride, 0);
- dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
- dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
- src0 = __lasx_xvpackev_d(src1,src0);
- src2 = __lasx_xvpackev_d(src3,src2);
- src0 = __lasx_xvpermi_q(src0, src2, 0x02);
- dst0 = __lasx_xvpackev_d(dst1,dst0);
- dst2 = __lasx_xvpackev_d(dst3,dst2);
- dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
- dst0 = __lasx_xvavgr_bu(src0, dst0);
- __lasx_xvstelm_d(dst0, dst, 0, 0);
- __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
- __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
- __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
-}
-
-static void avc_chroma_hv_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride,
- uint32_t coef_hor0,
- uint32_t coef_hor1,
- uint32_t coef_ver0,
- uint32_t coef_ver1,
- int32_t height)
-{
- if (4 == height) {
- avc_chroma_hv_and_aver_dst_8x4_lasx(src, dst, stride, coef_hor0,
- coef_hor1, coef_ver0, coef_ver1);
- } else if (8 == height) {
- avc_chroma_hv_and_aver_dst_8x8_lasx(src, dst, stride, coef_hor0,
- coef_hor1, coef_ver0, coef_ver1);
- }
-}
-
-static void avc_chroma_hz_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1, int32_t height)
-{
- if (4 == height) {
- avc_chroma_hz_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (8 == height) {
- avc_chroma_hz_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void avc_chroma_vt_and_aver_dst_8w_lasx(const uint8_t *src, uint8_t *dst,
- ptrdiff_t stride, uint32_t coeff0,
- uint32_t coeff1, int32_t height)
-{
- if (4 == height) {
- avc_chroma_vt_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1);
- } else if (8 == height) {
- avc_chroma_vt_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1);
- }
-}
-
-static void avg_width8_lasx(const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
- int32_t height)
-{
- if (8 == height) {
- avg_width8x8_lasx(src, dst, stride);
- } else if (4 == height) {
- avg_width8x4_lasx(src, dst, stride);
- }
-}
-
-void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int height, int x, int y)
-{
- av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
-
- if (!(x || y)) {
- avg_width8_lasx(src, dst, stride, height);
- } else if (x && y) {
- avc_chroma_hv_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), y,
- (8 - y), height);
- } else if (x) {
- avc_chroma_hz_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), height);
- } else {
- avc_chroma_vt_and_aver_dst_8w_lasx(src, dst, stride, y, (8 - y), height);
- }
-}
diff --git a/libavcodec/loongarch/h264chroma_lasx.h b/libavcodec/loongarch/h264chroma_lasx.h
deleted file mode 100644
index 633752035e..0000000000
--- a/libavcodec/loongarch/h264chroma_lasx.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_LOONGARCH_H264CHROMA_LASX_H
-#define AVCODEC_LOONGARCH_H264CHROMA_LASX_H
-
-#include <stdint.h>
-#include <stddef.h>
-#include "libavcodec/h264.h"
-
-void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int h, int x, int y);
-void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int h, int x, int y);
-void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
- int h, int x, int y);
-
-#endif /* AVCODEC_LOONGARCH_H264CHROMA_LASX_H */
diff --git a/libavcodec/loongarch/h264chroma_loongarch.h b/libavcodec/loongarch/h264chroma_loongarch.h
new file mode 100644
index 0000000000..e65fcfe9f3
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma_loongarch.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
+
+#include "libavcodec/h264.h"
+
+void ff_put_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
+ long int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
+ long int stride, int h, int x, int y);
+void ff_put_h264_chroma_mc4_lsx(unsigned char *dst, const unsigned char *src,
+ long int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc4_lasx(unsigned char *dst, const unsigned char *src,
+ long int stride, int h, int x, int y);
+void ff_put_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
+ long int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
+ long int stride, int h, int x, int y);
+
+#endif /* AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H */
diff --git a/libavcodec/loongarch/h264intrapred.S b/libavcodec/loongarch/h264intrapred.S
new file mode 100644
index 0000000000..a03f467b6e
--- /dev/null
+++ b/libavcodec/loongarch/h264intrapred.S
@@ -0,0 +1,299 @@
+/*
+ * Loongson LSX optimized h264intrapred
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+const shufa
+.byte 6, 5, 4, 3, 2, 1, 0
+endconst
+
+const mulk
+.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
+endconst
+
+const mulh
+.byte 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
+.byte 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0
+endconst
+
+.macro PRED16X16_PLANE
+ slli.d t6, a1, 1
+ slli.d t4, a1, 3
+ addi.d t0, a0, 7
+ sub.d t0, t0, a1
+ add.d t1, a0, t4
+ addi.d t1, t1, -1
+ sub.d t2, t1, t6
+
+ ld.bu t3, t0, 1
+ ld.bu t4, t0, -1
+ ld.bu t5, t1, 0
+ ld.bu t7, t2, 0
+ sub.d t3, t3, t4
+ sub.d t4, t5, t7
+
+ la.local t5, mulk
+ vld vr0, t5, 0
+ fld.d f1, t0, 2
+ fld.d f2, t0, -8
+ la.local t5, shufa
+ fld.d f3, t5, 0
+ vshuf.b vr2, vr2, vr2, vr3
+ vilvl.b vr1, vr1, vr2
+ vhsubw.hu.bu vr1, vr1, vr1
+ vmul.h vr0, vr0, vr1
+ vhaddw.w.h vr1, vr0, vr0
+ vhaddw.d.w vr0, vr1, vr1
+ vhaddw.q.d vr1, vr0, vr0
+ vpickve2gr.w t5, vr1, 0
+ add.d t3, t3, t5
+//2
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ldx.bu t7, t1, a1
+ sub.d t5, t7, t8
+ slli.d t5, t5, 1
+
+//3&4
+ add.d t1, t1, t6
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ld.bu t7, t1, 0
+ sub.d t7, t7, t8
+ slli.d t8, t7, 1
+ add.d t7, t7, t8
+ add.d t5, t5, t7
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ldx.bu t7, t1, a1
+ sub.d t7, t7, t8
+ slli.d t7, t7, 2
+ add.d t5, t5, t7
+
+//5&6
+ add.d t1, t1, t6
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ld.bu t7, t1, 0
+ sub.d t7, t7, t8
+ slli.d t8, t7, 2
+ add.d t7, t7, t8
+ add.d t5, t5, t7
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ldx.bu t7, t1, a1
+ sub.d t7, t7, t8
+ slli.d t8, t7, 1
+ slli.d t7, t7, 2
+ add.d t7, t7, t8
+ add.d t5, t5, t7
+
+//7&8
+ add.d t1, t1, t6
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ld.bu t7, t1, 0
+ sub.d t7, t7, t8
+ slli.d t8, t7, 3
+ sub.d t7, t8, t7
+ add.d t5, t5, t7
+ sub.d t2, t2, a1
+ ld.bu t8, t2, 0
+ ldx.bu t7, t1, a1
+ sub.d t7, t7, t8
+ slli.d t7, t7, 3
+ add.d t5, t5, t7
+ add.d t4, t4, t5
+ add.d t1, t1, a1
+.endm
+
+.macro PRED16X16_PLANE_END
+ ld.bu t7, t1, 0
+ ld.bu t8, t2, 16
+ add.d t5, t7, t8
+ addi.d t5, t5, 1
+ slli.d t5, t5, 4
+ add.d t7, t3, t4
+ slli.d t8, t7, 3
+ sub.d t7, t8, t7
+ sub.d t5, t5, t7
+
+ la.local t8, mulh
+ vld vr3, t8, 0
+ slli.d t8, t3, 3
+ vreplgr2vr.h vr4, t3
+ vreplgr2vr.h vr9, t8
+ vmul.h vr5, vr3, vr4
+
+.rept 16
+ move t7, t5
+ add.d t5, t5, t4
+ vreplgr2vr.h vr6, t7
+ vadd.h vr7, vr6, vr5
+ vadd.h vr8, vr9, vr7
+ vssrani.bu.h vr8, vr7, 5
+ vst vr8, a0, 0
+ add.d a0, a0, a1
+.endr
+.endm
+
+.macro PRED16X16_PLANE_END_LASX
+ ld.bu t7, t1, 0
+ ld.bu t8, t2, 16
+ add.d t5, t7, t8
+ addi.d t5, t5, 1
+ slli.d t5, t5, 4
+ add.d t7, t3, t4
+ slli.d t8, t7, 3
+ sub.d t7, t8, t7
+ sub.d t5, t5, t7
+
+ la.local t8, mulh
+ xvld xr3, t8, 0
+ xvreplgr2vr.h xr4, t3
+ xvmul.h xr5, xr3, xr4
+
+.rept 8
+ move t7, t5
+ add.d t5, t5, t4
+ xvreplgr2vr.h xr6, t7
+ xvreplgr2vr.h xr8, t5
+ add.d t5, t5, t4
+ xvadd.h xr7, xr6, xr5
+ xvadd.h xr9, xr8, xr5
+
+ xvssrani.bu.h xr9, xr7, 5
+ vstelm.d vr9, a0, 0, 0
+ xvstelm.d xr9, a0, 8, 2
+ add.d a0, a0, a1
+ vstelm.d vr9, a0, 0, 1
+ xvstelm.d xr9, a0, 8, 3
+ add.d a0, a0, a1
+.endr
+.endm
+
+/* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_h264_8_lsx
+ PRED16X16_PLANE
+
+ slli.d t7, t3, 2
+ add.d t3, t3, t7
+ addi.d t3, t3, 32
+ srai.d t3, t3, 6
+ slli.d t7, t4, 2
+ add.d t4, t4, t7
+ addi.d t4, t4, 32
+ srai.d t4, t4, 6
+
+ PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_rv40_8_lsx
+ PRED16X16_PLANE
+
+ srai.d t7, t3, 2
+ add.d t3, t3, t7
+ srai.d t3, t3, 4
+ srai.d t7, t4, 2
+ add.d t4, t4, t7
+ srai.d t4, t4, 4
+
+ PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_svq3_8_lsx
+ PRED16X16_PLANE
+
+ li.d t6, 4
+ li.d t7, 5
+ li.d t8, 16
+ div.d t3, t3, t6
+ mul.d t3, t3, t7
+ div.d t3, t3, t8
+ div.d t4, t4, t6
+ mul.d t4, t4, t7
+ div.d t4, t4, t8
+ move t7, t3
+ move t3, t4
+ move t4, t7
+
+ PRED16X16_PLANE_END
+endfunc
+
+/* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_h264_8_lasx
+ PRED16X16_PLANE
+
+ slli.d t7, t3, 2
+ add.d t3, t3, t7
+ addi.d t3, t3, 32
+ srai.d t3, t3, 6
+ slli.d t7, t4, 2
+ add.d t4, t4, t7
+ addi.d t4, t4, 32
+ srai.d t4, t4, 6
+
+ PRED16X16_PLANE_END_LASX
+endfunc
+
+/* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_rv40_8_lasx
+ PRED16X16_PLANE
+
+ srai.d t7, t3, 2
+ add.d t3, t3, t7
+ srai.d t3, t3, 4
+ srai.d t7, t4, 2
+ add.d t4, t4, t7
+ srai.d t4, t4, 4
+
+ PRED16X16_PLANE_END_LASX
+endfunc
+
+/* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
+ */
+function ff_h264_pred16x16_plane_svq3_8_lasx
+ PRED16X16_PLANE
+
+ li.d t5, 4
+ li.d t7, 5
+ li.d t8, 16
+ div.d t3, t3, t5
+ mul.d t3, t3, t7
+ div.d t3, t3, t8
+ div.d t4, t4, t5
+ mul.d t4, t4, t7
+ div.d t4, t4, t8
+ move t7, t3
+ move t3, t4
+ move t4, t7
+
+ PRED16X16_PLANE_END_LASX
+endfunc
--
2.20.1
* [FFmpeg-devel] [PATCH v5 4/7] avcodec/la: Add LSX optimization for h264 qpel.
2023-05-25 7:24 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
` (2 preceding siblings ...)
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 3/7] avcodec/la: Add LSX optimization for h264 chroma and intrapred Hao Chen
@ 2023-05-25 7:24 ` Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 5/7] swscale/la: Optimize the functions of the swscale series with lsx Hao Chen
` (2 subsequent siblings)
6 siblings, 0 replies; 17+ messages in thread
From: Hao Chen @ 2023-05-25 7:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: yuanhecai
From: yuanhecai <yuanhecai@loongson.cn>
./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 214fps
after: 274fps
---
libavcodec/loongarch/Makefile | 2 +
libavcodec/loongarch/h264qpel.S | 1686 +++++++++++++++++
.../loongarch/h264qpel_init_loongarch.c | 74 +-
libavcodec/loongarch/h264qpel_lasx.c | 401 +---
libavcodec/loongarch/h264qpel_lasx.h | 158 --
libavcodec/loongarch/h264qpel_loongarch.h | 312 +++
libavcodec/loongarch/h264qpel_lsx.c | 487 +++++
7 files changed, 2561 insertions(+), 559 deletions(-)
create mode 100644 libavcodec/loongarch/h264qpel.S
delete mode 100644 libavcodec/loongarch/h264qpel_lasx.h
create mode 100644 libavcodec/loongarch/h264qpel_loongarch.h
create mode 100644 libavcodec/loongarch/h264qpel_lsx.c
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index a563055161..06cfab5c20 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -31,5 +31,7 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
loongarch/h264idct_loongarch.o \
loongarch/h264dsp.o
+LSX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel.o \
+ loongarch/h264qpel_lsx.o
LSX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma.o
LSX-OBJS-$(CONFIG_H264PRED) += loongarch/h264intrapred.o
diff --git a/libavcodec/loongarch/h264qpel.S b/libavcodec/loongarch/h264qpel.S
new file mode 100644
index 0000000000..3f885b6ce2
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel.S
@@ -0,0 +1,1686 @@
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro VLD_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
+ vld vr0, \in4, 0
+ vldx vr1, \in4, a2
+ QPEL8_H_LSX \in0, \in1
+ vssrani.bu.h \in0, \in2, 5
+ vssrani.bu.h \in1, \in3, 5
+.endm
+
+.macro VLDX_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
+ vldx vr0, \in4, t1
+ vldx vr1, \in4, t2
+ QPEL8_H_LSX \in0, \in1
+ vssrani.bu.h \in0, \in2, 5
+ vssrani.bu.h \in1, \in3, 5
+.endm
+
+.macro VLD_DOUBLE_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+ vld vr0, \in8, 0
+ vldx vr1, \in8, a2
+ QPEL8_H_LSX \in0, \in1
+ vssrani.bu.h \in0, \in4, 5
+ vssrani.bu.h \in1, \in5, 5
+ vldx vr0, \in8, t1
+ vldx vr1, \in8, t2
+ QPEL8_H_LSX \in2, \in3
+ vssrani.bu.h \in2, \in6, 5
+ vssrani.bu.h \in3, \in7, 5
+.endm
+
+function ff_put_h264_qpel16_mc00_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ slli.d t2, t0, 1
+.rept 4
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ vstx vr2, a0, t0
+ vstx vr3, a0, t1
+ add.d a0, a0, t2
+.endr
+endfunc
+
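+// Horizontal 6-tap lowpass (H.264 half-pel filter 1, -5, 20, 20, -5, 1) on the
+// two source rows held in vr0/vr1, each loaded 2 bytes before its first output
+// pixel. Leaves halfwords
+//   20*(src[x] + src[x+1]) - 5*(src[x-1] + src[x+2]) + src[x-2] + src[x+3] + 16
+// in \out0/\out1; callers narrow them with vssrani.bu.h ..., 5.
+// vr20/vr21/vr22 must hold 20, 5 and 16.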
+.macro QPEL8_H_LSX out0, out1
+ vbsrl.v vr2, vr0, 1
+ vbsrl.v vr3, vr1, 1
+ vbsrl.v vr4, vr0, 2
+ vbsrl.v vr5, vr1, 2
+ vbsrl.v vr6, vr0, 3
+ vbsrl.v vr7, vr1, 3
+ vbsrl.v vr8, vr0, 4
+ vbsrl.v vr9, vr1, 4
+ vbsrl.v vr10, vr0, 5
+ vbsrl.v vr11, vr1, 5
+
+ vilvl.b vr6, vr4, vr6
+ vilvl.b vr7, vr5, vr7
+ vilvl.b vr8, vr2, vr8
+ vilvl.b vr9, vr3, vr9
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr1, vr11
+ vhaddw.hu.bu vr6, vr6, vr6
+ vhaddw.hu.bu vr7, vr7, vr7
+ vhaddw.hu.bu vr8, vr8, vr8
+ vhaddw.hu.bu vr9, vr9, vr9
+ vhaddw.hu.bu vr10, vr10, vr10
+ vhaddw.hu.bu vr11, vr11, vr11
+ vmul.h vr2, vr6, vr20
+ vmul.h vr3, vr7, vr20
+ vmul.h vr4, vr8, vr21
+ vmul.h vr5, vr9, vr21
+ vssub.h vr2, vr2, vr4
+ vssub.h vr3, vr3, vr5
+ vsadd.h vr2, vr2, vr10
+ vsadd.h vr3, vr3, vr11
+ vsadd.h \out0, vr2, vr22
+ vsadd.h \out1, vr3, vr22
+.endm
+
+.macro VLD_DOUBLE_QPEL8_H_LSX in0, in1, in2, in3, in4
+ vld vr0, \in4, 0
+ vldx vr1, \in4, a2
+ QPEL8_H_LSX \in0, \in1
+ vldx vr0, \in4, t1
+ vldx vr1, \in4, t2
+ QPEL8_H_LSX \in2, \in3
+.endm
+
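+// mc10/mc30: horizontal quarter-pel. Average the horizontal half-pel result
+// with the nearest full-pel column (src itself for mc10, src + 1 for mc30).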
+.macro put_h264_qpel16 in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+.ifc \in0, 10
+ addi.d t8, a1, 0
+.else
+ addi.d t8, a1, 1
+.endif
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ addi.d a1, t0, 8 // a1 = t0 + 8
+.rept 4
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+ vld vr10, t8, 0
+ vldx vr11, t8, a2
+ vavgr.bu vr0, vr2, vr10
+ vavgr.bu vr1, vr3, vr11
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1
+ vldx vr12, t8, t1
+ vldx vr13, t8, t2
+ vavgr.bu vr2, vr4, vr12
+ vavgr.bu vr3, vr5, vr13
+ vstx vr2, a0, t1
+ vstx vr3, a0, t2
+ alsl.d a0, a2, a0, 2
+ alsl.d t8, a2, t8, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+.endr
+endfunc
+.endm
+
+put_h264_qpel16 10
+put_h264_qpel16 30
+
+function ff_put_h264_qpel16_mc20_lsx
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ addi.d a1, t0, 8 // a1 = t0 + 8
+.rept 4
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+ vst vr2, a0, 0
+ vstx vr3, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1
+ vstx vr4, a0, t1
+ vstx vr5, a0, t2
+ alsl.d a0, a2, a0, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+.endr
+endfunc
+
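+// Vertical 6-tap lowpass over seven consecutive rows \in0..\in6 (16 pixels
+// wide). Produces two filtered rows, from the windows \in0..\in5 and
+// \in1..\in6, already rounded (+16), shifted (>> 5) and clipped to bytes in
+// vr13/vr14.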
+.macro QPEL8_V_LSX in0, in1, in2, in3, in4, in5, in6
+ vilvl.b vr7, \in3, \in2
+ vilvl.b vr8, \in4, \in3
+ vilvl.b vr9, \in4, \in1
+ vilvl.b vr10, \in5, \in2
+ vilvl.b vr11, \in5, \in0
+ vilvl.b vr12, \in6, \in1
+ vhaddw.hu.bu vr7, vr7, vr7
+ vhaddw.hu.bu vr8, vr8, vr8
+ vhaddw.hu.bu vr9, vr9, vr9
+ vhaddw.hu.bu vr10, vr10, vr10
+ vhaddw.hu.bu vr11, vr11, vr11
+ vhaddw.hu.bu vr12, vr12, vr12
+ vmul.h vr7, vr7, vr20
+ vmul.h vr8, vr8, vr20
+ vmul.h vr9, vr9, vr21
+ vmul.h vr10, vr10, vr21
+ vssub.h vr7, vr7, vr9
+ vssub.h vr8, vr8, vr10
+ vsadd.h vr7, vr7, vr11
+ vsadd.h vr8, vr8, vr12
+ vsadd.h vr7, vr7, vr22
+ vsadd.h vr8, vr8, vr22
+
+ vilvh.b vr13, \in3, \in2
+ vilvh.b vr14, \in4, \in3
+ vilvh.b vr15, \in4, \in1
+ vilvh.b vr16, \in5, \in2
+ vilvh.b vr17, \in5, \in0
+ vilvh.b vr18, \in6, \in1
+ vhaddw.hu.bu vr13, vr13, vr13
+ vhaddw.hu.bu vr14, vr14, vr14
+ vhaddw.hu.bu vr15, vr15, vr15
+ vhaddw.hu.bu vr16, vr16, vr16
+ vhaddw.hu.bu vr17, vr17, vr17
+ vhaddw.hu.bu vr18, vr18, vr18
+ vmul.h vr13, vr13, vr20
+ vmul.h vr14, vr14, vr20
+ vmul.h vr15, vr15, vr21
+ vmul.h vr16, vr16, vr21
+ vssub.h vr13, vr13, vr15
+ vssub.h vr14, vr14, vr16
+ vsadd.h vr13, vr13, vr17
+ vsadd.h vr14, vr14, vr18
+ vsadd.h vr13, vr13, vr22
+ vsadd.h vr14, vr14, vr22
+ vssrani.bu.h vr13, vr7, 5
+ vssrani.bu.h vr14, vr8, 5
+.endm
+
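+// mc01/mc03: vertical quarter-pel. Average the vertical half-pel rows with
+// the co-located full-pel row (mc01) or the row below it (mc03).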
+.macro put_h264_qpel16_mc1 in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ sub.d t2, a1, t0 // t2 = src - 2 * stride
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+
+ vld vr0, t2, 0
+ vldx vr1, t2, a2
+ vldx vr2, t2, t0
+ vldx vr3, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr4, t2, 0
+ vldx vr5, t2, a2
+ vldx vr6, t2, t0
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \in0, 01
+ vavgr.bu vr13, vr2, vr13
+ vavgr.bu vr14, vr3, vr14
+.else
+ vavgr.bu vr13, vr3, vr13
+ vavgr.bu vr14, vr4, vr14
+.endif
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+
+ vldx vr0, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr1, t2, 0
+ QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+.ifc \in0, 01
+ vavgr.bu vr13, vr4, vr13
+ vavgr.bu vr14, vr5, vr14
+.else
+ vavgr.bu vr13, vr5, vr13
+ vavgr.bu vr14, vr6, vr14
+.endif
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vldx vr2, t2, a2
+ vldx vr3, t2, t0
+ QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+.ifc \in0, 01
+ vavgr.bu vr13, vr6, vr13
+ vavgr.bu vr14, vr0, vr14
+.else
+ vavgr.bu vr13, vr0, vr13
+ vavgr.bu vr14, vr1, vr14
+.endif
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+
+ vldx vr4, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr5, t2, 0
+ QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+.ifc \in0, 01
+ vavgr.bu vr13, vr1, vr13
+ vavgr.bu vr14, vr2, vr14
+.else
+ vavgr.bu vr13, vr2, vr13
+ vavgr.bu vr14, vr3, vr14
+.endif
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vldx vr6, t2, a2
+ vldx vr0, t2, t0
+ QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
+.ifc \in0, 01
+ vavgr.bu vr13, vr3, vr13
+ vavgr.bu vr14, vr4, vr14
+.else
+ vavgr.bu vr13, vr4, vr13
+ vavgr.bu vr14, vr5, vr14
+.endif
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+
+ vldx vr1, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr2, t2, 0
+ QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
+.ifc \in0, 01
+ vavgr.bu vr13, vr5, vr13
+ vavgr.bu vr14, vr6, vr14
+.else
+ vavgr.bu vr13, vr6, vr13
+ vavgr.bu vr14, vr0, vr14
+.endif
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vldx vr3, t2, a2
+ vldx vr4, t2, t0
+ QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
+.ifc \in0, 01
+ vavgr.bu vr13, vr0, vr13
+ vavgr.bu vr14, vr1, vr14
+.else
+ vavgr.bu vr13, vr1, vr13
+ vavgr.bu vr14, vr2, vr14
+.endif
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+
+ vldx vr5, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr6, t2, 0
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \in0, 01
+ vavgr.bu vr13, vr2, vr13
+ vavgr.bu vr14, vr3, vr14
+.else
+ vavgr.bu vr13, vr3, vr13
+ vavgr.bu vr14, vr4, vr14
+.endif
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+endfunc
+.endm
+
+put_h264_qpel16_mc1 01
+put_h264_qpel16_mc1 03
+
+.macro VST_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+ QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
+ vavgr.bu vr13, \in7, vr13
+ vavgr.bu vr14, \in8, vr14
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+.endm
+
+.macro VSTX_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+ QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
+ vavgr.bu vr13, \in7, vr13
+ vavgr.bu vr14, \in8, vr14
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+.endm
+
+function ff_put_h264_qpel16_mc11_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ slli.d t6, t1, 1
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t0, a1, -2 // t0 = src - 2
+ addi.d a1, t0, 8 // a1 = t0 + 8
+.rept 2
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ alsl.d t0, a2, t0, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t0
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, a1
+
+ vld vr0, t4, 0 // t4 = src - 2 * stride
+ vldx vr1, t4, a2
+ vldx vr2, t4, t1
+ vldx vr3, t4, t2
+ alsl.d t4, a2, t4, 2 // t4 = src + 2 * stride
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t4, t2
+ alsl.d t4, a2, t4, 2 // t4 = src + 6 * stride
+ vld vr1, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t4, a2
+ vldx vr3, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t4, t2
+ alsl.d t4, a2, t4, 2 // t4 = src + 10 * stride
+ vld vr5, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+ alsl.d t0, a2, t0, 2
+ alsl.d a1, a2, a1, 2 // a1 = src + 8 * stride
+ alsl.d a0, a2, a0, 2 // dst = dst + 8 * stride
+ sub.d t4, t4, t6
+.endr
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+function ff_avg_h264_qpel16_mc00_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ slli.d t2, t0, 1
+ addi.d t3, a0, 0
+.rept 4
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vld vr8, t3, 0
+ vldx vr9, t3, a2
+ vldx vr10, t3, t0
+ vldx vr11, t3, t1
+ add.d t3, t3, t2
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr1, vr9, vr1
+ vavgr.bu vr2, vr10, vr2
+ vavgr.bu vr3, vr11, vr3
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ vstx vr2, a0, t0
+ vstx vr3, a0, t1
+ add.d a0, a0, t2
+.endr
+endfunc
+
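+// mc31/mc33: diagonal quarter-pel. dst is the average of the horizontal
+// half-pel plane (filtered at src for mc31, src + stride for mc33) and the
+// vertical half-pel plane filtered at src + 1.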
+.macro put_h264_qpel16_mc in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d t0, a1, -2 // t0 = src - 2
+
+.ifc \in0, 33
+ add.d t0, t0, a2
+.endif
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t4, t4, 1
+
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ alsl.d a1, a2, t0, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ addi.d a1, t0, 8
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, a1
+ vld vr0, t4, 0 // t4 = src - 2 * stride + 1
+ vldx vr1, t4, a2
+ vldx vr2, t4, t1
+ vldx vr3, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr1, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ add.d t6, t4, zero // t6 = src + 6 * stride
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t4, a2
+ vldx vr3, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr5, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+ alsl.d a1, a2, t0, 3 // a1 = src + 8 * stride
+ addi.d t5, a1, 8 // t5 = src + 8 * stride + 8
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, t5
+ alsl.d t5, a2, t5, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, t5
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ // t6 = src + 6 * stride + 1
+ vld vr0, t6, 0
+ vldx vr1, t6, a2
+ vldx vr2, t6, t1
+ vldx vr3, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr4, t6, 0
+ vldx vr5, t6, a2
+ vldx vr6, t6, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr1, t6, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5 ,vr6, vr0, vr1, vr25, vr26
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t6, a2
+ vldx vr3, t6, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr5, t6, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+.endm
+
+put_h264_qpel16_mc 33
+put_h264_qpel16_mc 31
+
+function ff_put_h264_qpel16_mc13_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t0, t0, a2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ alsl.d a1, a2, t0, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ addi.d a1, t0, 8
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, a1
+ vld vr0, t4, 0 // t4 = src - 2 * stride + 1
+ vldx vr1, t4, a2
+ vldx vr2, t4, t1
+ vldx vr3, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr1, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ add.d t6, t4, zero
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t4, a2
+ vldx vr3, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr5, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+ alsl.d a1, a2, t0, 3 // a1 = src + 8 * stride
+ addi.d t5, a1, 8 // t5 = src + 8 * stride + 8
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, t5
+ alsl.d t5, a2, t5, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, t5
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vld vr0, t6, 0 // t6 = src + 6 * stride + 1
+ vldx vr1, t6, a2
+ vldx vr2, t6, t1
+ vldx vr3, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr4, t6, 0
+ vldx vr5, t6, a2
+ vldx vr6, t6, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr1, t6, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t6, a2
+ vldx vr3, t6, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr5, t6, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+function ff_avg_h264_qpel16_mc10_lsx
+ addi.d t0, a0, 0 // t0 = dst
+ addi.d t4, a1, -2 // t4 = src - 2
+ addi.d t5, t4, 8
+ slli.d t1, a2, 1
+ add.d t2, a2, t1
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+.rept 2
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4
+ alsl.d t4, a2, t4, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vld vr12, t0, 0
+ vldx vr13, t0, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5
+ vldx vr0, a1, t1
+ vldx vr1, a1, t2
+ vldx vr12, t0, t1
+ vldx vr13, t0, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t5, a2, t5, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+ alsl.d a0, a2, a0, 2
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vld vr12, t0, 0
+ vldx vr13, t0, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5
+ vldx vr0, a1, t1
+ vldx vr1, a1, t2
+ vldx vr12, t0, t1
+ vldx vr13, t0, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t5, a2, t5, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+ alsl.d a0, a2, a0, 2
+ alsl.d t4, a2, t4, 2 // t4 = src + 8 * stride - 2
+.endr
+endfunc
+
+function ff_avg_h264_qpel16_mc30_lsx
+ addi.d t0, a0, 0 // t0 = dst
+ addi.d t4, a1, -2 // t1 = src - 2
+ addi.d t5, t4, 8
+ addi.d a1, a1, 1 // a1 = a1 + 1
+ slli.d t1, a2, 1
+ add.d t2, a2, t1
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+.rept 2
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4
+ alsl.d t4, a2, t4, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vld vr12, t0, 0
+ vldx vr13, t0, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5
+ vldx vr0, a1, t1
+ vldx vr1, a1, t2
+ vldx vr12, t0, t1
+ vldx vr13, t0, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t5, a2, t5, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+ alsl.d a0, a2, a0, 2
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vld vr12, t0, 0
+ vldx vr13, t0, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5
+ vldx vr0, a1, t1
+ vldx vr1, a1, t2
+ vldx vr12, t0, t1
+ vldx vr13, t0, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t5, a2, t5, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+ alsl.d a0, a2, a0, 2
+ alsl.d t4, a2, t4, 2 // t4 = src + 8 * stride - 2
+.endr
+endfunc
+
+function ff_put_h264_qpel16_mc02_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ sub.d t2, a1, t0 // t2 = src - 2 * stride
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+
+ vld vr0, t2, 0
+ vldx vr1, t2, a2
+ vldx vr2, t2, t0
+ vldx vr3, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr4, t2, 0
+ vldx vr5, t2, a2
+ vldx vr6, t2, t0
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr0, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr1, t2, 0
+ QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t2, a2
+ vldx vr3, t2, t0
+ QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr4, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr5, t2, 0
+ QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vldx vr6, t2, a2
+ vldx vr0, t2, t0
+ QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr1, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr2, t2, 0
+ QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr3, t2, a2
+ vldx vr4, t2, t0
+ QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr5, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr6, t2, 0
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+endfunc
+
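+// avg counterpart of the diagonal quarter-pel positions: the average of the
+// horizontal (vr23..vr30) and vertical half-pel planes is averaged once more
+// with the existing dst rows. Callers preset t0 (horizontal source), t4
+// (vertical source), t8 = dst and the stride temporaries t1/t2.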
+.macro avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ alsl.d a1, a2, t0, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ addi.d a1, t0, 8
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, a1
+ vld vr0, t4, 0 // t4 = src - 2 * stride + 1
+ vldx vr1, t4, a2
+ vldx vr2, t4, t1
+ vldx vr3, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t1
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+ vld vr0, t8, 0
+ vldx vr1, t8, a2
+ vavgr.bu vr13, vr23, vr13
+ vavgr.bu vr14, vr24, vr14
+ vavgr.bu vr13, vr13, vr0
+ vavgr.bu vr14, vr14, vr1
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr0, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr1, t4, 0
+ QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+ vldx vr2, t8, t1
+ vldx vr3, t8, t2
+ vavgr.bu vr13, vr25, vr13
+ vavgr.bu vr14, vr26, vr14
+ vavgr.bu vr13, vr13, vr2
+ vavgr.bu vr14, vr14, vr3
+ add.d t6, t4, zero // t6 = src + 6 * stride
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ alsl.d t8, a2, t8, 2
+ vldx vr2, t4, a2
+ vldx vr3, t4, t1
+ QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+ vld vr4, t8, 0
+ vldx vr5, t8, a2
+ vavgr.bu vr13, vr27, vr13
+ vavgr.bu vr14, vr28, vr14
+ vavgr.bu vr13, vr13, vr4
+ vavgr.bu vr14, vr14, vr5
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr4, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr5, t4, 0
+ QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+ vldx vr6, t8, t1
+ vldx vr0, t8, t2
+ vavgr.bu vr13, vr29, vr13
+ vavgr.bu vr14, vr30, vr14
+ vavgr.bu vr13, vr13, vr6
+ vavgr.bu vr14, vr14, vr0
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+ alsl.d a1, a2, t0, 3 // a1 = src + 8 * stride
+ addi.d t5, a1, 8 // t5 = src + 8 * stride + 8
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, t5
+ alsl.d t5, a2, t5, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, t5
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ alsl.d t8, a2, t8, 2
+ // t6 = src + 6 * stride + 1
+ vld vr0, t6, 0
+ vldx vr1, t6, a2
+ vldx vr2, t6, t1
+ vldx vr3, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr4, t6, 0
+ vldx vr5, t6, a2
+ vldx vr6, t6, t1
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+ vld vr0, t8, 0
+ vldx vr1, t8, a2
+ vavgr.bu vr13, vr23, vr13
+ vavgr.bu vr14, vr24, vr14
+ vavgr.bu vr13, vr13, vr0
+ vavgr.bu vr14, vr14, vr1
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr0, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr1, t6, 0
+ QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+ vldx vr2, t8, t1
+ vldx vr3, t8, t2
+ vavgr.bu vr13, vr25, vr13
+ vavgr.bu vr14, vr26, vr14
+ vavgr.bu vr13, vr13, vr2
+ vavgr.bu vr14, vr14, vr3
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ alsl.d t8, a2, t8, 2
+ vldx vr2, t6, a2
+ vldx vr3, t6, t1
+ QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+ vld vr4, t8, 0
+ vldx vr5, t8, a2
+ vavgr.bu vr13, vr27, vr13
+ vavgr.bu vr14, vr28, vr14
+ vavgr.bu vr13, vr13, vr4
+ vavgr.bu vr14, vr14, vr5
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr4, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr5, t6, 0
+ QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+ vldx vr6, t8, t1
+ vldx vr0, t8, t2
+ vavgr.bu vr13, vr29, vr13
+ vavgr.bu vr14, vr30, vr14
+ vavgr.bu vr13, vr13, vr6
+ vavgr.bu vr14, vr14, vr0
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+.endm
+
+function ff_avg_h264_qpel16_mc33_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t0, t0, a2 // t0 = src + stride - 2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t4, t4, 1
+ addi.d t8, a0, 0
+ avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc11_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t8, a0, 0
+ avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc31_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t4, t4, 1
+ addi.d t8, a0, 0
+ avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc13_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t0, t0, a2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t8, a0, 0
+ avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc20_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d t0, a1, -2 // t0 = src - 2
+ addi.d t5, a0, 0
+ addi.d a1, t0, 8
+.rept 4
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+ vld vr0, t5, 0
+ vldx vr1, t5, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ add.d a1, a1, t1
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, a1
+ vldx vr0, t5, t1
+ vldx vr1, t5, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t0, a2, t0, 2
+ alsl.d t5, a2, t5, 2
+ alsl.d a0, a2, a0, 2
+ alsl.d a1, a2, a1, 1
+.endr
+endfunc
+
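+// Horizontal pass of the 2-D lowpass: the same 6-tap filter as QPEL8_H_LSX but
+// without the +16 bias or narrowing, leaving 16-bit intermediates for the
+// vertical pass below.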
+.macro QPEL8_HV_H_LSX out0, out1
+ vbsrl.v vr2, vr0, 1
+ vbsrl.v vr3, vr1, 1
+ vbsrl.v vr4, vr0, 2
+ vbsrl.v vr5, vr1, 2
+ vbsrl.v vr6, vr0, 3
+ vbsrl.v vr7, vr1, 3
+ vbsrl.v vr8, vr0, 4
+ vbsrl.v vr9, vr1, 4
+ vbsrl.v vr10, vr0, 5
+ vbsrl.v vr11, vr1, 5
+ vilvl.b vr6, vr4, vr6
+ vilvl.b vr7, vr5, vr7
+ vilvl.b vr8, vr2, vr8
+ vilvl.b vr9, vr3, vr9
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr1, vr11
+ vhaddw.hu.bu vr6, vr6, vr6
+ vhaddw.hu.bu vr7, vr7, vr7
+ vhaddw.hu.bu vr8, vr8, vr8
+ vhaddw.hu.bu vr9, vr9, vr9
+ vhaddw.hu.bu vr10, vr10, vr10
+ vhaddw.hu.bu vr11, vr11, vr11
+ vmul.h vr2, vr6, vr20
+ vmul.h vr3, vr7, vr20
+ vmul.h vr4, vr8, vr21
+ vmul.h vr5, vr9, vr21
+ vssub.h vr2, vr2, vr4
+ vssub.h vr3, vr3, vr5
+ vsadd.h \out0, vr2, vr10
+ vsadd.h \out1, vr3, vr11
+.endm
+
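+// Vertical pass of the 2-D lowpass on seven rows of 16-bit intermediates
+// \in0..\in6: applies the 1, -5, 20, 20, -5, 1 taps in 32-bit precision with a
+// +512 bias and a >> 10 shift, packing the two resulting 8-pixel output rows
+// as bytes into \out3. vr22/vr23/vr24 must hold 20, 5 and 512 (word elements).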
+.macro QPEL8_HV_V_LSX in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3
+ vilvl.h vr0, \in2, \in3
+ vilvl.h vr1, \in3, \in4 // tmp0
+ vilvl.h vr2, \in1, \in4
+ vilvl.h vr3, \in2, \in5 // tmp2
+ vilvl.h vr4, \in0, \in5
+ vilvl.h vr5, \in1, \in6 // tmp4
+ vhaddw.w.h vr0, vr0, vr0
+ vhaddw.w.h vr1, vr1, vr1
+ vhaddw.w.h vr2, vr2, vr2
+ vhaddw.w.h vr3, vr3, vr3
+ vhaddw.w.h vr4, vr4, vr4
+ vhaddw.w.h vr5, vr5, vr5
+ vmul.w vr0, vr0, vr22
+ vmul.w vr1, vr1, vr22
+ vmul.w vr2, vr2, vr23
+ vmul.w vr3, vr3, vr23
+ vssub.w vr0, vr0, vr2
+ vssub.w vr1, vr1, vr3
+ vsadd.w vr0, vr0, vr4
+ vsadd.w vr1, vr1, vr5
+ vsadd.w \out0, vr0, vr24
+ vsadd.w \out1, vr1, vr24
+ vilvh.h vr0, \in2, \in3
+ vilvh.h vr1, \in3, \in4 // tmp0
+ vilvh.h vr2, \in1, \in4
+ vilvh.h vr3, \in2, \in5 // tmp2
+ vilvh.h vr4, \in0, \in5
+ vilvh.h vr5, \in1, \in6 // tmp4
+ vhaddw.w.h vr0, vr0, vr0
+ vhaddw.w.h vr1, vr1, vr1
+ vhaddw.w.h vr2, vr2, vr2
+ vhaddw.w.h vr3, vr3, vr3
+ vhaddw.w.h vr4, vr4, vr4
+ vhaddw.w.h vr5, vr5, vr5
+ vmul.w vr0, vr0, vr22
+ vmul.w vr1, vr1, vr22
+ vmul.w vr2, vr2, vr23
+ vmul.w vr3, vr3, vr23
+ vssub.w vr0, vr0, vr2
+ vssub.w vr1, vr1, vr3
+ vsadd.w vr0, vr0, vr4
+ vsadd.w vr1, vr1, vr5
+ vsadd.w \out2, vr0, vr24
+ vsadd.w \out3, vr1, vr24
+ vssrani.hu.w \out2, \out0, 10
+ vssrani.hu.w \out3, \out1, 10
+ vssrani.bu.h \out3, \out2, 0
+.endm
+
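+// 8x8 centre (hv) lowpass core. \in0 = src - 2 - 2*stride, \in1 = dst. Runs
+// QPEL8_HV_H_LSX down the source, keeping the last filtered 16-bit rows live
+// in vr12..vr19, and emits two output rows per QPEL8_HV_V_LSX call; the "avg"
+// type additionally averages each pair of rows with dst (t3, strides t5/t6).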
+.macro h264_qpel8_hv_lowpass_core_lsx in0, in1, type
+ vld vr0, \in0, 0
+ vldx vr1, \in0, a3
+ QPEL8_HV_H_LSX vr12, vr13 // rows a, b
+ vldx vr0, \in0, t1
+ vldx vr1, \in0, t2
+ QPEL8_HV_H_LSX vr14, vr15 // rows c, d
+
+ alsl.d \in0, a3, \in0, 2
+
+ vld vr0, \in0, 0
+ vldx vr1, \in0, a3
+ QPEL8_HV_H_LSX vr16, vr17 // rows e, f
+ vldx vr0, \in0, t1
+ vldx vr1, \in0, t2
+ QPEL8_HV_H_LSX vr18, vr19 // rows g, h
+ QPEL8_HV_V_LSX vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr6, vr7, vr0, vr1
+.ifc \type, avg
+ fld.d f2, t3, 0
+ fldx.d f3, t3, a2
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr1, vr2, vr1
+.endif
+ vstelm.d vr1, \in1, 0, 0
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 1
+
+ alsl.d \in0, a3, \in0, 2
+
+ // tmp8
+ vld vr0, \in0, 0
+ vldx vr1, \in0, a3
+ QPEL8_HV_H_LSX vr12, vr13
+ QPEL8_HV_V_LSX vr14, vr15, vr16, vr17, vr18, vr19, vr12, vr6, vr7, vr0, vr1
+.ifc \type, avg
+ fldx.d f2, t3, t5
+ fldx.d f3, t3, t6
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr1, vr2, vr1
+.endif
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 0
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 1
+
+ // tmp10
+ vldx vr0, \in0, t1
+ vldx vr1, \in0, t2
+ QPEL8_HV_H_LSX vr14, vr15
+ QPEL8_HV_V_LSX vr16, vr17, vr18, vr19, vr12, vr13, vr14, vr6, vr7, vr0, vr1
+.ifc \type, avg
+ alsl.d t3, a2, t3, 2
+ fld.d f2, t3, 0
+ fldx.d f3, t3, a2
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr1, vr2, vr1
+.endif
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 0
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 1
+
+ // tmp12
+ alsl.d \in0, a3, \in0, 2
+
+ vld vr0, \in0, 0
+ vldx vr1, \in0, a3
+ QPEL8_HV_H_LSX vr16, vr17
+ QPEL8_HV_V_LSX vr18, vr19, vr12, vr13, vr14, vr15, vr16, vr6, vr7, vr0, vr1
+.ifc \type, avg
+ fldx.d f2, t3, t5
+ fldx.d f3, t3, t6
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr1, vr2, vr1
+.endif
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 0
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 1
+.endm
+
+function put_h264_qpel8_hv_lowpass_lsx
+ slli.d t1, a3, 1
+ add.d t2, t1, a3
+ addi.d sp, sp, -8
+ fst.d f24, sp, 0
+ addi.d t0, a1, -2 // t0 = src - 2
+ sub.d t0, t0, t1 // t0 = t0 - 2 * stride
+ vldi vr20, 0x414 // h_20
+ vldi vr21, 0x405 // h_5
+ vldi vr22, 0x814 // w_20
+ vldi vr23, 0x805 // w_5
+ addi.d t4, zero, 512
+ vreplgr2vr.w vr24, t4 // w_512
+ h264_qpel8_hv_lowpass_core_lsx t0, a0, put
+ fld.d f24, sp, 0
+ addi.d sp, sp, 8
+endfunc
+
+function put_h264_qpel8_h_lowpass_lsx
+ slli.d t1, a3, 1
+ add.d t2, t1, a3
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t3, a1, zero // t3 = src
+.rept 2
+ vld vr0, t0, 0
+ vldx vr1, t0, a3
+ QPEL8_H_LSX vr12, vr13
+ vssrani.bu.h vr13, vr12, 5
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr13, a0, 0, 1
+ add.d a0, a0, a2
+ vldx vr0, t0, t1
+ vldx vr1, t0, t2
+ QPEL8_H_LSX vr12, vr13
+ vssrani.bu.h vr13, vr12, 5
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr13, a0, 0, 1
+ add.d a0, a0, a2
+ alsl.d t0, a3, t0, 2
+.endr
+endfunc
+
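+// put_pixels16_l2_8_lsx(dst, src, half, dstStride, srcStride): dst = rounded
+// average of a 16x16 src block and a 16x16 half-pel buffer whose stride is a
+// fixed 16 bytes.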
+function put_pixels16_l2_8_lsx
+ slli.d t0, a4, 1
+ add.d t1, t0, a4
+ slli.d t2, t0, 1
+ slli.d t3, a3, 1
+ add.d t4, t3, a3
+ slli.d t5, t3, 1
+.rept 4
+ vld vr0, a1, 0
+ vldx vr1, a1, a4
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vld vr8, a2, 0x00
+ vld vr9, a2, 0x10
+ vld vr10, a2, 0x20
+ vld vr11, a2, 0x30
+ addi.d a2, a2, 0x40
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr1, vr9, vr1
+ vavgr.bu vr2, vr10, vr2
+ vavgr.bu vr3, vr11, vr3
+ vst vr0, a0, 0
+ vstx vr1, a0, a3
+ vstx vr2, a0, t3
+ vstx vr3, a0, t4
+ add.d a0, a0, t5
+.endr
+endfunc
+
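+// 8-pixel-wide variant of QPEL8_V_LSX: vertically filters the seven 8-byte
+// rows \in0..\in6 (two six-row windows) and packs the two resulting output
+// rows as bytes into vr8.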
+.macro QPEL8_V1_LSX in0, in1, in2, in3, in4, in5, in6
+ vilvl.b vr7, \in3, \in2
+ vilvl.b vr8, \in4, \in3
+ vilvl.b vr9, \in4, \in1
+ vilvl.b vr10, \in5, \in2
+ vilvl.b vr11, \in5, \in0
+ vilvl.b vr12, \in6, \in1
+ vhaddw.hu.bu vr7, vr7, vr7
+ vhaddw.hu.bu vr8, vr8, vr8
+ vhaddw.hu.bu vr9, vr9, vr9
+ vhaddw.hu.bu vr10, vr10, vr10
+ vhaddw.hu.bu vr11, vr11, vr11
+ vhaddw.hu.bu vr12, vr12, vr12
+ vmul.h vr7, vr7, vr20
+ vmul.h vr8, vr8, vr20
+ vmul.h vr9, vr9, vr21
+ vmul.h vr10, vr10, vr21
+ vssub.h vr7, vr7, vr9
+ vssub.h vr8, vr8, vr10
+ vsadd.h vr7, vr7, vr11
+ vsadd.h vr8, vr8, vr12
+ vsadd.h vr7, vr7, vr22
+ vsadd.h vr8, vr8, vr22
+ vssrani.bu.h vr8, vr7, 5
+.endm
+
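+// put/avg 8x8 vertical half-pel (a2 = dstStride, a3 = srcStride): streams the
+// source rows through QPEL8_V1_LSX; the "avg" type also averages each pair of
+// output rows with the current dst contents.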
+.macro h264_qpel8_v_lowpass_lsx type
+function \type\()_h264_qpel8_v_lowpass_lsx
+ slli.d t0, a3, 1
+ add.d t1, t0, a3
+ sub.d t2, a1, t0 // t2 = src - 2 * stride
+.ifc \type, avg
+ addi.d t3, a0, 0
+ slli.d t4, a2, 1
+ add.d t5, t4, a2
+.endif
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+
+ fld.d f0, t2, 0
+ fldx.d f1, t2, a3
+ fldx.d f2, t2, t0
+ fldx.d f3, t2, t1
+ alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride
+ fld.d f4, t2, 0
+ fldx.d f5, t2, a3
+ fldx.d f6, t2, t0
+ QPEL8_V1_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \type, avg
+ fld.d f0, t3, 0
+ fldx.d f1, t3, a2
+ vilvl.d vr0, vr1, vr0
+ vavgr.bu vr8, vr8, vr0
+.endif
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr8, a0, 0, 1
+ add.d a0, a0, a2
+
+ fldx.d f0, t2, t1
+ alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride
+ fld.d f1, t2, 0
+ QPEL8_V1_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+.ifc \type, avg
+ fldx.d f2, t3, t4
+ fldx.d f3, t3, t5
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr8, vr8, vr2
+.endif
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr8, a0, 0, 1
+ add.d a0, a0, a2
+
+ alsl.d t3, a2, t3, 2
+
+ fldx.d f2, t2, a3
+ fldx.d f3, t2, t0
+ QPEL8_V1_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+.ifc \type, avg
+ fld.d f4, t3, 0
+ fldx.d f5, t3, a2
+ vilvl.d vr4, vr5, vr4
+ vavgr.bu vr8, vr8, vr4
+.endif
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr8, a0, 0, 1
+ add.d a0, a0, a2
+
+ fldx.d f4, t2, t1
+ alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride
+ fld.d f5, t2, 0
+ QPEL8_V1_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+.ifc \type, avg
+ fldx.d f6, t3, t4
+ fldx.d f0, t3, t5
+ vilvl.d vr6, vr0, vr6
+ vavgr.bu vr8, vr8, vr6
+.endif
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr8, a0, 0, 1
+endfunc
+.endm
+
+h264_qpel8_v_lowpass_lsx put
+h264_qpel8_v_lowpass_lsx avg
+
+function avg_pixels16_l2_8_lsx
+ slli.d t0, a4, 1
+ add.d t1, t0, a4
+ slli.d t2, t0, 1
+ slli.d t3, a3, 1
+ add.d t4, t3, a3
+ slli.d t5, t3, 1
+ addi.d t6, a0, 0
+.rept 4
+ vld vr0, a1, 0
+ vldx vr1, a1, a4
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vld vr8, a2, 0x00
+ vld vr9, a2, 0x10
+ vld vr10, a2, 0x20
+ vld vr11, a2, 0x30
+ addi.d a2, a2, 0x40
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr1, vr9, vr1
+ vavgr.bu vr2, vr10, vr2
+ vavgr.bu vr3, vr11, vr3
+ vld vr8, t6, 0
+ vldx vr9, t6, a3
+ vldx vr10, t6, t3
+ vldx vr11, t6, t4
+ add.d t6, t6, t5
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr1, vr9, vr1
+ vavgr.bu vr2, vr10, vr2
+ vavgr.bu vr3, vr11, vr3
+ vst vr0, a0, 0
+ vstx vr1, a0, a3
+ vstx vr2, a0, t3
+ vstx vr3, a0, t4
+ add.d a0, a0, t5
+.endr
+endfunc
+
+function avg_h264_qpel8_hv_lowpass_lsx
+ slli.d t1, a3, 1
+ add.d t2, t1, a3
+ slli.d t5, a2, 1
+ add.d t6, a2, t5
+ addi.d sp, sp, -8
+ fst.d f24, sp, 0
+ vldi vr20, 0x414 // h_20
+ vldi vr21, 0x405 // h_5
+ vldi vr22, 0x814 // w_20
+ vldi vr23, 0x805 // w_5
+ addi.d t4, zero, 512
+ vreplgr2vr.w vr24, t4 // w_512
+ addi.d t0, a1, -2 // t0 = src - 2
+ sub.d t0, t0, t1 // t0 = t0 - 2 * stride
+ addi.d t3, a0, 0 // t3 = dst
+ h264_qpel8_hv_lowpass_core_lsx t0, a0, avg
+ fld.d f24, sp, 0
+ addi.d sp, sp, 8
+endfunc
+
+function put_pixels8_l2_8_lsx
+ slli.d t0, a4, 1
+ add.d t1, t0, a4
+ slli.d t2, t0, 1
+.rept 2
+ vld vr0, a1, 0
+ vldx vr1, a1, a4
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vilvl.d vr0, vr1, vr0
+ vilvl.d vr2, vr3, vr2
+ vld vr8, a2, 0x00
+ vld vr9, a2, 0x08
+ vld vr10, a2, 0x10
+ vld vr11, a2, 0x18
+ vilvl.d vr8, vr9, vr8
+ vilvl.d vr10, vr11, vr10
+ addi.d a2, a2, 32
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr2, vr10, vr2
+ vstelm.d vr0, a0, 0, 0
+ add.d a0, a0, a3
+ vstelm.d vr0, a0, 0, 1
+ add.d a0, a0, a3
+ vstelm.d vr2, a0, 0, 0
+ add.d a0, a0, a3
+ vstelm.d vr2, a0, 0, 1
+ add.d a0, a0, a3
+.endr
+endfunc
+
+function ff_put_h264_qpel8_mc00_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ slli.d t2, t0, 1
+ ld.d t3, a1, 0x0
+ ldx.d t4, a1, a2
+ ldx.d t5, a1, t0
+ ldx.d t6, a1, t1
+ st.d t3, a0, 0x0
+ stx.d t4, a0, a2
+ stx.d t5, a0, t0
+ stx.d t6, a0, t1
+ add.d a1, a1, t2
+ add.d a0, a0, t2
+ ld.d t3, a1, 0x0
+ ldx.d t4, a1, a2
+ ldx.d t5, a1, t0
+ ldx.d t6, a1, t1
+ st.d t3, a0, 0x0
+ stx.d t4, a0, a2
+ stx.d t5, a0, t0
+ stx.d t6, a0, t1
+endfunc
+
+function ff_avg_h264_qpel8_mc00_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ slli.d t2, t0, 1
+ addi.d t3, a0, 0
+.rept 2
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vilvl.d vr0, vr1, vr0
+ vilvl.d vr2, vr3, vr2
+ vld vr8, t3, 0
+ vldx vr9, t3, a2
+ vldx vr10, t3, t0
+ vldx vr11, t3, t1
+ add.d t3, t3, t2
+ vilvl.d vr8, vr9, vr8
+ vilvl.d vr10, vr11, vr10
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr2, vr10, vr2
+ vstelm.d vr0, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr0, a0, 0, 1
+ add.d a0, a0, a2
+ vstelm.d vr2, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr2, a0, 0, 1
+ add.d a0, a0, a2
+.endr
+endfunc
+
+function avg_pixels8_l2_8_lsx
+ slli.d t0, a4, 1
+ add.d t1, t0, a4
+ slli.d t2, t0, 1
+ addi.d t3, a0, 0
+ slli.d t4, a3, 1
+ add.d t5, t4, a3
+ slli.d t6, t4, 1
+.rept 2
+ vld vr0, a1, 0
+ vldx vr1, a1, a4
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vilvl.d vr0, vr1, vr0
+ vilvl.d vr2, vr3, vr2
+ vld vr8, a2, 0x00
+ vld vr9, a2, 0x08
+ vld vr10, a2, 0x10
+ vld vr11, a2, 0x18
+ addi.d a2, a2, 0x20
+ vilvl.d vr8, vr9, vr8
+ vilvl.d vr10, vr11, vr10
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr2, vr10, vr2
+ vld vr8, t3, 0
+ vldx vr9, t3, a3
+ vldx vr10, t3, t4
+ vldx vr11, t3, t5
+ add.d t3, t3, t6
+ vilvl.d vr8, vr9, vr8
+ vilvl.d vr10, vr11, vr10
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr2, vr10, vr2
+ vstelm.d vr0, a0, 0, 0
+ add.d a0, a0, a3
+ vstelm.d vr0, a0, 0, 1
+ add.d a0, a0, a3
+ vstelm.d vr2, a0, 0, 0
+ add.d a0, a0, a3
+ vstelm.d vr2, a0, 0, 1
+ add.d a0, a0, a3
+.endr
+endfunc
+
+function avg_h264_qpel8_h_lowpass_lsx
+ slli.d t1, a3, 1
+ add.d t2, t1, a3
+ slli.d t5, a2, 1
+ add.d t6, t5, a2
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t3, a1, zero // t3 = src
+ addi.d t4, a0, 0 // t4 = dst
+.rept 4
+ vld vr0, t0, 0
+ vldx vr1, t0, a3
+ QPEL8_H_LSX vr12, vr13
+ vssrani.bu.h vr13, vr12, 5
+ fld.d f0, t4, 0
+ fldx.d f1, t4, a2
+ vilvl.d vr0, vr1, vr0
+ vavgr.bu vr13, vr13, vr0
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr13, a0, 0, 1
+ add.d a0, a0, a2
+ add.d t0, t0, t1
+ add.d t4, t4, t1
+.endr
+endfunc
diff --git a/libavcodec/loongarch/h264qpel_init_loongarch.c b/libavcodec/loongarch/h264qpel_init_loongarch.c
index 969c9c376c..9d3a5cb164 100644
--- a/libavcodec/loongarch/h264qpel_init_loongarch.c
+++ b/libavcodec/loongarch/h264qpel_init_loongarch.c
@@ -19,7 +19,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "h264qpel_lasx.h"
+#include "h264qpel_loongarch.h"
#include "libavutil/attributes.h"
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264qpel.h"
@@ -27,6 +27,77 @@
av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
+
+ if (have_lsx(cpu_flags)) {
+ if (8 == bit_depth) {
+ c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lsx;
+ c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_lsx;
+ c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_lsx;
+ c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_lsx;
+ c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_lsx;
+ c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_lsx;
+ c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_lsx;
+ c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_lsx;
+ c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_lsx;
+ c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_lsx;
+ c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lsx;
+ c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_lsx;
+ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_lsx;
+ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_lsx;
+ c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_lsx;
+ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_lsx;
+
+ c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_lsx;
+ c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_lsx;
+ c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_lsx;
+ c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_lsx;
+ c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_lsx;
+ c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_lsx;
+ c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_lsx;
+ c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_lsx;
+ c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_lsx;
+ c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_lsx;
+ c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_lsx;
+ c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_lsx;
+ c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_lsx;
+ c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_lsx;
+ c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_lsx;
+ c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_lsx;
+
+ c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_lsx;
+ c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_lsx;
+ c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_lsx;
+ c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_lsx;
+ c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_lsx;
+ c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_lsx;
+ c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_lsx;
+ c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_lsx;
+ c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_lsx;
+ c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_lsx;
+ c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_lsx;
+ c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_lsx;
+ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_lsx;
+ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_lsx;
+ c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_lsx;
+ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_lsx;
+
+ c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_lsx;
+ c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_lsx;
+ c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_lsx;
+ c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_lsx;
+ c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_lsx;
+ c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_lsx;
+ c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_lsx;
+ c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_lsx;
+ c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_lsx;
+ c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_lsx;
+ c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_lsx;
+ c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_lsx;
+ c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_lsx;
+ c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lsx;
+ }
+ }
+#if HAVE_LASX
if (have_lasx(cpu_flags)) {
if (8 == bit_depth) {
c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lasx;
@@ -95,4 +166,5 @@ av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lasx;
}
}
+#endif
}
diff --git a/libavcodec/loongarch/h264qpel_lasx.c b/libavcodec/loongarch/h264qpel_lasx.c
index 1c142e510e..519bb03fe6 100644
--- a/libavcodec/loongarch/h264qpel_lasx.c
+++ b/libavcodec/loongarch/h264qpel_lasx.c
@@ -21,7 +21,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "h264qpel_lasx.h"
+#include "h264qpel_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "libavutil/attributes.h"
@@ -418,157 +418,6 @@ avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
);
}
-/* avg_pixels8_8_lsx : dst = avg(src, dst)
- * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
- ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
- ptrdiff_t stride_2, stride_3, stride_4;
- __asm__ volatile (
- /* h0~h7 */
- "slli.d %[stride_2], %[srcStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
-
- "vld $vr8, %[half], 0x00 \n\t"
- "vld $vr9, %[half], 0x08 \n\t"
- "vld $vr10, %[half], 0x10 \n\t"
- "vld $vr11, %[half], 0x18 \n\t"
- "vld $vr12, %[half], 0x20 \n\t"
- "vld $vr13, %[half], 0x28 \n\t"
- "vld $vr14, %[half], 0x30 \n\t"
- "vld $vr15, %[half], 0x38 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vstelm.d $vr0, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr1, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr2, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr3, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr4, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr5, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr6, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr7, %[dst], 0, 0 \n\t"
- : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
- [stride_4]"=&r"(stride_4)
- : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
- : "memory"
- );
-}
-
-/* avg_pixels8_8_lsx : dst = avg(src, dst)
- * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
- ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
- uint8_t *tmp = dst;
- ptrdiff_t stride_2, stride_3, stride_4;
- __asm__ volatile (
- /* h0~h7 */
- "slli.d %[stride_2], %[srcStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
-
- "vld $vr8, %[half], 0x00 \n\t"
- "vld $vr9, %[half], 0x08 \n\t"
- "vld $vr10, %[half], 0x10 \n\t"
- "vld $vr11, %[half], 0x18 \n\t"
- "vld $vr12, %[half], 0x20 \n\t"
- "vld $vr13, %[half], 0x28 \n\t"
- "vld $vr14, %[half], 0x30 \n\t"
- "vld $vr15, %[half], 0x38 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "slli.d %[stride_2], %[dstStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[dstStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "vld $vr8, %[tmp], 0 \n\t"
- "vldx $vr9, %[tmp], %[dstStride] \n\t"
- "vldx $vr10, %[tmp], %[stride_2] \n\t"
- "vldx $vr11, %[tmp], %[stride_3] \n\t"
- "add.d %[tmp], %[tmp], %[stride_4] \n\t"
- "vld $vr12, %[tmp], 0 \n\t"
- "vldx $vr13, %[tmp], %[dstStride] \n\t"
- "vldx $vr14, %[tmp], %[stride_2] \n\t"
- "vldx $vr15, %[tmp], %[stride_3] \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vstelm.d $vr0, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr1, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr2, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr3, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr4, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr5, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr6, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr7, %[dst], 0, 0 \n\t"
- : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
- [src]"+&r"(src), [stride_2]"=&r"(stride_2),
- [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
- : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
- : "memory"
- );
-}
-
/* put_pixels16_8_lsx: dst = src */
static av_always_inline void
put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
@@ -729,254 +578,6 @@ avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
);
}
-/* avg_pixels16_8_lsx : dst = avg(src, dst)
- * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
- ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
- ptrdiff_t stride_2, stride_3, stride_4;
- ptrdiff_t dstride_2, dstride_3, dstride_4;
- __asm__ volatile (
- "slli.d %[stride_2], %[srcStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "slli.d %[dstride_2], %[dstStride], 1 \n\t"
- "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
- "slli.d %[dstride_4], %[dstride_2], 1 \n\t"
- /* h0~h7 */
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
-
- "vld $vr8, %[half], 0x00 \n\t"
- "vld $vr9, %[half], 0x10 \n\t"
- "vld $vr10, %[half], 0x20 \n\t"
- "vld $vr11, %[half], 0x30 \n\t"
- "vld $vr12, %[half], 0x40 \n\t"
- "vld $vr13, %[half], 0x50 \n\t"
- "vld $vr14, %[half], 0x60 \n\t"
- "vld $vr15, %[half], 0x70 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vst $vr0, %[dst], 0 \n\t"
- "vstx $vr1, %[dst], %[dstStride] \n\t"
- "vstx $vr2, %[dst], %[dstride_2] \n\t"
- "vstx $vr3, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
- "vst $vr4, %[dst], 0 \n\t"
- "vstx $vr5, %[dst], %[dstStride] \n\t"
- "vstx $vr6, %[dst], %[dstride_2] \n\t"
- "vstx $vr7, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
-
- /* h8~h15 */
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
-
- "vld $vr8, %[half], 0x80 \n\t"
- "vld $vr9, %[half], 0x90 \n\t"
- "vld $vr10, %[half], 0xa0 \n\t"
- "vld $vr11, %[half], 0xb0 \n\t"
- "vld $vr12, %[half], 0xc0 \n\t"
- "vld $vr13, %[half], 0xd0 \n\t"
- "vld $vr14, %[half], 0xe0 \n\t"
- "vld $vr15, %[half], 0xf0 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vst $vr0, %[dst], 0 \n\t"
- "vstx $vr1, %[dst], %[dstStride] \n\t"
- "vstx $vr2, %[dst], %[dstride_2] \n\t"
- "vstx $vr3, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
- "vst $vr4, %[dst], 0 \n\t"
- "vstx $vr5, %[dst], %[dstStride] \n\t"
- "vstx $vr6, %[dst], %[dstride_2] \n\t"
- "vstx $vr7, %[dst], %[dstride_3] \n\t"
- : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
- [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2),
- [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
- : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
- : "memory"
- );
-}
-
-/* avg_pixels16_8_lsx : dst = avg(src, dst)
- * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
- ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
- uint8_t *tmp = dst;
- ptrdiff_t stride_2, stride_3, stride_4;
- ptrdiff_t dstride_2, dstride_3, dstride_4;
- __asm__ volatile (
- "slli.d %[stride_2], %[srcStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "slli.d %[dstride_2], %[dstStride], 1 \n\t"
- "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
- "slli.d %[dstride_4], %[dstride_2], 1 \n\t"
- /* h0~h7 */
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
-
- "vld $vr8, %[half], 0x00 \n\t"
- "vld $vr9, %[half], 0x10 \n\t"
- "vld $vr10, %[half], 0x20 \n\t"
- "vld $vr11, %[half], 0x30 \n\t"
- "vld $vr12, %[half], 0x40 \n\t"
- "vld $vr13, %[half], 0x50 \n\t"
- "vld $vr14, %[half], 0x60 \n\t"
- "vld $vr15, %[half], 0x70 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vld $vr8, %[tmp], 0 \n\t"
- "vldx $vr9, %[tmp], %[dstStride] \n\t"
- "vldx $vr10, %[tmp], %[dstride_2] \n\t"
- "vldx $vr11, %[tmp], %[dstride_3] \n\t"
- "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
- "vld $vr12, %[tmp], 0 \n\t"
- "vldx $vr13, %[tmp], %[dstStride] \n\t"
- "vldx $vr14, %[tmp], %[dstride_2] \n\t"
- "vldx $vr15, %[tmp], %[dstride_3] \n\t"
- "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vst $vr0, %[dst], 0 \n\t"
- "vstx $vr1, %[dst], %[dstStride] \n\t"
- "vstx $vr2, %[dst], %[dstride_2] \n\t"
- "vstx $vr3, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
- "vst $vr4, %[dst], 0 \n\t"
- "vstx $vr5, %[dst], %[dstStride] \n\t"
- "vstx $vr6, %[dst], %[dstride_2] \n\t"
- "vstx $vr7, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
-
- /* h8~h15 */
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
-
- "vld $vr8, %[half], 0x80 \n\t"
- "vld $vr9, %[half], 0x90 \n\t"
- "vld $vr10, %[half], 0xa0 \n\t"
- "vld $vr11, %[half], 0xb0 \n\t"
- "vld $vr12, %[half], 0xc0 \n\t"
- "vld $vr13, %[half], 0xd0 \n\t"
- "vld $vr14, %[half], 0xe0 \n\t"
- "vld $vr15, %[half], 0xf0 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vld $vr8, %[tmp], 0 \n\t"
- "vldx $vr9, %[tmp], %[dstStride] \n\t"
- "vldx $vr10, %[tmp], %[dstride_2] \n\t"
- "vldx $vr11, %[tmp], %[dstride_3] \n\t"
- "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
- "vld $vr12, %[tmp], 0 \n\t"
- "vldx $vr13, %[tmp], %[dstStride] \n\t"
- "vldx $vr14, %[tmp], %[dstride_2] \n\t"
- "vldx $vr15, %[tmp], %[dstride_3] \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vst $vr0, %[dst], 0 \n\t"
- "vstx $vr1, %[dst], %[dstStride] \n\t"
- "vstx $vr2, %[dst], %[dstride_2] \n\t"
- "vstx $vr3, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
- "vst $vr4, %[dst], 0 \n\t"
- "vstx $vr5, %[dst], %[dstStride] \n\t"
- "vstx $vr6, %[dst], %[dstride_2] \n\t"
- "vstx $vr7, %[dst], %[dstride_3] \n\t"
- : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
- [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2),
- [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
- : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
- : "memory"
- );
-}
-
#define QPEL8_H_LOWPASS(out_v) \
src00 = __lasx_xvld(src, - 2); \
src += srcStride; \
diff --git a/libavcodec/loongarch/h264qpel_lasx.h b/libavcodec/loongarch/h264qpel_lasx.h
deleted file mode 100644
index 32b6b50917..0000000000
--- a/libavcodec/loongarch/h264qpel_lasx.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H
-#define AVCODEC_LOONGARCH_H264QPEL_LASX_H
-
-#include <stdint.h>
-#include <stddef.h>
-#include "libavcodec/h264.h"
-
-void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
- int alpha, int beta, int8_t *tc0);
-void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
- int alpha, int beta, int8_t *tc0);
-void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-
-void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-#endif // #ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H
diff --git a/libavcodec/loongarch/h264qpel_loongarch.h b/libavcodec/loongarch/h264qpel_loongarch.h
new file mode 100644
index 0000000000..68232730da
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel_loongarch.h
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavcodec/h264.h"
+#include "config.h"
+
+void put_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void avg_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src, int dstStride,
+ int srcStride);
+void avg_h264_qpel8_v_lowpass_lsx(uint8_t *dst, uint8_t *src, int dstStride,
+ int srcStride);
+void avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void ff_put_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+
+#if HAVE_LASX
+void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+#endif
+
+#endif // #ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264qpel_lsx.c b/libavcodec/loongarch/h264qpel_lsx.c
new file mode 100644
index 0000000000..12b3bae6d1
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel_lsx.c
@@ -0,0 +1,487 @@
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264qpel_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "libavutil/attributes.h"
+
+static void put_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+ put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+ put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+ src += srcStride << 3;
+ dst += dstStride << 3;
+ put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+ put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ put_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+static void put_h264_qpel16_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ int dstStride, int srcStride)
+{
+ put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+ put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+ src += srcStride << 3;
+ dst += dstStride << 3;
+ put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+ put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ int dstStride, int srcStride)
+{
+ put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+ put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+ put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 256;
+
+ put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 256;
+
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride);
+ put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 256;
+
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride);
+ put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 256;
+
+ put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ int dstStride, int srcStride)
+{
+ avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+ avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+ avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel16_v_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[256];
+
+ put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 256;
+
+ put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 256;
+
+ put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[256];
+
+ put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 256;
+
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 256;
+
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+ avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+ avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+ src += srcStride << 3;
+ dst += dstStride << 3;
+ avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+ avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 64;
+
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 64;
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 64;
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 64;
+
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ put_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ put_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 64;
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 64;
+
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 64;
+
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 64;
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
--
2.20.1
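
For reference when reading the qpel functions in this patch: the *_l2 helpers combine a source block with a temporary half-pel plane ("half", a contiguous scratch buffer with a fixed stride of 8 or 16) by rounded averaging, exactly as the comments in the removed C code describe (put = avg(src, half), avg = avg(avg(src, half), dst)). A minimal scalar sketch of the 8-wide pair follows; it is illustrative only, not part of the patch, and the _ref names are hypothetical.

#include <stdint.h>
#include <stddef.h>

/* dst = rounded_avg(src, half); the "half" plane has a fixed stride of 8. */
static void put_pixels8_l2_ref(uint8_t *dst, const uint8_t *src,
                               const uint8_t *half,
                               ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (src[x] + half[x] + 1) >> 1;   /* vavgr.bu */
        dst  += dstStride;
        src  += srcStride;
        half += 8;
    }
}

/* dst = rounded_avg(dst, rounded_avg(src, half)) */
static void avg_pixels8_l2_ref(uint8_t *dst, const uint8_t *src,
                               const uint8_t *half,
                               ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            int p = (src[x] + half[x] + 1) >> 1;    /* first average      */
            dst[x] = (dst[x] + p + 1) >> 1;         /* then blend with dst */
        }
        dst  += dstStride;
        src  += srcStride;
        half += 8;
    }
}

The fixed half stride exists because the callers fill "half" from small on-stack temporaries (uint8_t half[64] / temp[512]) rather than from a frame buffer.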
* [FFmpeg-devel] [PATCH v5 5/7] swscale/la: Optimize the functions of the swscale series with lsx.
2023-05-25 7:24 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
` (3 preceding siblings ...)
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 4/7] avcodec/la: Add LSX optimization for h264 qpel Hao Chen
@ 2023-05-25 7:24 ` Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 6/7] swscale/la: Add following builtin optimized functions Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 7/7] avutil/la: Add function performance testing Hao Chen
6 siblings, 0 replies; 17+ messages in thread
From: Hao Chen @ 2023-05-25 7:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Lu Wang
From: Lu Wang <wanglu@loongson.cn>
./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480
-pix_fmt bgra -y /dev/null -an
before: 91fps
after: 160fps
---
libswscale/loongarch/Makefile | 5 +
libswscale/loongarch/input.S | 285 +++
libswscale/loongarch/output.S | 138 ++
libswscale/loongarch/output_lasx.c | 4 +-
libswscale/loongarch/output_lsx.c | 1828 ++++++++++++++++
libswscale/loongarch/swscale.S | 1868 +++++++++++++++++
libswscale/loongarch/swscale_init_loongarch.c | 32 +-
libswscale/loongarch/swscale_loongarch.h | 43 +-
libswscale/loongarch/swscale_lsx.c | 57 +
libswscale/utils.c | 3 +-
10 files changed, 4256 insertions(+), 7 deletions(-)
create mode 100644 libswscale/loongarch/input.S
create mode 100644 libswscale/loongarch/output.S
create mode 100644 libswscale/loongarch/output_lsx.c
create mode 100644 libswscale/loongarch/swscale.S
create mode 100644 libswscale/loongarch/swscale_lsx.c
diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index 8e665e826c..c0b6a449c0 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -4,3 +4,8 @@ LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
loongarch/yuv2rgb_lasx.o \
loongarch/rgb2rgb_lasx.o \
loongarch/output_lasx.o
+LSX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale.o \
+ loongarch/swscale_lsx.o \
+ loongarch/input.o \
+ loongarch/output.o \
+ loongarch/output_lsx.o
diff --git a/libswscale/loongarch/input.S b/libswscale/loongarch/input.S
new file mode 100644
index 0000000000..d01f7384b1
--- /dev/null
+++ b/libswscale/loongarch/input.S
@@ -0,0 +1,285 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
+ * int width, int32_t *rgb2yuv)
+ */
+function planar_rgb_to_y_lsx
+ ld.d a5, a1, 0
+ ld.d a6, a1, 8
+ ld.d a7, a1, 16
+
+ ld.w t1, a3, 0 // ry
+ ld.w t2, a3, 4 // gy
+ ld.w t3, a3, 8 // by
+ li.w t4, 9
+ li.w t5, 524544
+ li.w t7, 4
+ li.w t8, 8
+ vldi vr7, 0
+ vreplgr2vr.w vr1, t1
+ vreplgr2vr.w vr2, t2
+ vreplgr2vr.w vr3, t3
+ vreplgr2vr.w vr4, t4
+ vreplgr2vr.w vr5, t5
+ bge a2, t8, .WIDTH8
+ bge a2, t7, .WIDTH4
+ blt zero, a2, .WIDTH
+ b .END
+
+.WIDTH8:
+ vld vr8, a5, 0
+ vld vr9, a6, 0
+ vld vr10, a7, 0
+ vilvl.b vr11, vr7, vr8
+ vilvl.b vr12, vr7, vr9
+ vilvl.b vr13, vr7, vr10
+ vilvl.h vr14, vr7, vr11
+ vilvl.h vr15, vr7, vr12
+ vilvl.h vr16, vr7, vr13
+ vilvh.h vr17, vr7, vr11
+ vilvh.h vr18, vr7, vr12
+ vilvh.h vr19, vr7, vr13
+ vmul.w vr20, vr1, vr16
+ vmul.w vr21, vr1, vr19
+ vmadd.w vr20, vr2, vr14
+ vmadd.w vr20, vr3, vr15
+ vmadd.w vr21, vr2, vr17
+ vmadd.w vr21, vr3, vr18
+ vadd.w vr20, vr20, vr5
+ vadd.w vr21, vr21, vr5
+ vsra.w vr20, vr20, vr4
+ vsra.w vr21, vr21, vr4
+ vpickev.h vr20, vr21, vr20
+ vst vr20, a0, 0
+ addi.d a2, a2, -8
+ addi.d a5, a5, 8
+ addi.d a6, a6, 8
+ addi.d a7, a7, 8
+ addi.d a0, a0, 16
+ bge a2, t8, .WIDTH8
+ bge a2, t7, .WIDTH4
+ blt zero, a2, .WIDTH
+ b .END
+
+.WIDTH4:
+ vld vr8, a5, 0
+ vld vr9, a6, 0
+ vld vr10, a7, 0
+ vilvl.b vr11, vr7, vr8
+ vilvl.b vr12, vr7, vr9
+ vilvl.b vr13, vr7, vr10
+ vilvl.h vr14, vr7, vr11
+ vilvl.h vr15, vr7, vr12
+ vilvl.h vr16, vr7, vr13
+ vmul.w vr17, vr1, vr16
+ vmadd.w vr17, vr2, vr14
+ vmadd.w vr17, vr3, vr15
+ vadd.w vr17, vr17, vr5
+ vsra.w vr17, vr17, vr4
+ vpickev.h vr17, vr17, vr17
+ vstelm.d vr17, a0, 0, 0
+ addi.d a2, a2, -4
+ addi.d a5, a5, 4
+ addi.d a6, a6, 4
+ addi.d a7, a7, 4
+ addi.d a0, a0, 8
+ bge a2, t7, .WIDTH4
+ blt zero, a2, .WIDTH
+ b .END
+
+.WIDTH:
+ ld.bu t0, a5, 0
+ ld.bu t4, a6, 0
+ ld.bu t6, a7, 0
+ mul.w t8, t6, t1
+ mul.w t7, t0, t2
+ add.w t8, t8, t7
+ mul.w t7, t4, t3
+ add.w t8, t8, t7
+ add.w t8, t8, t5
+ srai.w t8, t8, 9
+ st.h t8, a0, 0
+ addi.d a2, a2, -1
+ addi.d a5, a5, 1
+ addi.d a6, a6, 1
+ addi.d a7, a7, 1
+ addi.d a0, a0, 2
+ blt zero, a2, .WIDTH
+.END:
+endfunc
+
+/* void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+ * int width, int32_t *rgb2yuv)
+ */
+function planar_rgb_to_uv_lsx
+ addi.d sp, sp, -24
+ st.d s1, sp, 0
+ st.d s2, sp, 8
+ st.d s3, sp, 16
+
+ ld.d a5, a2, 0
+ ld.d a6, a2, 8
+ ld.d a7, a2, 16
+ ld.w t1, a4, 12 // ru
+ ld.w t2, a4, 16 // gu
+ ld.w t3, a4, 20 // bu
+ ld.w s1, a4, 24 // rv
+ ld.w s2, a4, 28 // gv
+ ld.w s3, a4, 32 // bv
+ li.w t4, 9
+ li.w t5, 4194560
+ li.w t7, 4
+ li.w t8, 8
+ vldi vr0, 0
+ vreplgr2vr.w vr1, t1
+ vreplgr2vr.w vr2, t2
+ vreplgr2vr.w vr3, t3
+ vreplgr2vr.w vr4, s1
+ vreplgr2vr.w vr5, s2
+ vreplgr2vr.w vr6, s3
+ vreplgr2vr.w vr7, t4
+ vreplgr2vr.w vr8, t5
+ bge a2, t8, .LOOP_WIDTH8
+ bge a2, t7, .LOOP_WIDTH4
+ blt zero, a2, .LOOP_WIDTH
+ b .LOOP_END
+
+.LOOP_WIDTH8:
+ vld vr9, a5, 0
+ vld vr10, a6, 0
+ vld vr11, a7, 0
+ vilvl.b vr9, vr0, vr9
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr0, vr11
+ vilvl.h vr12, vr0, vr9
+ vilvl.h vr13, vr0, vr10
+ vilvl.h vr14, vr0, vr11
+ vilvh.h vr15, vr0, vr9
+ vilvh.h vr16, vr0, vr10
+ vilvh.h vr17, vr0, vr11
+ vmul.w vr18, vr1, vr14
+ vmul.w vr19, vr1, vr17
+ vmul.w vr20, vr4, vr14
+ vmul.w vr21, vr4, vr17
+ vmadd.w vr18, vr2, vr12
+ vmadd.w vr18, vr3, vr13
+ vmadd.w vr19, vr2, vr15
+ vmadd.w vr19, vr3, vr16
+ vmadd.w vr20, vr5, vr12
+ vmadd.w vr20, vr6, vr13
+ vmadd.w vr21, vr5, vr15
+ vmadd.w vr21, vr6, vr16
+ vadd.w vr18, vr18, vr8
+ vadd.w vr19, vr19, vr8
+ vadd.w vr20, vr20, vr8
+ vadd.w vr21, vr21, vr8
+ vsra.w vr18, vr18, vr7
+ vsra.w vr19, vr19, vr7
+ vsra.w vr20, vr20, vr7
+ vsra.w vr21, vr21, vr7
+ vpickev.h vr18, vr19, vr18
+ vpickev.h vr20, vr21, vr20
+ vst vr18, a0, 0
+ vst vr20, a1, 0
+ addi.d a3, a3, -8
+ addi.d a5, a5, 8
+ addi.d a6, a6, 8
+ addi.d a7, a7, 8
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+ bge a3, t8, .LOOP_WIDTH8
+ bge a3, t7, .LOOP_WIDTH4
+ blt zero, a3, .LOOP_WIDTH
+ b .LOOP_END
+
+.LOOP_WIDTH4:
+ vld vr9, a5, 0
+ vld vr10, a6, 0
+ vld vr11, a7, 0
+ vilvl.b vr9, vr0, vr9
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr0, vr11
+ vilvl.h vr12, vr0, vr9
+ vilvl.h vr13, vr0, vr10
+ vilvl.h vr14, vr0, vr11
+ vmul.w vr18, vr1, vr14
+ vmul.w vr19, vr4, vr14
+ vmadd.w vr18, vr2, vr12
+ vmadd.w vr18, vr3, vr13
+ vmadd.w vr19, vr5, vr12
+ vmadd.w vr19, vr6, vr13
+ vadd.w vr18, vr18, vr8
+ vadd.w vr19, vr19, vr8
+ vsra.w vr18, vr18, vr7
+ vsra.w vr19, vr19, vr7
+ vpickev.h vr18, vr18, vr18
+ vpickev.h vr19, vr19, vr19
+ vstelm.d vr18, a0, 0, 0
+ vstelm.d vr19, a1, 0, 0
+ addi.d a3, a3, -4
+ addi.d a5, a5, 4
+ addi.d a6, a6, 4
+ addi.d a7, a7, 4
+ addi.d a0, a0, 8
+ addi.d a1, a1, 8
+ bge a3, t7, .LOOP_WIDTH4
+ blt zero, a3, .LOOP_WIDTH
+ b .LOOP_END
+
+.LOOP_WIDTH:
+ ld.bu t0, a5, 0
+ ld.bu t4, a6, 0
+ ld.bu t6, a7, 0
+ mul.w t8, t6, t1
+ mul.w t7, t0, t2
+ add.w t8, t8, t7
+ mul.w t7, t4, t3
+ add.w t8, t8, t7
+ add.w t8, t8, t5
+ srai.w t8, t8, 9
+ st.h t8, a0, 0
+ mul.w t8, t6, s1
+ mul.w t7, t0, s2
+ add.w t8, t8, t7
+ mul.w t7, t4, s3
+ add.w t8, t8, t7
+ add.w t8, t8, t5
+ srai.w t8, t8, 9
+ st.h t8, a1, 0
+ addi.d a3, a3, -1
+ addi.d a5, a5, 1
+ addi.d a6, a6, 1
+ addi.d a7, a7, 1
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ blt zero, a3, .LOOP_WIDTH
+
+.LOOP_END:
+ ld.d s1, sp, 0
+ ld.d s2, sp, 8
+ ld.d s3, sp, 16
+ addi.d sp, sp, 24
+endfunc
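
As a reading aid for the assembly above: the scalar tail loop of planar_rgb_to_y_lsx (label .WIDTH) computes one fixed-point luma sample per pixel from the three input planes, and planar_rgb_to_uv_lsx repeats the same pattern with the U and V coefficient pairs. The sketch below is derived directly from that instruction sequence (plane/coefficient pairing, the 524544 bias, and the >>9 shift); it is a hypothetical reference, not code from the patch.

#include <stdint.h>

/* Scalar equivalent of the .WIDTH tail loop of planar_rgb_to_y_lsx. */
static void planar_rgb_to_y_ref(uint8_t *_dst, const uint8_t *src[4],
                                int width, const int32_t *rgb2yuv)
{
    uint16_t *dst = (uint16_t *)_dst;
    int32_t ry = rgb2yuv[0], gy = rgb2yuv[1], by = rgb2yuv[2];

    for (int i = 0; i < width; i++) {
        /* ry * src[2] + gy * src[0] + by * src[1], as in the mul.w/add.w chain */
        int v = ry * src[2][i] + gy * src[0][i] + by * src[1][i];
        dst[i] = (v + 524544) >> 9;   /* add.w t5 bias, srai.w 9, st.h */
    }
}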
diff --git a/libswscale/loongarch/output.S b/libswscale/loongarch/output.S
new file mode 100644
index 0000000000..b44bac502a
--- /dev/null
+++ b/libswscale/loongarch/output.S
@@ -0,0 +1,138 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+ * const int16_t **src, uint8_t *dest, int dstW,
+ * const uint8_t *dither, int offset)
+ */
+function ff_yuv2planeX_8_lsx
+ addi.w t1, a6, 1
+ addi.w t2, a6, 2
+ addi.w t3, a6, 3
+ addi.w t4, a6, 4
+ addi.w t5, a6, 5
+ addi.w t6, a6, 6
+ addi.w t7, a6, 7
+ andi t0, a6, 7
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a5, t0
+ ldx.bu t1, a5, t1
+ ldx.bu t2, a5, t2
+ ldx.bu t3, a5, t3
+ ldx.bu t4, a5, t4
+ ldx.bu t5, a5, t5
+ ldx.bu t6, a5, t6
+ ldx.bu t7, a5, t7
+ vreplgr2vr.w vr0, t0
+ vreplgr2vr.w vr1, t1
+ vreplgr2vr.w vr2, t2
+ vreplgr2vr.w vr3, t3
+ vreplgr2vr.w vr4, t4
+ vreplgr2vr.w vr5, t5
+ vreplgr2vr.w vr6, t6
+ vreplgr2vr.w vr7, t7
+ vilvl.w vr0, vr2, vr0
+ vilvl.w vr4, vr6, vr4
+ vilvl.w vr1, vr3, vr1
+ vilvl.w vr5, vr7, vr5
+ vilvl.d vr12, vr4, vr0
+ vilvl.d vr13, vr5, vr1
+ li.w t5, 0
+ li.w t8, 8
+ bge a4, t8, .WIDTH8
+ blt zero, a4, .WIDTH
+ b .END
+
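+/* 8-pixels-per-iteration loop: for each of the filterSize source lines,
+ * broadcast one 16-bit coefficient (vldrepl.h) and load eight 16-bit
+ * samples (vldx at byte offset t5), accumulating even/odd products into
+ * vr2/vr3 on top of the pre-shifted dither values. */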
+.WIDTH8:
+ li.d t1, 0
+ li.d t4, 0
+ vslli.w vr2, vr12, 12
+ vslli.w vr3, vr13, 12
+ move t3, a0
+
+.FILTERSIZE8:
+ ldx.d t2, a2, t1
+ vldx vr4, t2, t5
+ vldrepl.h vr5, t3, 0
+ vmaddwev.w.h vr2, vr4, vr5
+ vmaddwod.w.h vr3, vr4, vr5
+ addi.d t1, t1, 8
+ addi.d t3, t3, 2
+ addi.d t4, t4, 1
+ blt t4, a1, .FILTERSIZE8
+ vsrai.w vr2, vr2, 19
+ vsrai.w vr3, vr3, 19
+ vclip255.w vr2, vr2
+ vclip255.w vr3, vr3
+ vpickev.h vr2, vr3, vr2
+ vpickev.b vr2, vr2, vr2
+ vbsrl.v vr3, vr2, 4
+ vilvl.b vr2, vr3, vr2
+ fst.d f2, a3, 0
+ addi.d t5, t5, 16
+ addi.d a4, a4, -8
+ addi.d a3, a3, 8
+ bge a4, t8, .WIDTH8
+ blt zero, a4, .WIDTH
+ b .END
+
+.WIDTH:
+ li.d t1, 0
+ li.d t4, 0
+ vslli.w vr2, vr12, 12
+ vslli.w vr3, vr13, 12
+.FILTERSIZE:
+ ldx.d t2, a2, t1
+ vldx vr4, t2, t5
+ vldrepl.h vr5, a0, 0
+ vmaddwev.w.h vr2, vr4, vr5
+ vmaddwod.w.h vr3, vr4, vr5
+ addi.d t1, t1, 8
+ addi.d a0, a0, 2
+ addi.d t4, t4, 1
+ blt t4, a1, .FILTERSIZE
+ vsrai.w vr2, vr2, 19
+ vsrai.w vr3, vr3, 19
+ vclip255.w vr2, vr2
+ vclip255.w vr3, vr3
+ vpickev.h vr2, vr3, vr2
+ vpickev.b vr2, vr2, vr2
+ vbsrl.v vr3, vr2, 4
+ vilvl.b vr2, vr3, vr2
+
+.DEST:
+ vstelm.b vr2, a3, 0, 0
+ vbsrl.v vr2, vr2, 1
+ addi.d a4, a4, -1
+ addi.d a3, a3, 1
+ blt zero, a4, .DEST
+.END:
+endfunc
diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
index 36a4c4503b..277d7063e6 100644
--- a/libswscale/loongarch/output_lasx.c
+++ b/libswscale/loongarch/output_lasx.c
@@ -1773,11 +1773,9 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
-#undef yuvTorgb
-#undef yuvTorgb_setup
-av_cold void ff_sws_init_output_loongarch(SwsContext *c)
+av_cold void ff_sws_init_output_lasx(SwsContext *c)
{
if(c->flags & SWS_FULL_CHR_H_INT) {
diff --git a/libswscale/loongarch/output_lsx.c b/libswscale/loongarch/output_lsx.c
new file mode 100644
index 0000000000..768cc3abc6
--- /dev/null
+++ b/libswscale/loongarch/output_lsx.c
@@ -0,0 +1,1828 @@
+/*
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+
+/* Copied from libswscale/output.c */
+static av_always_inline void
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
+ unsigned A1, unsigned A2,
+ const void *_r, const void *_g, const void *_b, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
+ target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
+ uint32_t *dest = (uint32_t *) _dest;
+ const uint32_t *r = (const uint32_t *) _r;
+ const uint32_t *g = (const uint32_t *) _g;
+ const uint32_t *b = (const uint32_t *) _b;
+
+#if CONFIG_SMALL
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#else
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
+ int sh = (target == AV_PIX_FMT_RGB32_1 ||
+ target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
+ av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
+#endif
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#endif
+ } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
+ uint8_t *dest = (uint8_t *) _dest;
+ const uint8_t *r = (const uint8_t *) _r;
+ const uint8_t *g = (const uint8_t *) _g;
+ const uint8_t *b = (const uint8_t *) _b;
+
+#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
+#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
+
+ dest[i * 6 + 0] = r_b[Y1];
+ dest[i * 6 + 1] = g[Y1];
+ dest[i * 6 + 2] = b_r[Y1];
+ dest[i * 6 + 3] = r_b[Y2];
+ dest[i * 6 + 4] = g[Y2];
+ dest[i * 6 + 5] = b_r[Y2];
+#undef r_b
+#undef b_r
+ } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
+ target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
+ target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
+ uint16_t *dest = (uint16_t *) _dest;
+ const uint16_t *r = (const uint16_t *) _r;
+ const uint16_t *g = (const uint16_t *) _g;
+ const uint16_t *b = (const uint16_t *) _b;
+ int dr1, dg1, db1, dr2, dg2, db2;
+
+ if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
+ dr1 = ff_dither_2x2_8[ y & 1 ][0];
+ dg1 = ff_dither_2x2_4[ y & 1 ][0];
+ db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+ dr2 = ff_dither_2x2_8[ y & 1 ][1];
+ dg2 = ff_dither_2x2_4[ y & 1 ][1];
+ db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+ } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
+ dr1 = ff_dither_2x2_8[ y & 1 ][0];
+ dg1 = ff_dither_2x2_8[ y & 1 ][1];
+ db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+ dr2 = ff_dither_2x2_8[ y & 1 ][1];
+ dg2 = ff_dither_2x2_8[ y & 1 ][0];
+ db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+ } else {
+ dr1 = ff_dither_4x4_16[ y & 3 ][0];
+ dg1 = ff_dither_4x4_16[ y & 3 ][1];
+ db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
+ dr2 = ff_dither_4x4_16[ y & 3 ][1];
+ dg2 = ff_dither_4x4_16[ y & 3 ][0];
+ db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
+ }
+
+ dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+ dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+ } else { /* 8/4 bits */
+ uint8_t *dest = (uint8_t *) _dest;
+ const uint8_t *r = (const uint8_t *) _r;
+ const uint8_t *g = (const uint8_t *) _g;
+ const uint8_t *b = (const uint8_t *) _b;
+ int dr1, dg1, db1, dr2, dg2, db2;
+
+ if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
+ const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
+ const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
+ dr1 = dg1 = d32[(i * 2 + 0) & 7];
+ db1 = d64[(i * 2 + 0) & 7];
+ dr2 = dg2 = d32[(i * 2 + 1) & 7];
+ db2 = d64[(i * 2 + 1) & 7];
+ } else {
+ const uint8_t * const d64 = ff_dither_8x8_73 [y & 7];
+ const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
+ dr1 = db1 = d128[(i * 2 + 0) & 7];
+ dg1 = d64[(i * 2 + 0) & 7];
+ dr2 = db2 = d128[(i * 2 + 1) & 7];
+ dg2 = d64[(i * 2 + 1) & 7];
+ }
+
+ if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
+ dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
+ ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
+ } else {
+ dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+ dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+ }
+ }
+}
+
+#define WRITE_YUV2RGB_LSX(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
+{ \
+ Y1 = __lsx_vpickve2gr_w(vec_y1, t1); \
+ Y2 = __lsx_vpickve2gr_w(vec_y2, t2); \
+ U = __lsx_vpickve2gr_w(vec_u, t3); \
+ V = __lsx_vpickve2gr_w(vec_v, t4); \
+ r = c->table_rV[V]; \
+ g = (c->table_gU[U] + c->table_gV[V]); \
+ b = c->table_bU[U]; \
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0, \
+ r, g, b, y, target, 0); \
+ count++; \
+}
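+
+/* WRITE_YUV2RGB_LSX pulls one 32-bit lane out of each argument vector
+ * (lanes t1..t4), resolves U/V through the context RGB lookup tables and
+ * writes one horizontal pair of pixels with yuv2rgb_write(), incrementing
+ * 'count'.  Roughly, per invocation:
+ *
+ *     Y1 = vec_y1[t1];  Y2 = vec_y2[t2];
+ *     U  = vec_u[t3];   V  = vec_v[t4];
+ *     yuv2rgb_write(dest, count++, Y1, Y2, 0, 0, c->table_rV[V],
+ *                   c->table_gU[U] + c->table_gV[V], c->table_bU[U],
+ *                   y, target, 0);
+ */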
+
+static void
+yuv2rgb_X_template_lsx(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest, int dstW,
+ int y, enum AVPixelFormat target, int hasAlpha)
+{
+ int i, j;
+ int count = 0;
+ int t = 1 << 18;
+ int len = dstW >> 5;
+ int res = dstW & 31;
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m128i headroom = __lsx_vreplgr2vr_w(head);
+
+ for (i = 0; i < len; i++) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
+ __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2, yh_ev1, yh_ev2, yh_od1, yh_od2;
+ __m128i u_ev1, u_ev2, u_od1, u_od2, v_ev1, v_ev2, v_od1, v_od2, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_ev1 = yl_ev;
+ yl_od1 = yl_ev;
+ yh_ev1 = yl_ev;
+ yh_od1 = yl_ev;
+ u_ev1 = yl_ev;
+ v_ev1 = yl_ev;
+ u_od1 = yl_ev;
+ v_od1 = yl_ev;
+ yl_ev2 = yl_ev;
+ yl_od2 = yl_ev;
+ yh_ev2 = yl_ev;
+ yh_od2 = yl_ev;
+ u_ev2 = yl_ev;
+ v_ev2 = yl_ev;
+ u_od2 = yl_ev;
+ v_od2 = yl_ev;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
+ 16, l_src1, l_src2);
+ DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 32, lumSrc[j] + count_lum,
+ 48, l_src3, l_src4);
+ yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
+ yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
+ yh_ev1 = __lsx_vmaddwev_w_h(yh_ev1, temp, l_src3);
+ yh_od1 = __lsx_vmaddwod_w_h(yh_od1, temp, l_src3);
+ yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
+ yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
+ yh_ev2 = __lsx_vmaddwev_w_h(yh_ev2, temp, l_src4);
+ yh_od2 = __lsx_vmaddwod_w_h(yh_od2, temp, l_src4);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src1, v_src1);
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 16, chrVSrc[j] + count, 16,
+ u_src2, v_src2);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
+ u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
+ v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
+ v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
+ u_ev2 = __lsx_vmaddwev_w_h(u_ev2, temp, u_src2);
+ u_od2 = __lsx_vmaddwod_w_h(u_od2, temp, u_src2);
+ v_ev2 = __lsx_vmaddwev_w_h(v_ev2, temp, v_src2);
+ v_od2 = __lsx_vmaddwod_w_h(v_od2, temp, v_src2);
+ }
+ yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
+ yh_ev1 = __lsx_vsrai_w(yh_ev1, 19);
+ yl_od1 = __lsx_vsrai_w(yl_od1, 19);
+ yh_od1 = __lsx_vsrai_w(yh_od1, 19);
+ u_ev1 = __lsx_vsrai_w(u_ev1, 19);
+ v_ev1 = __lsx_vsrai_w(v_ev1, 19);
+ u_od1 = __lsx_vsrai_w(u_od1, 19);
+ v_od1 = __lsx_vsrai_w(v_od1, 19);
+ yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
+ yh_ev2 = __lsx_vsrai_w(yh_ev2, 19);
+ yl_od2 = __lsx_vsrai_w(yl_od2, 19);
+ yh_od2 = __lsx_vsrai_w(yh_od2, 19);
+ u_ev2 = __lsx_vsrai_w(u_ev2, 19);
+ v_ev2 = __lsx_vsrai_w(v_ev2, 19);
+ u_od2 = __lsx_vsrai_w(u_od2, 19);
+ v_od2 = __lsx_vsrai_w(v_od2, 19);
+ u_ev1 = __lsx_vadd_w(u_ev1, headroom);
+ v_ev1 = __lsx_vadd_w(v_ev1, headroom);
+ u_od1 = __lsx_vadd_w(u_od1, headroom);
+ v_od1 = __lsx_vadd_w(v_od1, headroom);
+ u_ev2 = __lsx_vadd_w(u_ev2, headroom);
+ v_ev2 = __lsx_vadd_w(v_ev2, headroom);
+ u_od2 = __lsx_vadd_w(u_od2, headroom);
+ v_od2 = __lsx_vadd_w(v_od2, headroom);
+
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 1, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 2, 2, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 3, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 0, 0, 2, 2);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 1, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 2, 2, 3, 3);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 3, 3, 3, 3);
+ WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_ev2, v_ev2, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_od2, v_od2, 1, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_ev2, v_ev2, 2, 2, 1, 1);
+ WRITE_YUV2RGB_LSX(yh_ev1, yh_od1, u_od2, v_od2, 3, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_ev2, v_ev2, 0, 0, 2, 2);
+ WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_od2, v_od2, 1, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_ev2, v_ev2, 2, 2, 3, 3);
+ WRITE_YUV2RGB_LSX(yh_ev2, yh_od2, u_od2, v_od2, 3, 3, 3, 3);
+ }
+
+ if (res >= 16) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, l_src2, u_src1, v_src1;
+ __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2;
+ __m128i u_ev1, u_od1, v_ev1, v_od1, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_ev1 = yl_ev;
+ yl_od1 = yl_ev;
+ u_ev1 = yl_ev;
+ v_ev1 = yl_ev;
+ u_od1 = yl_ev;
+ v_od1 = yl_ev;
+ yl_ev2 = yl_ev;
+ yl_od2 = yl_ev;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
+ 16, l_src1, l_src2);
+ yl_ev1 = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
+ yl_od1 = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
+ yl_ev2 = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
+ yl_od2 = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src1, v_src1);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
+ u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
+ v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
+ v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
+ }
+ yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
+ yl_od1 = __lsx_vsrai_w(yl_od1, 19);
+ u_ev1 = __lsx_vsrai_w(u_ev1, 19);
+ v_ev1 = __lsx_vsrai_w(v_ev1, 19);
+ u_od1 = __lsx_vsrai_w(u_od1, 19);
+ v_od1 = __lsx_vsrai_w(v_od1, 19);
+ yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
+ yl_od2 = __lsx_vsrai_w(yl_od2, 19);
+ u_ev1 = __lsx_vadd_w(u_ev1, headroom);
+ v_ev1 = __lsx_vadd_w(v_ev1, headroom);
+ u_od1 = __lsx_vadd_w(u_od1, headroom);
+ v_od1 = __lsx_vadd_w(v_od1, headroom);
+
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 1, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_ev1, v_ev1, 2, 2, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev1, yl_od1, u_od1, v_od1, 3, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 0, 0, 2, 2);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 1, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_ev1, v_ev1, 2, 2, 3, 3);
+ WRITE_YUV2RGB_LSX(yl_ev2, yl_od2, u_od1, v_od1, 3, 3, 3, 3);
+ res -= 16;
+ }
+
+ if (res >= 8) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, u_src, v_src;
+ __m128i yl_ev, yl_od;
+ __m128i u_ev, u_od, v_ev, v_od, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_od = yl_ev;
+ u_ev = yl_ev;
+ v_ev = yl_ev;
+ u_od = yl_ev;
+ v_od = yl_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+ yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+ yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src, v_src);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+ u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+ v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+ v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+ }
+ yl_ev = __lsx_vsrai_w(yl_ev, 19);
+ yl_od = __lsx_vsrai_w(yl_od, 19);
+ u_ev = __lsx_vsrai_w(u_ev, 19);
+ v_ev = __lsx_vsrai_w(v_ev, 19);
+ u_od = __lsx_vsrai_w(u_od, 19);
+ v_od = __lsx_vsrai_w(v_od, 19);
+ u_ev = __lsx_vadd_w(u_ev, headroom);
+ v_ev = __lsx_vadd_w(v_ev, headroom);
+ u_od = __lsx_vadd_w(u_od, headroom);
+ v_od = __lsx_vadd_w(v_od, headroom);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
+ res -= 8;
+ }
+
+ if (res >= 4) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, u_src, v_src;
+ __m128i yl_ev, yl_od;
+ __m128i u_ev, u_od, v_ev, v_od, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_od = yl_ev;
+ u_ev = yl_ev;
+ v_ev = yl_ev;
+ u_od = yl_ev;
+ v_od = yl_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+ yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+ yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src, v_src);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+ u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+ v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+ v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+ }
+ yl_ev = __lsx_vsrai_w(yl_ev, 19);
+ yl_od = __lsx_vsrai_w(yl_od, 19);
+ u_ev = __lsx_vsrai_w(u_ev, 19);
+ v_ev = __lsx_vsrai_w(v_ev, 19);
+ u_od = __lsx_vsrai_w(u_od, 19);
+ v_od = __lsx_vsrai_w(v_od, 19);
+ u_ev = __lsx_vadd_w(u_ev, headroom);
+ v_ev = __lsx_vadd_w(v_ev, headroom);
+ u_od = __lsx_vadd_w(u_od, headroom);
+ v_od = __lsx_vadd_w(v_od, headroom);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
+ res -= 4;
+ }
+
+ if (res >= 2) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m128i l_src1, u_src, v_src;
+ __m128i yl_ev, yl_od;
+ __m128i u_ev, u_od, v_ev, v_od, temp;
+
+ yl_ev = __lsx_vldrepl_w(&t, 0);
+ yl_od = yl_ev;
+ u_ev = yl_ev;
+ v_ev = yl_ev;
+ u_od = yl_ev;
+ v_od = yl_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
+ yl_ev = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
+ yl_od = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src, v_src);
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ u_ev = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
+ u_od = __lsx_vmaddwod_w_h(u_od, temp, u_src);
+ v_ev = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
+ v_od = __lsx_vmaddwod_w_h(v_od, temp, v_src);
+ }
+ yl_ev = __lsx_vsrai_w(yl_ev, 19);
+ yl_od = __lsx_vsrai_w(yl_od, 19);
+ u_ev = __lsx_vsrai_w(u_ev, 19);
+ v_ev = __lsx_vsrai_w(v_ev, 19);
+ u_od = __lsx_vsrai_w(u_od, 19);
+ v_od = __lsx_vsrai_w(v_od, 19);
+ u_ev = __lsx_vadd_w(u_ev, headroom);
+ v_ev = __lsx_vadd_w(v_ev, headroom);
+ u_od = __lsx_vadd_w(u_od, headroom);
+ v_od = __lsx_vadd_w(v_od, headroom);
+ WRITE_YUV2RGB_LSX(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+ res -= 2;
+ }
+
+ for (; count < len_count; count++) {
+ int Y1 = 1 << 18;
+ int Y2 = Y1;
+ int U = Y1;
+ int V = Y1;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ Y1 += lumSrc[j][count * 2] * lumFilter[j];
+ Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][count] * chrFilter[j];
+ V += chrVSrc[j][count] * chrFilter[j];
+ }
+ Y1 >>= 19;
+ Y2 >>= 19;
+ U >>= 19;
+ V >>= 19;
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+}
+
+static void
+yuv2rgb_2_template_lsx(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int i, count = 0;
+ int len = dstW - 7;
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m128i v_yalpha1 = __lsx_vreplgr2vr_w(yalpha1);
+ __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
+ __m128i v_yalpha = __lsx_vreplgr2vr_w(yalpha);
+ __m128i v_uvalpha = __lsx_vreplgr2vr_w(uvalpha);
+ __m128i headroom = __lsx_vreplgr2vr_w(head);
+ __m128i zero = __lsx_vldi(0);
+
+ for (i = 0; i < len; i += 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m128i y0_h, y0_l, y0, u0, v0;
+ __m128i y1_h, y1_l, y1, u1, v1;
+ __m128i y_l, y_h, u, v;
+
+ DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+ buf1, i_dex, y0, u0, v0, y1);
+ DUP2_ARG2(__lsx_vldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
+ DUP2_ARG1(__lsx_vexth_w_h, y0, y1, y0_h, y1_h);
+ DUP4_ARG2(__lsx_vilvl_h, zero, u0, zero, u1, zero, v0, zero, v1,
+ u0, u1, v0, v1);
+ y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+ y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
+ u0 = __lsx_vmul_w(u0, v_uvalpha1);
+ v0 = __lsx_vmul_w(v0, v_uvalpha1);
+ y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+ y_h = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
+ u = __lsx_vmadd_w(u0, v_uvalpha, u1);
+ v = __lsx_vmadd_w(v0, v_uvalpha, v1);
+ y_l = __lsx_vsrai_w(y_l, 19);
+ y_h = __lsx_vsrai_w(y_h, 19);
+ u = __lsx_vsrai_w(u, 19);
+ v = __lsx_vsrai_w(v, 19);
+ u = __lsx_vadd_w(u, headroom);
+ v = __lsx_vadd_w(v, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 0, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 2, 3, 3, 3);
+ }
+ if (dstW - i >= 4) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m128i y0_l, y0, u0, v0;
+ __m128i y1_l, y1, u1, v1;
+ __m128i y_l, u, v;
+
+ y0 = __lsx_vldx(buf0, i_dex);
+ u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
+ v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
+ y1 = __lsx_vldx(buf1, i_dex);
+ u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
+ v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
+ DUP2_ARG2(__lsx_vilvl_h, zero, y0, zero, y1, y0_l, y1_l);
+ DUP4_ARG2(__lsx_vilvl_h, zero, u0, zero, u1, zero, v0, zero, v1,
+ u0, u1, v0, v1);
+ y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+ u0 = __lsx_vmul_w(u0, v_uvalpha1);
+ v0 = __lsx_vmul_w(v0, v_uvalpha1);
+ y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+ u = __lsx_vmadd_w(u0, v_uvalpha, u1);
+ v = __lsx_vmadd_w(v0, v_uvalpha, v1);
+ y_l = __lsx_vsrai_w(y_l, 19);
+ u = __lsx_vsrai_w(u, 19);
+ v = __lsx_vsrai_w(v, 19);
+ u = __lsx_vadd_w(u, headroom);
+ v = __lsx_vadd_w(v, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+ i += 4;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2] * yalpha1 +
+ buf1[count * 2] * yalpha) >> 19;
+ int Y2 = (buf0[count * 2 + 1] * yalpha1 +
+ buf1[count * 2 + 1] * yalpha) >> 19;
+ int U = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
+ int V = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+}
+
+static void
+yuv2rgb_1_template_lsx(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i;
+ int len = (dstW - 7);
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+
+ if (uvalpha < 2048) {
+ int count = 0;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m128i headroom = __lsx_vreplgr2vr_h(head);
+
+ for (i = 0; i < len; i += 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m128i src_y, src_u, src_v;
+ __m128i u, v, uv, y_l, y_h;
+
+ src_y = __lsx_vldx(buf0, i_dex);
+ DUP2_ARG2(__lsx_vldx, ubuf0, c_dex, vbuf0, c_dex, src_u, src_v);
+ src_y = __lsx_vsrari_h(src_y, 7);
+ src_u = __lsx_vsrari_h(src_u, 7);
+ src_v = __lsx_vsrari_h(src_v, 7);
+ y_l = __lsx_vsllwil_w_h(src_y, 0);
+ y_h = __lsx_vexth_w_h(src_y);
+ uv = __lsx_vilvl_h(src_v, src_u);
+ u = __lsx_vaddwev_w_h(uv, headroom);
+ v = __lsx_vaddwod_w_h(uv, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 0, 1, 2, 2);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u, v, 2, 3, 3, 3);
+ }
+ if (dstW - i >= 4){
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m128i src_y, src_u, src_v;
+ __m128i y_l, u, v, uv;
+
+ src_y = __lsx_vldx(buf0, i_dex);
+ src_u = __lsx_vldrepl_d((ubuf0 + count), 0);
+ src_v = __lsx_vldrepl_d((vbuf0 + count), 0);
+ y_l = __lsx_vsrari_h(src_y, 7);
+ y_l = __lsx_vsllwil_w_h(y_l, 0);
+ uv = __lsx_vilvl_h(src_v, src_u);
+ uv = __lsx_vsrari_h(uv, 7);
+ u = __lsx_vaddwev_w_h(uv, headroom);
+ v = __lsx_vaddwod_w_h(uv, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u, v, 2, 3, 1, 1);
+ i += 4;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2 ] + 64) >> 7;
+ int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[count] + 64) >> 7;
+ int V = (vbuf0[count] + 64) >> 7;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ int count = 0;
+ int HEADROOM = YUVRGB_TABLE_HEADROOM;
+ __m128i headroom = __lsx_vreplgr2vr_w(HEADROOM);
+
+ for (i = 0; i < len; i += 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m128i src_y, src_u0, src_v0, src_u1, src_v1;
+ __m128i y_l, y_h, u1, u2, v1, v2;
+
+ DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+ ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
+ src_v1 = __lsx_vldx(vbuf1, c_dex);
+ src_y = __lsx_vsrari_h(src_y, 7);
+            u1 = __lsx_vaddwev_w_h(src_u0, src_u1); /* U sums, even */
+            v1 = __lsx_vaddwev_w_h(src_v0, src_v1); /* V sums, even */
+            u2 = __lsx_vaddwod_w_h(src_u0, src_u1); /* U sums, odd  */
+            v2 = __lsx_vaddwod_w_h(src_v0, src_v1); /* V sums, odd  */
+ y_l = __lsx_vsllwil_w_h(src_y, 0);
+ y_h = __lsx_vexth_w_h(src_y);
+ u1 = __lsx_vsrari_w(u1, 8);
+ v1 = __lsx_vsrari_w(v1, 8);
+ u2 = __lsx_vsrari_w(u2, 8);
+ v2 = __lsx_vsrari_w(v2, 8);
+ u1 = __lsx_vadd_w(u1, headroom);
+ v1 = __lsx_vadd_w(v1, headroom);
+ u2 = __lsx_vadd_w(u2, headroom);
+ v2 = __lsx_vadd_w(v2, headroom);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u1, v1, 0, 1, 0, 0);
+ WRITE_YUV2RGB_LSX(y_l, y_l, u2, v2, 2, 3, 0, 0);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u1, v1, 0, 1, 1, 1);
+ WRITE_YUV2RGB_LSX(y_h, y_h, u2, v2, 2, 3, 1, 1);
+ }
+ if (dstW - i >= 4) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m128i src_y, src_u0, src_v0, src_u1, src_v1;
+ __m128i uv;
+
+ src_y = __lsx_vldx(buf0, i_dex);
+ src_u0 = __lsx_vldrepl_d((ubuf0 + count), 0);
+ src_v0 = __lsx_vldrepl_d((vbuf0 + count), 0);
+ src_u1 = __lsx_vldrepl_d((ubuf1 + count), 0);
+ src_v1 = __lsx_vldrepl_d((vbuf1 + count), 0);
+
+ src_u0 = __lsx_vilvl_h(src_u1, src_u0);
+ src_v0 = __lsx_vilvl_h(src_v1, src_v0);
+ src_y = __lsx_vsrari_h(src_y, 7);
+ src_y = __lsx_vsllwil_w_h(src_y, 0);
+            uv = __lsx_vilvl_w(src_v0, src_u0); /* {U0,U1, V0,V1} per sample */
+ uv = __lsx_vhaddw_w_h(uv, uv);
+ uv = __lsx_vsrari_w(uv, 8);
+ uv = __lsx_vadd_w(uv, headroom);
+ WRITE_YUV2RGB_LSX(src_y, src_y, uv, uv, 0, 1, 0, 1);
+ WRITE_YUV2RGB_LSX(src_y, src_y, uv, uv, 2, 3, 2, 3);
+ i += 4;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2 ] + 64) >> 7;
+ int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[count] + ubuf1[count] + 128) >> 8;
+ int V = (vbuf0[count] + vbuf1[count] + 128) >> 8;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+ }
+}
+
+#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _X_lsx(SwsContext *c, const int16_t *lumFilter, \
+ const int16_t **lumSrc, int lumFilterSize, \
+ const int16_t *chrFilter, const int16_t **chrUSrc, \
+ const int16_t **chrVSrc, int chrFilterSize, \
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \
+ int y) \
+{ \
+ name ## base ## _X_template_lsx(c, lumFilter, lumSrc, lumFilterSize, \
+ chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+ alpSrc, dest, dstW, y, fmt, hasAlpha); \
+}
+
+#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
+YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _2_lsx(SwsContext *c, const int16_t *buf[2], \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \
+ int yalpha, int uvalpha, int y) \
+{ \
+ name ## base ## _2_template_lsx(c, buf, ubuf, vbuf, abuf, dest, \
+ dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
+}
+
+#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
+YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _1_lsx(SwsContext *c, const int16_t *buf0, \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf0, uint8_t *dest, int dstW, \
+ int uvalpha, int y) \
+{ \
+ name ## base ## _1_template_lsx(c, buf0, ubuf, vbuf, abuf0, dest, \
+ dstW, uvalpha, y, fmt, hasAlpha); \
+}
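+
+/* The _X / _2 / _1 wrapper flavours correspond to the yuv2packedX,
+ * yuv2packed2 and yuv2packed1 callbacks wired up in ff_sws_init_output_lsx()
+ * below: arbitrary vertical filter, blend of two lines, and single-line
+ * fast path respectively. */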
+
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+#endif
+YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1, 0)
+YUV2RGBWRAPPER(yuv2rgb,, x32, AV_PIX_FMT_RGB32, 0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
+YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 16, AV_PIX_FMT_RGB565, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 15, AV_PIX_FMT_RGB555, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 12, AV_PIX_FMT_RGB444, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 8, AV_PIX_FMT_RGB8, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 4, AV_PIX_FMT_RGB4, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0)
+
+// This function is copied from libswscale/output.c
+static av_always_inline void yuv2rgb_write_full(SwsContext *c,
+ uint8_t *dest, int i, int R, int A, int G, int B,
+ int y, enum AVPixelFormat target, int hasAlpha, int err[4])
+{
+ int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
+
+ if ((R | G | B) & 0xC0000000) {
+ R = av_clip_uintp2(R, 30);
+ G = av_clip_uintp2(G, 30);
+ B = av_clip_uintp2(B, 30);
+ }
+
+ switch(target) {
+ case AV_PIX_FMT_ARGB:
+ dest[0] = hasAlpha ? A : 255;
+ dest[1] = R >> 22;
+ dest[2] = G >> 22;
+ dest[3] = B >> 22;
+ break;
+ case AV_PIX_FMT_RGB24:
+ dest[0] = R >> 22;
+ dest[1] = G >> 22;
+ dest[2] = B >> 22;
+ break;
+ case AV_PIX_FMT_RGBA:
+ dest[0] = R >> 22;
+ dest[1] = G >> 22;
+ dest[2] = B >> 22;
+ dest[3] = hasAlpha ? A : 255;
+ break;
+ case AV_PIX_FMT_ABGR:
+ dest[0] = hasAlpha ? A : 255;
+ dest[1] = B >> 22;
+ dest[2] = G >> 22;
+ dest[3] = R >> 22;
+ break;
+ case AV_PIX_FMT_BGR24:
+ dest[0] = B >> 22;
+ dest[1] = G >> 22;
+ dest[2] = R >> 22;
+ break;
+ case AV_PIX_FMT_BGRA:
+ dest[0] = B >> 22;
+ dest[1] = G >> 22;
+ dest[2] = R >> 22;
+ dest[3] = hasAlpha ? A : 255;
+ break;
+ case AV_PIX_FMT_BGR4_BYTE:
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR8:
+ case AV_PIX_FMT_RGB8:
+ {
+ int r,g,b;
+
+ switch (c->dither) {
+ default:
+ case SWS_DITHER_AUTO:
+ case SWS_DITHER_ED:
+ R >>= 22;
+ G >>= 22;
+ B >>= 22;
+ R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
+ G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
+ B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+ r = R >> (isrgb8 ? 5 : 7);
+ g = G >> (isrgb8 ? 5 : 6);
+ b = B >> (isrgb8 ? 6 : 7);
+ r = av_clip(r, 0, isrgb8 ? 7 : 1);
+ g = av_clip(g, 0, isrgb8 ? 7 : 3);
+ b = av_clip(b, 0, isrgb8 ? 3 : 1);
+ err[0] = R - r*(isrgb8 ? 36 : 255);
+ err[1] = G - g*(isrgb8 ? 36 : 85);
+ err[2] = B - b*(isrgb8 ? 85 : 255);
+ break;
+ case SWS_DITHER_A_DITHER:
+ if (isrgb8) {
+ /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
+ r = (((R >> 19) + A_DITHER(i,y) -96)>>8);
+ g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
+ b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
+ r = av_clip_uintp2(r, 3);
+ g = av_clip_uintp2(g, 3);
+ b = av_clip_uintp2(b, 2);
+ } else {
+ r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
+ g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
+ b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
+ r = av_clip_uintp2(r, 1);
+ g = av_clip_uintp2(g, 2);
+ b = av_clip_uintp2(b, 1);
+ }
+ break;
+ case SWS_DITHER_X_DITHER:
+ if (isrgb8) {
+ /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
+ r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
+ g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
+ b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
+ r = av_clip_uintp2(r, 3);
+ g = av_clip_uintp2(g, 3);
+ b = av_clip_uintp2(b, 2);
+ } else {
+ r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
+ g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
+ b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
+ r = av_clip_uintp2(r, 1);
+ g = av_clip_uintp2(g, 2);
+ b = av_clip_uintp2(b, 1);
+ }
+
+ break;
+ }
+
+ if(target == AV_PIX_FMT_BGR4_BYTE) {
+ dest[0] = r + 2*g + 8*b;
+ } else if(target == AV_PIX_FMT_RGB4_BYTE) {
+ dest[0] = b + 2*g + 8*r;
+ } else if(target == AV_PIX_FMT_BGR8) {
+ dest[0] = r + 8*g + 64*b;
+ } else if(target == AV_PIX_FMT_RGB8) {
+ dest[0] = b + 4*g + 32*r;
+ } else
+ av_assert2(0);
+ break; }
+ }
+}
+
+#define YUVTORGB_SETUP_LSX \
+ int y_offset = c->yuv2rgb_y_offset; \
+ int y_coeff = c->yuv2rgb_y_coeff; \
+ int v2r_coe = c->yuv2rgb_v2r_coeff; \
+ int v2g_coe = c->yuv2rgb_v2g_coeff; \
+ int u2g_coe = c->yuv2rgb_u2g_coeff; \
+ int u2b_coe = c->yuv2rgb_u2b_coeff; \
+ __m128i offset = __lsx_vreplgr2vr_w(y_offset); \
+ __m128i coeff = __lsx_vreplgr2vr_w(y_coeff); \
+ __m128i v2r = __lsx_vreplgr2vr_w(v2r_coe); \
+ __m128i v2g = __lsx_vreplgr2vr_w(v2g_coe); \
+ __m128i u2g = __lsx_vreplgr2vr_w(u2g_coe); \
+    __m128i u2b = __lsx_vreplgr2vr_w(u2b_coe);
+
+#define YUVTORGB_LSX(y, u, v, R, G, B, offset, coeff, \
+ y_temp, v2r, v2g, u2g, u2b) \
+{ \
+ y = __lsx_vsub_w(y, offset); \
+ y = __lsx_vmul_w(y, coeff); \
+ y = __lsx_vadd_w(y, y_temp); \
+ R = __lsx_vmadd_w(y, v, v2r); \
+ v = __lsx_vmadd_w(y, v, v2g); \
+ G = __lsx_vmadd_w(v, u, u2g); \
+ B = __lsx_vmadd_w(y, u, u2b); \
+}
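+
+/* YUVTORGB_LSX is the vector form of the per-pixel math used by the scalar
+ * tail loops further down:
+ *
+ *     Y = (Y - y_offset) * y_coeff + (1 << 21);
+ *     R = Y + V * v2r_coe;
+ *     G = Y + V * v2g_coe + U * u2g_coe;
+ *     B = Y + U * u2b_coe;
+ *
+ * (the 'v' argument is reused as a temporary for the green term). */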
+
+#define WRITE_FULL_A_LSX(r, g, b, a, t1, s) \
+{ \
+ R = __lsx_vpickve2gr_w(r, t1); \
+ G = __lsx_vpickve2gr_w(g, t1); \
+ B = __lsx_vpickve2gr_w(b, t1); \
+ A = __lsx_vpickve2gr_w(a, t1); \
+ if (A & 0x100) \
+ A = av_clip_uint8(A); \
+ yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\
+ dest += step; \
+}
+
+#define WRITE_FULL_LSX(r, g, b, t1, s) \
+{ \
+ R = __lsx_vpickve2gr_w(r, t1); \
+ G = __lsx_vpickve2gr_w(g, t1); \
+ B = __lsx_vpickve2gr_w(b, t1); \
+ yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
+ dest += step; \
+}
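+
+/* Both WRITE_FULL_* macros extract lane t1 from the R/G/B (and A) vectors
+ * and pass the scalars to yuv2rgb_write_full() for pixel i + s, advancing
+ * 'dest' by the per-format step. */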
+
+static void
+yuv2rgb_full_X_template_lsx(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest,
+ int dstW, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ int i, j, B, G, R, A;
+ int step = (target == AV_PIX_FMT_RGB24 ||
+ target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ int err[4] = {0};
+ int a_temp = 1 << 18;
+ int templ = 1 << 9;
+ int tempc = templ - (128 << 19);
+ int ytemp = 1 << 21;
+ int len = dstW - 7;
+ __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
+ YUVTORGB_SETUP_LSX
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
+
+ for (i = 0; i < len; i += 8) {
+ __m128i l_src, u_src, v_src;
+ __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
+ __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+ int n = i << 1;
+
+ y_ev = y_od = __lsx_vreplgr2vr_w(templ);
+ u_ev = u_od = v_ev = v_od = __lsx_vreplgr2vr_w(tempc);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src = __lsx_vldx(lumSrc[j], n);
+ y_ev = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
+ y_od = __lsx_vmaddwod_w_h(y_od, l_src, temp);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n,
+ u_src, v_src);
+ DUP2_ARG3(__lsx_vmaddwev_w_h, u_ev, u_src, temp, v_ev,
+ v_src, temp, u_ev, v_ev);
+ DUP2_ARG3(__lsx_vmaddwod_w_h, u_od, u_src, temp, v_od,
+ v_src, temp, u_od, v_od);
+ }
+ y_ev = __lsx_vsrai_w(y_ev, 10);
+ y_od = __lsx_vsrai_w(y_od, 10);
+ u_ev = __lsx_vsrai_w(u_ev, 10);
+ u_od = __lsx_vsrai_w(u_od, 10);
+ v_ev = __lsx_vsrai_w(v_ev, 10);
+ v_od = __lsx_vsrai_w(v_od, 10);
+ YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m128i a_src, a_ev, a_od;
+
+ a_ev = a_od = __lsx_vreplgr2vr_w(a_temp);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h(lumFilter + j, 0);
+ a_src = __lsx_vldx(alpSrc[j], n);
+ a_ev = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
+ a_od = __lsx_vmaddwod_w_h(a_od, a_src, temp);
+ }
+ a_ev = __lsx_vsrai_w(a_ev, 19);
+ a_od = __lsx_vsrai_w(a_od, 19);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 0, 1);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 2);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 1, 3);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 4);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 2, 5);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 6);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 3, 7);
+ } else {
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 0, 1);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 2);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 1, 3);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 4);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 2, 5);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 6);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 3, 7);
+ }
+ }
+ if (dstW - i >= 4) {
+ __m128i l_src, u_src, v_src;
+ __m128i y_ev, u_ev, v_ev, uv, temp;
+ __m128i R_ev, G_ev, B_ev;
+ int n = i << 1;
+
+ y_ev = __lsx_vreplgr2vr_w(templ);
+ u_ev = v_ev = __lsx_vreplgr2vr_w(tempc);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h((lumFilter + j), 0);
+ l_src = __lsx_vldx(lumSrc[j], n);
+ l_src = __lsx_vilvl_h(l_src, l_src);
+ y_ev = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ temp = __lsx_vldrepl_h((chrFilter + j), 0);
+ DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
+ uv = __lsx_vilvl_h(v_src, u_src);
+ u_ev = __lsx_vmaddwev_w_h(u_ev, uv, temp);
+ v_ev = __lsx_vmaddwod_w_h(v_ev, uv, temp);
+ }
+ y_ev = __lsx_vsrai_w(y_ev, 10);
+ u_ev = __lsx_vsrai_w(u_ev, 10);
+ v_ev = __lsx_vsrai_w(v_ev, 10);
+ YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m128i a_src, a_ev;
+
+ a_ev = __lsx_vreplgr2vr_w(a_temp);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lsx_vldrepl_h(lumFilter + j, 0);
+ a_src = __lsx_vldx(alpSrc[j], n);
+ a_src = __lsx_vilvl_h(a_src, a_src);
+ a_ev = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
+ }
+ a_ev = __lsx_vsrai_w(a_ev, 19);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 1);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 2);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 3);
+ } else {
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 1);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 2);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 3);
+ }
+ i += 4;
+ }
+ for (; i < dstW; i++) {
+ int Y = templ;
+        int U = tempc, V = tempc;
+
+ A = 0;
+ for (j = 0; j < lumFilterSize; j++) {
+ Y += lumSrc[j][i] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][i] * chrFilter[j];
+ V += chrVSrc[j][i] * chrFilter[j];
+
+ }
+ Y >>= 10;
+ U >>= 10;
+ V >>= 10;
+ if (hasAlpha) {
+ A = 1 << 18;
+ for (j = 0; j < lumFilterSize; j++) {
+ A += alpSrc[j][i] * lumFilter[j];
+ }
+ A >>= 19;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_2_template_lsx(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+ *abuf0 = hasAlpha ? abuf[0] : NULL,
+ *abuf1 = hasAlpha ? abuf[1] : NULL;
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int uvtemp = 128 << 19;
+ int atemp = 1 << 18;
+ int err[4] = {0};
+ int ytemp = 1 << 21;
+ int len = dstW - 7;
+ int i, R, G, B, A;
+ int step = (target == AV_PIX_FMT_RGB24 ||
+ target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
+ __m128i v_yalpha1 = __lsx_vreplgr2vr_w(yalpha1);
+ __m128i v_uvalpha = __lsx_vreplgr2vr_w(uvalpha);
+ __m128i v_yalpha = __lsx_vreplgr2vr_w(yalpha);
+ __m128i uv = __lsx_vreplgr2vr_w(uvtemp);
+ __m128i a_bias = __lsx_vreplgr2vr_w(atemp);
+ __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
+ YUVTORGB_SETUP_LSX
+
+ av_assert2(yalpha <= 4096U);
+ av_assert2(uvalpha <= 4096U);
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
+
+ for (i = 0; i < len; i += 8) {
+ __m128i b0, b1, ub0, ub1, vb0, vb1;
+ __m128i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
+ __m128i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
+ __m128i y_l, y_h, v_l, v_h, u_l, u_h;
+ __m128i R_l, R_h, G_l, G_h, B_l, B_h;
+ int n = i << 1;
+
+ DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0,
+ n, ubuf1, n, b0, b1, ub0, ub1);
+ DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0 , vb1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
+ DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
+ u0_l, u1_l, v0_l, v1_l);
+ DUP2_ARG1(__lsx_vexth_w_h, b0, b1, y0_h, y1_h);
+ DUP4_ARG1(__lsx_vexth_w_h, ub0, ub1, vb0, vb1,
+ u0_h, u1_h, v0_h, v1_h);
+ y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+ y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
+ u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
+ u0_h = __lsx_vmul_w(u0_h, v_uvalpha1);
+ v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
+ v0_h = __lsx_vmul_w(v0_h, v_uvalpha1);
+ y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+ y_h = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
+ u_l = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
+ u_h = __lsx_vmadd_w(u0_h, v_uvalpha, u1_h);
+ v_l = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
+ v_h = __lsx_vmadd_w(v0_h, v_uvalpha, v1_h);
+ u_l = __lsx_vsub_w(u_l, uv);
+ u_h = __lsx_vsub_w(u_h, uv);
+ v_l = __lsx_vsub_w(v_l, uv);
+ v_h = __lsx_vsub_w(v_h, uv);
+ y_l = __lsx_vsrai_w(y_l, 10);
+ y_h = __lsx_vsrai_w(y_h, 10);
+ u_l = __lsx_vsrai_w(u_l, 10);
+ u_h = __lsx_vsrai_w(u_h, 10);
+ v_l = __lsx_vsrai_w(v_l, 10);
+ v_h = __lsx_vsrai_w(v_h, 10);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m128i a0, a1, a0_l, a0_h;
+ __m128i a_l, a_h, a1_l, a1_h;
+
+ DUP2_ARG2(__lsx_vldx, abuf0, n, abuf1, n, a0, a1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
+ DUP2_ARG1(__lsx_vexth_w_h, a0, a1, a0_h, a1_h);
+ a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
+ a_h = __lsx_vmadd_w(a_bias, a0_h, v_yalpha1);
+ a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
+ a_h = __lsx_vmadd_w(a_h, v_yalpha, a1_h);
+ a_l = __lsx_vsrai_w(a_l, 19);
+ a_h = __lsx_vsrai_w(a_h, 19);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 0, 4);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 1, 5);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 2, 6);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 3, 7);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 0, 4);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 1, 5);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 2, 6);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 3, 7);
+ }
+ }
+ if (dstW - i >= 4) {
+ __m128i b0, b1, ub0, ub1, vb0, vb1;
+ __m128i y0_l, y1_l, u0_l;
+ __m128i v0_l, u1_l, v1_l;
+ __m128i y_l, u_l, v_l;
+ __m128i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0, n,
+ ubuf1, n, b0, b1, ub0, ub1);
+ DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
+ DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
+ u0_l, u1_l, v0_l, v1_l);
+ y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
+ u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
+ v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
+ y_l = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
+ u_l = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
+ v_l = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
+ u_l = __lsx_vsub_w(u_l, uv);
+ v_l = __lsx_vsub_w(v_l, uv);
+ y_l = __lsx_vsrai_w(y_l, 10);
+ u_l = __lsx_vsrai_w(u_l, 10);
+ v_l = __lsx_vsrai_w(v_l, 10);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m128i a0, a1, a0_l;
+ __m128i a_l, a1_l;
+
+ DUP2_ARG2(__lsx_vldx, abuf0, n, abuf1, n, a0, a1);
+ DUP2_ARG2(__lsx_vsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
+ a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
+ a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
+ a_l = __lsx_vsrai_w(a_l, 19);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ }
+ i += 4;
+ }
+    for (; i < dstW; i++) {
+        int Y = (buf0[i]  * yalpha1  + buf1[i]  * yalpha) >> 10;
+        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
+        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;
+
+ A = 0;
+ if (hasAlpha){
+ A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_1_template_lsx(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i, B, G, R, A;
+ int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ int err[4] = {0};
+ int ytemp = 1 << 21;
+ int bias_int = 64;
+ int len = dstW - 7;
+ __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
+ YUVTORGB_SETUP_LSX
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
+ if (uvalpha < 2048) {
+ int uvtemp = 128 << 7;
+ __m128i uv = __lsx_vreplgr2vr_w(uvtemp);
+ __m128i bias = __lsx_vreplgr2vr_w(bias_int);
+
+ for (i = 0; i < len; i += 8) {
+ __m128i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
+ __m128i y_l, y_h, u_l, u_h, v_l, v_h;
+ __m128i R_l, R_h, G_l, G_h, B_l, B_h;
+ int n = i << 1;
+
+ DUP2_ARG2(__lsx_vldx, buf0, n, ubuf0, n, b, ub);
+ vb = __lsx_vldx(vbuf0, n);
+ y_l = __lsx_vsllwil_w_h(b, 2);
+ y_h = __lsx_vexth_w_h(b);
+ DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
+ DUP2_ARG1(__lsx_vexth_w_h, ub, vb, ub_h, vb_h);
+ y_h = __lsx_vslli_w(y_h, 2);
+ u_l = __lsx_vsub_w(ub_l, uv);
+ u_h = __lsx_vsub_w(ub_h, uv);
+ v_l = __lsx_vsub_w(vb_l, uv);
+ v_h = __lsx_vsub_w(vb_h, uv);
+ u_l = __lsx_vslli_w(u_l, 2);
+ u_h = __lsx_vslli_w(u_h, 2);
+ v_l = __lsx_vslli_w(v_l, 2);
+ v_h = __lsx_vslli_w(v_h, 2);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if(hasAlpha) {
+ __m128i a_src;
+ __m128i a_l, a_h;
+
+ a_src = __lsx_vld(abuf0 + i, 0);
+ a_l = __lsx_vsllwil_w_h(a_src, 0);
+ a_h = __lsx_vexth_w_h(a_src);
+ a_l = __lsx_vadd_w(a_l, bias);
+ a_h = __lsx_vadd_w(a_h, bias);
+ a_l = __lsx_vsrai_w(a_l, 7);
+ a_h = __lsx_vsrai_w(a_h, 7);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 0, 4);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 1, 5);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 2, 6);
+ WRITE_FULL_A_LSX(R_h, G_h, B_h, a_h, 3, 7);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 0, 4);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 1, 5);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 2, 6);
+ WRITE_FULL_LSX(R_h, G_h, B_h, 3, 7);
+ }
+ }
+ if (dstW - i >= 4) {
+ __m128i b, ub, vb, ub_l, vb_l;
+ __m128i y_l, u_l, v_l;
+ __m128i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP2_ARG2(__lsx_vldx, buf0, n, ubuf0, n, b, ub);
+ vb = __lsx_vldx(vbuf0, n);
+ y_l = __lsx_vsllwil_w_h(b, 0);
+ DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
+ y_l = __lsx_vslli_w(y_l, 2);
+ u_l = __lsx_vsub_w(ub_l, uv);
+ v_l = __lsx_vsub_w(vb_l, uv);
+ u_l = __lsx_vslli_w(u_l, 2);
+ v_l = __lsx_vslli_w(v_l, 2);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if(hasAlpha) {
+ __m128i a_src, a_l;
+
+ a_src = __lsx_vldx(abuf0, n);
+ a_src = __lsx_vsllwil_w_h(a_src, 0);
+ a_l = __lsx_vadd_w(bias, a_src);
+ a_l = __lsx_vsrai_w(a_l, 7);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ }
+ i += 4;
+ }
+ for (; i < dstW; i++) {
+ int Y = buf0[i] << 2;
+ int U = (ubuf0[i] - uvtemp) << 2;
+ int V = (vbuf0[i] - uvtemp) << 2;
+
+ A = 0;
+ if(hasAlpha) {
+ A = (abuf0[i] + 64) >> 7;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ int uvtemp = 128 << 8;
+ __m128i uv = __lsx_vreplgr2vr_w(uvtemp);
+ __m128i zero = __lsx_vldi(0);
+ __m128i bias = __lsx_vreplgr2vr_h(bias_int);
+
+ for (i = 0; i < len; i += 8) {
+ __m128i b, ub0, ub1, vb0, vb1;
+ __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od;
+ __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+ int n = i << 1;
+
+ DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
+ ubuf1, n, b, ub0, vb0, ub1);
+            vb1 = __lsx_vldx(vbuf1, n);
+ y_ev = __lsx_vaddwev_w_h(b, zero);
+ y_od = __lsx_vaddwod_w_h(b, zero);
+            DUP2_ARG2(__lsx_vaddwev_w_h, ub0, ub1, vb0, vb1, u_ev, v_ev);
+            DUP2_ARG2(__lsx_vaddwod_w_h, ub0, ub1, vb0, vb1, u_od, v_od);
+ DUP2_ARG2(__lsx_vslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
+ DUP4_ARG2(__lsx_vsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
+ u_ev, u_od, v_ev, v_od);
+ DUP4_ARG2(__lsx_vslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
+ u_ev, u_od, v_ev, v_od);
+ YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if(hasAlpha) {
+ __m128i a_src;
+ __m128i a_ev, a_od;
+
+ a_src = __lsx_vld(abuf0 + i, 0);
+ a_ev = __lsx_vaddwev_w_h(bias, a_src);
+ a_od = __lsx_vaddwod_w_h(bias, a_src);
+ a_ev = __lsx_vsrai_w(a_ev, 7);
+ a_od = __lsx_vsrai_w(a_od, 7);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 0, 1);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 1, 2);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 1, 3);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 2, 4);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 2, 5);
+ WRITE_FULL_A_LSX(R_ev, G_ev, B_ev, a_ev, 3, 6);
+ WRITE_FULL_A_LSX(R_od, G_od, B_od, a_od, 3, 7);
+ } else {
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 0, 1);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 1, 2);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 1, 3);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 2, 4);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 2, 5);
+ WRITE_FULL_LSX(R_ev, G_ev, B_ev, 3, 6);
+ WRITE_FULL_LSX(R_od, G_od, B_od, 3, 7);
+ }
+ }
+ if (dstW - i >= 4) {
+ __m128i b, ub0, ub1, vb0, vb1;
+ __m128i y_l, u_l, v_l;
+ __m128i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
+ ubuf1, n, b, ub0, vb0, ub1);
+ vb1 = __lsx_vldx(vbuf1, n);
+ y_l = __lsx_vsllwil_w_h(b, 0);
+ y_l = __lsx_vslli_w(y_l, 2);
+ DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, vb0, 0, ub1, 0, vb1, 0,
+ ub0, vb0, ub1, vb1);
+ DUP2_ARG2(__lsx_vadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
+ u_l = __lsx_vsub_w(u_l, uv);
+ v_l = __lsx_vsub_w(v_l, uv);
+ u_l = __lsx_vslli_w(u_l, 1);
+ v_l = __lsx_vslli_w(v_l, 1);
+ YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if(hasAlpha) {
+ __m128i a_src;
+ __m128i a_l;
+
+ a_src = __lsx_vld(abuf0 + i, 0);
+ a_src = __lsx_vilvl_h(a_src, a_src);
+                a_l = __lsx_vaddwev_w_h(bias, a_src);
+ a_l = __lsx_vsrai_w(a_l, 7);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A_LSX(R_l, G_l, B_l, a_l, 3, 3);
+ } else {
+ WRITE_FULL_LSX(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL_LSX(R_l, G_l, B_l, 3, 3);
+ }
+ i += 4;
+ }
+ for (; i < dstW; i++) {
+ int Y = buf0[i] << 2;
+ int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
+ int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
+
+ A = 0;
+ if(hasAlpha) {
+ A = (abuf0[i] + 64) >> 7;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+#if CONFIG_SMALL
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+#else
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
+
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
+
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c)
+{
+ if(c->flags & SWS_FULL_CHR_H_INT) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGBA:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2rgba32_full_X_lsx;
+ c->yuv2packed2 = yuv2rgba32_full_2_lsx;
+ c->yuv2packed1 = yuv2rgba32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2rgba32_full_X_lsx;
+ c->yuv2packed2 = yuv2rgba32_full_2_lsx;
+ c->yuv2packed1 = yuv2rgba32_full_1_lsx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2rgbx32_full_X_lsx;
+ c->yuv2packed2 = yuv2rgbx32_full_2_lsx;
+ c->yuv2packed1 = yuv2rgbx32_full_1_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_ARGB:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2argb32_full_X_lsx;
+ c->yuv2packed2 = yuv2argb32_full_2_lsx;
+ c->yuv2packed1 = yuv2argb32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2argb32_full_X_lsx;
+ c->yuv2packed2 = yuv2argb32_full_2_lsx;
+ c->yuv2packed1 = yuv2argb32_full_1_lsx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2xrgb32_full_X_lsx;
+ c->yuv2packed2 = yuv2xrgb32_full_2_lsx;
+ c->yuv2packed1 = yuv2xrgb32_full_1_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_BGRA:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2bgra32_full_X_lsx;
+ c->yuv2packed2 = yuv2bgra32_full_2_lsx;
+ c->yuv2packed1 = yuv2bgra32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2bgra32_full_X_lsx;
+ c->yuv2packed2 = yuv2bgra32_full_2_lsx;
+ c->yuv2packed1 = yuv2bgra32_full_1_lsx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2bgrx32_full_X_lsx;
+ c->yuv2packed2 = yuv2bgrx32_full_2_lsx;
+ c->yuv2packed1 = yuv2bgrx32_full_1_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_ABGR:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2abgr32_full_X_lsx;
+ c->yuv2packed2 = yuv2abgr32_full_2_lsx;
+ c->yuv2packed1 = yuv2abgr32_full_1_lsx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2abgr32_full_X_lsx;
+ c->yuv2packed2 = yuv2abgr32_full_2_lsx;
+ c->yuv2packed1 = yuv2abgr32_full_1_lsx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2xbgr32_full_X_lsx;
+ c->yuv2packed2 = yuv2xbgr32_full_2_lsx;
+ c->yuv2packed1 = yuv2xbgr32_full_1_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->yuv2packedX = yuv2rgb24_full_X_lsx;
+ c->yuv2packed2 = yuv2rgb24_full_2_lsx;
+ c->yuv2packed1 = yuv2rgb24_full_1_lsx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->yuv2packedX = yuv2bgr24_full_X_lsx;
+ c->yuv2packed2 = yuv2bgr24_full_2_lsx;
+ c->yuv2packed1 = yuv2bgr24_full_1_lsx;
+ break;
+ case AV_PIX_FMT_BGR4_BYTE:
+ c->yuv2packedX = yuv2bgr4_byte_full_X_lsx;
+ c->yuv2packed2 = yuv2bgr4_byte_full_2_lsx;
+ c->yuv2packed1 = yuv2bgr4_byte_full_1_lsx;
+ break;
+ case AV_PIX_FMT_RGB4_BYTE:
+ c->yuv2packedX = yuv2rgb4_byte_full_X_lsx;
+ c->yuv2packed2 = yuv2rgb4_byte_full_2_lsx;
+ c->yuv2packed1 = yuv2rgb4_byte_full_1_lsx;
+ break;
+ case AV_PIX_FMT_BGR8:
+ c->yuv2packedX = yuv2bgr8_full_X_lsx;
+ c->yuv2packed2 = yuv2bgr8_full_2_lsx;
+ c->yuv2packed1 = yuv2bgr8_full_1_lsx;
+ break;
+ case AV_PIX_FMT_RGB8:
+ c->yuv2packedX = yuv2rgb8_full_X_lsx;
+ c->yuv2packed2 = yuv2rgb8_full_2_lsx;
+ c->yuv2packed1 = yuv2rgb8_full_1_lsx;
+ break;
+ }
+ } else {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB32:
+ case AV_PIX_FMT_BGR32:
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packed1 = yuv2rgbx32_1_lsx;
+ c->yuv2packed2 = yuv2rgbx32_2_lsx;
+ c->yuv2packedX = yuv2rgbx32_X_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ case AV_PIX_FMT_BGR32_1:
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packed1 = yuv2rgbx32_1_1_lsx;
+ c->yuv2packed2 = yuv2rgbx32_1_2_lsx;
+ c->yuv2packedX = yuv2rgbx32_1_X_lsx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->yuv2packed1 = yuv2rgb24_1_lsx;
+ c->yuv2packed2 = yuv2rgb24_2_lsx;
+ c->yuv2packedX = yuv2rgb24_X_lsx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->yuv2packed1 = yuv2bgr24_1_lsx;
+ c->yuv2packed2 = yuv2bgr24_2_lsx;
+ c->yuv2packedX = yuv2bgr24_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ case AV_PIX_FMT_RGB565BE:
+ case AV_PIX_FMT_BGR565LE:
+ case AV_PIX_FMT_BGR565BE:
+ c->yuv2packed1 = yuv2rgb16_1_lsx;
+ c->yuv2packed2 = yuv2rgb16_2_lsx;
+ c->yuv2packedX = yuv2rgb16_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ case AV_PIX_FMT_RGB555BE:
+ case AV_PIX_FMT_BGR555LE:
+ case AV_PIX_FMT_BGR555BE:
+ c->yuv2packed1 = yuv2rgb15_1_lsx;
+ c->yuv2packed2 = yuv2rgb15_2_lsx;
+ c->yuv2packedX = yuv2rgb15_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ case AV_PIX_FMT_RGB444BE:
+ case AV_PIX_FMT_BGR444LE:
+ case AV_PIX_FMT_BGR444BE:
+ c->yuv2packed1 = yuv2rgb12_1_lsx;
+ c->yuv2packed2 = yuv2rgb12_2_lsx;
+ c->yuv2packedX = yuv2rgb12_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ c->yuv2packed1 = yuv2rgb8_1_lsx;
+ c->yuv2packed2 = yuv2rgb8_2_lsx;
+ c->yuv2packedX = yuv2rgb8_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB4:
+ case AV_PIX_FMT_BGR4:
+ c->yuv2packed1 = yuv2rgb4_1_lsx;
+ c->yuv2packed2 = yuv2rgb4_2_lsx;
+ c->yuv2packedX = yuv2rgb4_X_lsx;
+ break;
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR4_BYTE:
+ c->yuv2packed1 = yuv2rgb4b_1_lsx;
+ c->yuv2packed2 = yuv2rgb4b_2_lsx;
+ c->yuv2packedX = yuv2rgb4b_X_lsx;
+ break;
+ }
+ }
+}
diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
new file mode 100644
index 0000000000..aa4c5cbe28
--- /dev/null
+++ b/libswscale/loongarch/swscale.S
@@ -0,0 +1,1868 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/loongarch/loongson_asm.S"
+
+/* void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
+ * const uint8_t *src, const int16_t *filter,
+ * const int32_t *filterPos, int filterSize)
+ */
+function ff_hscale_8_to_15_lsx
+ addi.d sp, sp, -72
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+ li.w t0, 32767
+ li.w t8, 8
+ li.w t7, 4
+ vldi vr0, 0
+ vreplgr2vr.w vr20, t0
+ beq a6, t7, .LOOP_DSTW4
+ beq a6, t8, .LOOP_DSTW8
+ blt t8, a6, .LOOP_START
+ b .END_DSTW4
+
+.LOOP_START:
+ li.w t1, 0
+ li.w s1, 0
+ li.w s2, 0
+ li.w s3, 0
+ li.w s4, 0
+ li.w s5, 0
+ vldi vr22, 0
+ addi.w s0, a6, -7
+ slli.w s7, a6, 1
+ slli.w s8, a6, 2
+ add.w t6, s7, s8
+.LOOP_DSTW:
+ ld.w t2, a5, 0
+ ld.w t3, a5, 4
+ ld.w t4, a5, 8
+ ld.w t5, a5, 12
+ fldx.d f1, a3, t2
+ fldx.d f2, a3, t3
+ fldx.d f3, a3, t4
+ fldx.d f4, a3, t5
+ vld vr9, a4, 0
+ vldx vr10, a4, s7
+ vldx vr11, a4, s8
+ vldx vr12, a4, t6
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr2, vr0, vr2
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr4, vr0, vr4
+ vdp2.w.h vr17, vr1, vr9
+ vdp2.w.h vr18, vr2, vr10
+ vdp2.w.h vr19, vr3, vr11
+ vdp2.w.h vr21, vr4, vr12
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.d vr1, vr3, vr1
+ vadd.w vr22, vr22, vr1
+ addi.w s1, s1, 8
+ addi.d a3, a3, 8
+ addi.d a4, a4, 16
+ blt s1, s0, .LOOP_DSTW
+ blt s1, a6, .DSTWA
+ b .END_FILTER
+.DSTWA:
+ ld.w t2, a5, 0
+ li.w t3, 0
+ move s6, s1
+.FILTERSIZEA:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s2, s2, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERSIZEA
+
+ ld.w t2, a5, 4
+ li.w t3, 0
+ move s6, s1
+ addi.w t1, t1, 1
+.FILTERSIZEB:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s3, s3, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERSIZEB
+ ld.w t2, a5, 8
+ addi.w t1, t1, 1
+ li.w t3, 0
+ move s6, s1
+.FILTERSIZEC:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s4, s4, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERSIZEC
+ ld.w t2, a5, 12
+ addi.w t1, t1, 1
+ move s6, s1
+ li.w t3, 0
+.FILTERSIZED:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s5, s5, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERSIZED
+.END_FILTER:
+ vpickve2gr.w t1, vr22, 0
+ vpickve2gr.w t2, vr22, 1
+ vpickve2gr.w t3, vr22, 2
+ vpickve2gr.w t4, vr22, 3
+ add.w s2, s2, t1
+ add.w s3, s3, t2
+ add.w s4, s4, t3
+ add.w s5, s5, t4
+ srai.w s2, s2, 7
+ srai.w s3, s3, 7
+ srai.w s4, s4, 7
+ srai.w s5, s5, 7
+ slt t1, s2, t0
+ slt t2, s3, t0
+ slt t3, s4, t0
+ slt t4, s5, t0
+ maskeqz s2, s2, t1
+ maskeqz s3, s3, t2
+ maskeqz s4, s4, t3
+ maskeqz s5, s5, t4
+ masknez t1, t0, t1
+ masknez t2, t0, t2
+ masknez t3, t0, t3
+ masknez t4, t0, t4
+ or s2, s2, t1
+ or s3, s3, t2
+ or s4, s4, t3
+ or s5, s5, t4
+ st.h s2, a1, 0
+ st.h s3, a1, 2
+ st.h s4, a1, 4
+ st.h s5, a1, 6
+
+ addi.d a1, a1, 8
+ sub.d a3, a3, s1
+ addi.d a5, a5, 16
+ slli.d t3, a6, 3
+ add.d a4, a4, t3
+ sub.d a4, a4, s1
+ sub.d a4, a4, s1
+ addi.d a2, a2, -4
+ bge a2, t7, .LOOP_START
+ blt zero, a2, .RES
+ b .END_LOOP
+.RES:
+ li.w t1, 0
+.DSTW:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTERSIZE:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTERSIZE
+ srai.w t8, t8, 7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DSTW
+ b .END_LOOP
+
+.LOOP_DSTW8:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ fldx.d f1, a3, t1
+ fldx.d f2, a3, t2
+ fldx.d f3, a3, t3
+ fldx.d f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ fldx.d f5, a3, t1
+ fldx.d f6, a3, t2
+ fldx.d f7, a3, t3
+ fldx.d f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vld vr13, a4, 64
+ vld vr14, a4, 80
+ vld vr15, a4, 96
+ vld vr16, a4, 112
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr2, vr0, vr2
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr4, vr0, vr4
+ vilvl.b vr5, vr0, vr5
+ vilvl.b vr6, vr0, vr6
+ vilvl.b vr7, vr0, vr7
+ vilvl.b vr8, vr0, vr8
+
+ vdp2.w.h vr17, vr1, vr9
+ vdp2.w.h vr18, vr2, vr10
+ vdp2.w.h vr19, vr3, vr11
+ vdp2.w.h vr21, vr4, vr12
+ vdp2.w.h vr1, vr5, vr13
+ vdp2.w.h vr2, vr6, vr14
+ vdp2.w.h vr3, vr7, vr15
+ vdp2.w.h vr4, vr8, vr16
+ vhaddw.d.w vr5, vr1, vr1
+ vhaddw.d.w vr6, vr2, vr2
+ vhaddw.d.w vr7, vr3, vr3
+ vhaddw.d.w vr8, vr4, vr4
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vhaddw.q.d vr5, vr5, vr5
+ vhaddw.q.d vr6, vr6, vr6
+ vhaddw.q.d vr7, vr7, vr7
+ vhaddw.q.d vr8, vr8, vr8
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.d vr1, vr3, vr1
+ vilvl.d vr5, vr7, vr5
+ vsrai.w vr1, vr1, 7
+ vsrai.w vr5, vr5, 7
+ vmin.w vr1, vr1, vr20
+ vmin.w vr5, vr5, vr20
+
+ vpickev.h vr1, vr5, vr1
+ vst vr1, a1, 0
+ addi.d a1, a1, 16
+ addi.d a5, a5, 32
+ addi.d a4, a4, 128
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_DSTW8
+ blt zero, a2, .RES8
+ b .END_LOOP
+.RES8:
+ li.w t1, 0
+.DSTW8:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTERSIZE8:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTERSIZE8
+ srai.w t8, t8, 7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DSTW8
+ b .END_LOOP
+
+.LOOP_DSTW4:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ fldx.s f1, a3, t1
+ fldx.s f2, a3, t2
+ fldx.s f3, a3, t3
+ fldx.s f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ fldx.s f5, a3, t1
+ fldx.s f6, a3, t2
+ fldx.s f7, a3, t3
+ fldx.s f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr5, vr0, vr5
+ vilvl.b vr7, vr0, vr7
+
+ vdp2.w.h vr13, vr1, vr9
+ vdp2.w.h vr14, vr3, vr10
+ vdp2.w.h vr15, vr5, vr11
+ vdp2.w.h vr16, vr7, vr12
+ vhaddw.d.w vr13, vr13, vr13
+ vhaddw.d.w vr14, vr14, vr14
+ vhaddw.d.w vr15, vr15, vr15
+ vhaddw.d.w vr16, vr16, vr16
+ vpickev.w vr13, vr14, vr13
+ vpickev.w vr15, vr16, vr15
+ vsrai.w vr13, vr13, 7
+ vsrai.w vr15, vr15, 7
+ vmin.w vr13, vr13, vr20
+ vmin.w vr15, vr15, vr20
+
+ vpickev.h vr13, vr15, vr13
+ vst vr13, a1, 0
+ addi.d a1, a1, 16
+ addi.d a5, a5, 32
+ addi.d a4, a4, 64
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_DSTW4
+ blt zero, a2, .RES4
+ b .END_LOOP
+.RES4:
+ li.w t1, 0
+.DSTW4:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTERSIZE4:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTERSIZE4
+ srai.w t8, t8, 7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DSTW4
+ b .END_LOOP
+.END_DSTW4:
+
+ li.w t1, 0
+.LOOP_DSTW1:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTERSIZE1:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTERSIZE1
+ srai.w t8, t8, 7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .LOOP_DSTW1
+ b .END_LOOP
+.END_LOOP:
+
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ addi.d sp, sp, 72
+endfunc
+
+/* void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *dst, int dstW,
+ * const uint8_t *src, const int16_t *filter,
+ * const int32_t *filterPos, int filterSize)
+ */
+function ff_hscale_8_to_19_lsx
+ addi.d sp, sp, -72
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+ li.w t0, 524287
+ li.w t8, 8
+ li.w t7, 4
+ vldi vr0, 0
+ vreplgr2vr.w vr20, t0
+ beq a6, t7, .LOOP_DST4
+ beq a6, t8, .LOOP_DST8
+ blt t8, a6, .LOOP
+ b .END_DST4
+
+.LOOP:
+ li.w t1, 0
+ li.w s1, 0
+ li.w s2, 0
+ li.w s3, 0
+ li.w s4, 0
+ li.w s5, 0
+ vldi vr22, 0
+ addi.w s0, a6, -7
+ slli.w s7, a6, 1
+ slli.w s8, a6, 2
+ add.w t6, s7, s8
+.LOOP_DST:
+ ld.w t2, a5, 0
+ ld.w t3, a5, 4
+ ld.w t4, a5, 8
+ ld.w t5, a5, 12
+ fldx.d f1, a3, t2
+ fldx.d f2, a3, t3
+ fldx.d f3, a3, t4
+ fldx.d f4, a3, t5
+ vld vr9, a4, 0
+ vldx vr10, a4, s7
+ vldx vr11, a4, s8
+ vldx vr12, a4, t6
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr2, vr0, vr2
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr4, vr0, vr4
+ vdp2.w.h vr17, vr1, vr9
+ vdp2.w.h vr18, vr2, vr10
+ vdp2.w.h vr19, vr3, vr11
+ vdp2.w.h vr21, vr4, vr12
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.d vr1, vr3, vr1
+ vadd.w vr22, vr22, vr1
+ addi.w s1, s1, 8
+ addi.d a3, a3, 8
+ addi.d a4, a4, 16
+ blt s1, s0, .LOOP_DST
+ blt s1, a6, .DSTA
+ b .END_FILTERA
+.DSTA:
+ ld.w t2, a5, 0
+ li.w t3, 0
+ move s6, s1
+.FILTERA:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s2, s2, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERA
+
+ ld.w t2, a5, 4
+ li.w t3, 0
+ move s6, s1
+ addi.w t1, t1, 1
+.FILTERB:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s3, s3, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERB
+ ld.w t2, a5, 8
+ addi.w t1, t1, 1
+ li.w t3, 0
+ move s6, s1
+.FILTERC:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s4, s4, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERC
+ ld.w t2, a5, 12
+ addi.w t1, t1, 1
+ move s6, s1
+ li.w t3, 0
+.FILTERD:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s5, s5, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .FILTERD
+.END_FILTERA:
+ vpickve2gr.w t1, vr22, 0
+ vpickve2gr.w t2, vr22, 1
+ vpickve2gr.w t3, vr22, 2
+ vpickve2gr.w t4, vr22, 3
+ add.w s2, s2, t1
+ add.w s3, s3, t2
+ add.w s4, s4, t3
+ add.w s5, s5, t4
+ srai.w s2, s2, 3
+ srai.w s3, s3, 3
+ srai.w s4, s4, 3
+ srai.w s5, s5, 3
+ slt t1, s2, t0
+ slt t2, s3, t0
+ slt t3, s4, t0
+ slt t4, s5, t0
+ maskeqz s2, s2, t1
+ maskeqz s3, s3, t2
+ maskeqz s4, s4, t3
+ maskeqz s5, s5, t4
+ masknez t1, t0, t1
+ masknez t2, t0, t2
+ masknez t3, t0, t3
+ masknez t4, t0, t4
+ or s2, s2, t1
+ or s3, s3, t2
+ or s4, s4, t3
+ or s5, s5, t4
+ st.w s2, a1, 0
+ st.w s3, a1, 4
+ st.w s4, a1, 8
+ st.w s5, a1, 12
+
+ addi.d a1, a1, 16
+ sub.d a3, a3, s1
+ addi.d a5, a5, 16
+ slli.d t3, a6, 3
+ add.d a4, a4, t3
+ sub.d a4, a4, s1
+ sub.d a4, a4, s1
+ addi.d a2, a2, -4
+ bge a2, t7, .LOOP
+ blt zero, a2, .RESA
+ b .END
+.RESA:
+ li.w t1, 0
+.DST:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTER:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTER
+ srai.w t8, t8, 3
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DST
+ b .END
+
+.LOOP_DST8:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ fldx.d f1, a3, t1
+ fldx.d f2, a3, t2
+ fldx.d f3, a3, t3
+ fldx.d f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ fldx.d f5, a3, t1
+ fldx.d f6, a3, t2
+ fldx.d f7, a3, t3
+ fldx.d f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vld vr13, a4, 64
+ vld vr14, a4, 80
+ vld vr15, a4, 96
+ vld vr16, a4, 112
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr2, vr0, vr2
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr4, vr0, vr4
+ vilvl.b vr5, vr0, vr5
+ vilvl.b vr6, vr0, vr6
+ vilvl.b vr7, vr0, vr7
+ vilvl.b vr8, vr0, vr8
+
+ vdp2.w.h vr17, vr1, vr9
+ vdp2.w.h vr18, vr2, vr10
+ vdp2.w.h vr19, vr3, vr11
+ vdp2.w.h vr21, vr4, vr12
+ vdp2.w.h vr1, vr5, vr13
+ vdp2.w.h vr2, vr6, vr14
+ vdp2.w.h vr3, vr7, vr15
+ vdp2.w.h vr4, vr8, vr16
+ vhaddw.d.w vr5, vr1, vr1
+ vhaddw.d.w vr6, vr2, vr2
+ vhaddw.d.w vr7, vr3, vr3
+ vhaddw.d.w vr8, vr4, vr4
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vhaddw.q.d vr5, vr5, vr5
+ vhaddw.q.d vr6, vr6, vr6
+ vhaddw.q.d vr7, vr7, vr7
+ vhaddw.q.d vr8, vr8, vr8
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.d vr1, vr3, vr1
+ vilvl.d vr5, vr7, vr5
+ vsrai.w vr1, vr1, 3
+ vsrai.w vr5, vr5, 3
+ vmin.w vr1, vr1, vr20
+ vmin.w vr5, vr5, vr20
+
+ vst vr1, a1, 0
+ vst vr5, a1, 16
+ addi.d a1, a1, 32
+ addi.d a5, a5, 32
+ addi.d a4, a4, 128
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_DST8
+ blt zero, a2, .REST8
+ b .END
+.REST8:
+ li.w t1, 0
+.DST8:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTER8:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTER8
+ srai.w t8, t8, 3
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DST8
+ b .END
+
+.LOOP_DST4:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ fldx.s f1, a3, t1
+ fldx.s f2, a3, t2
+ fldx.s f3, a3, t3
+ fldx.s f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ fldx.s f5, a3, t1
+ fldx.s f6, a3, t2
+ fldx.s f7, a3, t3
+ fldx.s f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.b vr1, vr0, vr1
+ vilvl.b vr3, vr0, vr3
+ vilvl.b vr5, vr0, vr5
+ vilvl.b vr7, vr0, vr7
+
+ vdp2.w.h vr13, vr1, vr9
+ vdp2.w.h vr14, vr3, vr10
+ vdp2.w.h vr15, vr5, vr11
+ vdp2.w.h vr16, vr7, vr12
+ vhaddw.d.w vr13, vr13, vr13
+ vhaddw.d.w vr14, vr14, vr14
+ vhaddw.d.w vr15, vr15, vr15
+ vhaddw.d.w vr16, vr16, vr16
+ vpickev.w vr13, vr14, vr13
+ vpickev.w vr15, vr16, vr15
+ vsrai.w vr13, vr13, 3
+ vsrai.w vr15, vr15, 3
+ vmin.w vr13, vr13, vr20
+ vmin.w vr15, vr15, vr20
+
+ vst vr13, a1, 0
+ vst vr15, a1, 16
+ addi.d a1, a1, 32
+ addi.d a5, a5, 32
+ addi.d a4, a4, 64
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_DST4
+ blt zero, a2, .REST4
+ b .END
+.REST4:
+ li.w t1, 0
+.DST4:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTER4:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTER4
+ srai.w t8, t8, 3
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .DST4
+ b .END
+.END_DST4:
+
+ li.w t1, 0
+.LOOP_DST1:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.FILTER1:
+ add.w t4, t2, t3
+ ldx.bu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .FILTER1
+ srai.w t8, t8, 3
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .LOOP_DST1
+ b .END
+.END:
+
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ addi.d sp, sp, 72
+endfunc
+
+/* void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
+ * const uint8_t *src, const int16_t *filter,
+ * const int32_t *filterPos, int filterSize, int sh)
+ */
+function ff_hscale_16_to_15_sub_lsx
+ addi.d sp, sp, -72
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+ li.w t0, 32767
+ li.w t8, 8
+ li.w t7, 4
+ vreplgr2vr.w vr20, t0
+ vreplgr2vr.w vr0, a7
+ beq a6, t7, .LOOP_HS15_DST4
+ beq a6, t8, .LOOP_HS15_DST8
+ blt t8, a6, .LOOP_HS15
+ b .END_HS15_DST4
+
+.LOOP_HS15:
+ li.w t1, 0
+ li.w s1, 0
+ li.w s2, 0
+ li.w s3, 0
+ li.w s4, 0
+ li.w s5, 0
+ vldi vr22, 0
+ addi.w s0, a6, -7
+ slli.w s7, a6, 1
+ slli.w s8, a6, 2
+ add.w t6, s7, s8
+.LOOP_HS15_DST:
+ ld.w t2, a5, 0
+ ld.w t3, a5, 4
+ ld.w t4, a5, 8
+ ld.w t5, a5, 12
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ slli.w t5, t5, 1
+ vldx vr1, a3, t2
+ vldx vr2, a3, t3
+ vldx vr3, a3, t4
+ vldx vr4, a3, t5
+ vld vr9, a4, 0
+ vldx vr10, a4, s7
+ vldx vr11, a4, s8
+ vldx vr12, a4, t6
+ vmulwev.w.hu.h vr17, vr1, vr9
+ vmulwev.w.hu.h vr18, vr2, vr10
+ vmulwev.w.hu.h vr19, vr3, vr11
+ vmulwev.w.hu.h vr21, vr4, vr12
+ vmaddwod.w.hu.h vr17, vr1, vr9
+ vmaddwod.w.hu.h vr18, vr2, vr10
+ vmaddwod.w.hu.h vr19, vr3, vr11
+ vmaddwod.w.hu.h vr21, vr4, vr12
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.d vr1, vr3, vr1
+ vadd.w vr22, vr22, vr1
+ addi.w s1, s1, 8
+ addi.d a3, a3, 16
+ addi.d a4, a4, 16
+ blt s1, s0, .LOOP_HS15_DST
+ blt s1, a6, .HS15_DSTA
+ b .END_HS15_FILTERA
+.HS15_DSTA:
+ ld.w t2, a5, 0
+ li.w t3, 0
+ move s6, s1
+.HS15_FILTERA:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s2, s2, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS15_FILTERA
+
+ ld.w t2, a5, 4
+ li.w t3, 0
+ move s6, s1
+ addi.w t1, t1, 1
+.HS15_FILTERB:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s3, s3, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS15_FILTERB
+ ld.w t2, a5, 8
+ addi.w t1, t1, 1
+ li.w t3, 0
+ move s6, s1
+.HS15_FILTERC:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s4, s4, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS15_FILTERC
+ ld.w t2, a5, 12
+ addi.w t1, t1, 1
+ move s6, s1
+ li.w t3, 0
+.HS15_FILTERD:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s5, s5, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS15_FILTERD
+.END_HS15_FILTERA:
+ vpickve2gr.w t1, vr22, 0
+ vpickve2gr.w t2, vr22, 1
+ vpickve2gr.w t3, vr22, 2
+ vpickve2gr.w t4, vr22, 3
+ add.w s2, s2, t1
+ add.w s3, s3, t2
+ add.w s4, s4, t3
+ add.w s5, s5, t4
+ sra.w s2, s2, a7
+ sra.w s3, s3, a7
+ sra.w s4, s4, a7
+ sra.w s5, s5, a7
+ slt t1, s2, t0
+ slt t2, s3, t0
+ slt t3, s4, t0
+ slt t4, s5, t0
+ maskeqz s2, s2, t1
+ maskeqz s3, s3, t2
+ maskeqz s4, s4, t3
+ maskeqz s5, s5, t4
+ masknez t1, t0, t1
+ masknez t2, t0, t2
+ masknez t3, t0, t3
+ masknez t4, t0, t4
+ or s2, s2, t1
+ or s3, s3, t2
+ or s4, s4, t3
+ or s5, s5, t4
+ st.h s2, a1, 0
+ st.h s3, a1, 2
+ st.h s4, a1, 4
+ st.h s5, a1, 6
+
+ addi.d a1, a1, 8
+ sub.d a3, a3, s1
+ sub.d a3, a3, s1
+ addi.d a5, a5, 16
+ slli.d t3, a6, 3
+ add.d a4, a4, t3
+ sub.d a4, a4, s1
+ sub.d a4, a4, s1
+ addi.d a2, a2, -4
+ bge a2, t7, .LOOP_HS15
+ blt zero, a2, .HS15_RESA
+ b .HS15_END
+.HS15_RESA:
+ li.w t1, 0
+.HS15_DST:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS15_FILTER:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS15_FILTER
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS15_DST
+ b .HS15_END
+
+.LOOP_HS15_DST8:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ vldx vr1, a3, t1
+ vldx vr2, a3, t2
+ vldx vr3, a3, t3
+ vldx vr4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ vldx vr5, a3, t1
+ vldx vr6, a3, t2
+ vldx vr7, a3, t3
+ vldx vr8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vld vr13, a4, 64
+ vld vr14, a4, 80
+ vld vr15, a4, 96
+ vld vr16, a4, 112
+
+ vmulwev.w.hu.h vr17, vr1, vr9
+ vmulwev.w.hu.h vr18, vr2, vr10
+ vmulwev.w.hu.h vr19, vr3, vr11
+ vmulwev.w.hu.h vr21, vr4, vr12
+ vmaddwod.w.hu.h vr17, vr1, vr9
+ vmaddwod.w.hu.h vr18, vr2, vr10
+ vmaddwod.w.hu.h vr19, vr3, vr11
+ vmaddwod.w.hu.h vr21, vr4, vr12
+ vmulwev.w.hu.h vr1, vr5, vr13
+ vmulwev.w.hu.h vr2, vr6, vr14
+ vmulwev.w.hu.h vr3, vr7, vr15
+ vmulwev.w.hu.h vr4, vr8, vr16
+ vmaddwod.w.hu.h vr1, vr5, vr13
+ vmaddwod.w.hu.h vr2, vr6, vr14
+ vmaddwod.w.hu.h vr3, vr7, vr15
+ vmaddwod.w.hu.h vr4, vr8, vr16
+ vhaddw.d.w vr5, vr1, vr1
+ vhaddw.d.w vr6, vr2, vr2
+ vhaddw.d.w vr7, vr3, vr3
+ vhaddw.d.w vr8, vr4, vr4
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vhaddw.q.d vr5, vr5, vr5
+ vhaddw.q.d vr6, vr6, vr6
+ vhaddw.q.d vr7, vr7, vr7
+ vhaddw.q.d vr8, vr8, vr8
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.d vr1, vr3, vr1
+ vilvl.d vr5, vr7, vr5
+ vsra.w vr1, vr1, vr0
+ vsra.w vr5, vr5, vr0
+ vmin.w vr1, vr1, vr20
+ vmin.w vr5, vr5, vr20
+
+ vpickev.h vr1, vr5, vr1
+ vst vr1, a1, 0
+ addi.d a1, a1, 16
+ addi.d a5, a5, 32
+ addi.d a4, a4, 128
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_HS15_DST8
+ blt zero, a2, .HS15_REST8
+ b .HS15_END
+.HS15_REST8:
+ li.w t1, 0
+.HS15_DST8:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS15_FILTER8:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS15_FILTER8
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS15_DST8
+ b .HS15_END
+
+.LOOP_HS15_DST4:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ fldx.d f1, a3, t1
+ fldx.d f2, a3, t2
+ fldx.d f3, a3, t3
+ fldx.d f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ fldx.d f5, a3, t1
+ fldx.d f6, a3, t2
+ fldx.d f7, a3, t3
+ fldx.d f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vilvl.d vr1, vr2, vr1
+ vilvl.d vr3, vr4, vr3
+ vilvl.d vr5, vr6, vr5
+ vilvl.d vr7, vr8, vr7
+ vmulwev.w.hu.h vr13, vr1, vr9
+ vmulwev.w.hu.h vr14, vr3, vr10
+ vmulwev.w.hu.h vr15, vr5, vr11
+ vmulwev.w.hu.h vr16, vr7, vr12
+ vmaddwod.w.hu.h vr13, vr1, vr9
+ vmaddwod.w.hu.h vr14, vr3, vr10
+ vmaddwod.w.hu.h vr15, vr5, vr11
+ vmaddwod.w.hu.h vr16, vr7, vr12
+ vhaddw.d.w vr13, vr13, vr13
+ vhaddw.d.w vr14, vr14, vr14
+ vhaddw.d.w vr15, vr15, vr15
+ vhaddw.d.w vr16, vr16, vr16
+ vpickev.w vr13, vr14, vr13
+ vpickev.w vr15, vr16, vr15
+ vsra.w vr13, vr13, vr0
+ vsra.w vr15, vr15, vr0
+ vmin.w vr13, vr13, vr20
+ vmin.w vr15, vr15, vr20
+
+ vpickev.h vr13, vr15, vr13
+ vst vr13, a1, 0
+ addi.d a1, a1, 16
+ addi.d a5, a5, 32
+ addi.d a4, a4, 64
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_HS15_DST4
+ blt zero, a2, .HS15_REST4
+ b .HS15_END
+.HS15_REST4:
+ li.w t1, 0
+.HS15_DST4:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS15_FILTER4:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS15_FILTER4
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS15_DST4
+ b .HS15_END
+.END_HS15_DST4:
+
+ li.w t1, 0
+.LOOP_HS15_DST1:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS15_FILTER1:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS15_FILTER1
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 1
+ stx.h t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .LOOP_HS15_DST1
+ b .HS15_END
+.HS15_END:
+
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ addi.d sp, sp, 72
+endfunc
+
+/* void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
+ * const uint8_t *src, const int16_t *filter,
+ * const int32_t *filterPos, int filterSize, int sh)
+ */
+function ff_hscale_16_to_19_sub_lsx
+ addi.d sp, sp, -72
+ st.d s0, sp, 0
+ st.d s1, sp, 8
+ st.d s2, sp, 16
+ st.d s3, sp, 24
+ st.d s4, sp, 32
+ st.d s5, sp, 40
+ st.d s6, sp, 48
+ st.d s7, sp, 56
+ st.d s8, sp, 64
+
+ li.w t0, 524287
+ li.w t8, 8
+ li.w t7, 4
+ vreplgr2vr.w vr20, t0
+ vreplgr2vr.w vr0, a7
+ beq a6, t7, .LOOP_HS19_DST4
+ beq a6, t8, .LOOP_HS19_DST8
+ blt t8, a6, .LOOP_HS19
+ b .END_HS19_DST4
+
+.LOOP_HS19:
+ li.w t1, 0
+ li.w s1, 0
+ li.w s2, 0
+ li.w s3, 0
+ li.w s4, 0
+ li.w s5, 0
+ vldi vr22, 0
+ addi.w s0, a6, -7
+ slli.w s7, a6, 1
+ slli.w s8, a6, 2
+ add.w t6, s7, s8
+.LOOP_HS19_DST:
+ ld.w t2, a5, 0
+ ld.w t3, a5, 4
+ ld.w t4, a5, 8
+ ld.w t5, a5, 12
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ slli.w t5, t5, 1
+ vldx vr1, a3, t2
+ vldx vr2, a3, t3
+ vldx vr3, a3, t4
+ vldx vr4, a3, t5
+ vld vr9, a4, 0
+ vldx vr10, a4, s7
+ vldx vr11, a4, s8
+ vldx vr12, a4, t6
+ vmulwev.w.hu.h vr17, vr1, vr9
+ vmulwev.w.hu.h vr18, vr2, vr10
+ vmulwev.w.hu.h vr19, vr3, vr11
+ vmulwev.w.hu.h vr21, vr4, vr12
+ vmaddwod.w.hu.h vr17, vr1, vr9
+ vmaddwod.w.hu.h vr18, vr2, vr10
+ vmaddwod.w.hu.h vr19, vr3, vr11
+ vmaddwod.w.hu.h vr21, vr4, vr12
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.d vr1, vr3, vr1
+ vadd.w vr22, vr22, vr1
+ addi.w s1, s1, 8
+ addi.d a3, a3, 16
+ addi.d a4, a4, 16
+ blt s1, s0, .LOOP_HS19_DST
+ blt s1, a6, .HS19_DSTA
+ b .END_HS19_FILTERA
+.HS19_DSTA:
+ ld.w t2, a5, 0
+ li.w t3, 0
+ move s6, s1
+.HS19_FILTERA:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s2, s2, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS19_FILTERA
+
+ ld.w t2, a5, 4
+ li.w t3, 0
+ move s6, s1
+ addi.w t1, t1, 1
+.HS19_FILTERB:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s3, s3, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS19_FILTERB
+ ld.w t2, a5, 8
+ addi.w t1, t1, 1
+ li.w t3, 0
+ move s6, s1
+.HS19_FILTERC:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s4, s4, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS19_FILTERC
+ ld.w t2, a5, 12
+ addi.w t1, t1, 1
+ move s6, s1
+ li.w t3, 0
+.HS19_FILTERD:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t6, t6, 1
+ ldx.h t6, a4, t6
+ mul.w t6, t5, t6
+ add.w s5, s5, t6
+ addi.w t3, t3, 1
+ addi.w s6, s6, 1
+ blt s6, a6, .HS19_FILTERD
+.END_HS19_FILTERA:
+ vpickve2gr.w t1, vr22, 0
+ vpickve2gr.w t2, vr22, 1
+ vpickve2gr.w t3, vr22, 2
+ vpickve2gr.w t4, vr22, 3
+ add.w s2, s2, t1
+ add.w s3, s3, t2
+ add.w s4, s4, t3
+ add.w s5, s5, t4
+ sra.w s2, s2, a7
+ sra.w s3, s3, a7
+ sra.w s4, s4, a7
+ sra.w s5, s5, a7
+ slt t1, s2, t0
+ slt t2, s3, t0
+ slt t3, s4, t0
+ slt t4, s5, t0
+ maskeqz s2, s2, t1
+ maskeqz s3, s3, t2
+ maskeqz s4, s4, t3
+ maskeqz s5, s5, t4
+ masknez t1, t0, t1
+ masknez t2, t0, t2
+ masknez t3, t0, t3
+ masknez t4, t0, t4
+ or s2, s2, t1
+ or s3, s3, t2
+ or s4, s4, t3
+ or s5, s5, t4
+ st.w s2, a1, 0
+ st.w s3, a1, 4
+ st.w s4, a1, 8
+ st.w s5, a1, 12
+
+ addi.d a1, a1, 16
+ sub.d a3, a3, s1
+ sub.d a3, a3, s1
+ addi.d a5, a5, 16
+ slli.d t3, a6, 3
+ add.d a4, a4, t3
+ sub.d a4, a4, s1
+ sub.d a4, a4, s1
+ addi.d a2, a2, -4
+ bge a2, t7, .LOOP_HS19
+ blt zero, a2, .HS19_RESA
+ b .HS19_END
+.HS19_RESA:
+ li.w t1, 0
+.HS19_DST:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS19_FILTER:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS19_FILTER
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS19_DST
+ b .HS19_END
+
+.LOOP_HS19_DST8:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ vldx vr1, a3, t1
+ vldx vr2, a3, t2
+ vldx vr3, a3, t3
+ vldx vr4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ vldx vr5, a3, t1
+ vldx vr6, a3, t2
+ vldx vr7, a3, t3
+ vldx vr8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vld vr13, a4, 64
+ vld vr14, a4, 80
+ vld vr15, a4, 96
+ vld vr16, a4, 112
+ vmulwev.w.hu.h vr17, vr1, vr9
+ vmulwev.w.hu.h vr18, vr2, vr10
+ vmulwev.w.hu.h vr19, vr3, vr11
+ vmulwev.w.hu.h vr21, vr4, vr12
+ vmaddwod.w.hu.h vr17, vr1, vr9
+ vmaddwod.w.hu.h vr18, vr2, vr10
+ vmaddwod.w.hu.h vr19, vr3, vr11
+ vmaddwod.w.hu.h vr21, vr4, vr12
+ vmulwev.w.hu.h vr1, vr5, vr13
+ vmulwev.w.hu.h vr2, vr6, vr14
+ vmulwev.w.hu.h vr3, vr7, vr15
+ vmulwev.w.hu.h vr4, vr8, vr16
+ vmaddwod.w.hu.h vr1, vr5, vr13
+ vmaddwod.w.hu.h vr2, vr6, vr14
+ vmaddwod.w.hu.h vr3, vr7, vr15
+ vmaddwod.w.hu.h vr4, vr8, vr16
+ vhaddw.d.w vr5, vr1, vr1
+ vhaddw.d.w vr6, vr2, vr2
+ vhaddw.d.w vr7, vr3, vr3
+ vhaddw.d.w vr8, vr4, vr4
+ vhaddw.d.w vr1, vr17, vr17
+ vhaddw.d.w vr2, vr18, vr18
+ vhaddw.d.w vr3, vr19, vr19
+ vhaddw.d.w vr4, vr21, vr21
+ vhaddw.q.d vr1, vr1, vr1
+ vhaddw.q.d vr2, vr2, vr2
+ vhaddw.q.d vr3, vr3, vr3
+ vhaddw.q.d vr4, vr4, vr4
+ vhaddw.q.d vr5, vr5, vr5
+ vhaddw.q.d vr6, vr6, vr6
+ vhaddw.q.d vr7, vr7, vr7
+ vhaddw.q.d vr8, vr8, vr8
+ vilvl.w vr1, vr2, vr1
+ vilvl.w vr3, vr4, vr3
+ vilvl.w vr5, vr6, vr5
+ vilvl.w vr7, vr8, vr7
+ vilvl.d vr1, vr3, vr1
+ vilvl.d vr5, vr7, vr5
+ vsra.w vr1, vr1, vr0
+ vsra.w vr5, vr5, vr0
+ vmin.w vr1, vr1, vr20
+ vmin.w vr5, vr5, vr20
+
+ vst vr1, a1, 0
+ vst vr5, a1, 16
+ addi.d a1, a1, 32
+ addi.d a5, a5, 32
+ addi.d a4, a4, 128
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_HS19_DST8
+ blt zero, a2, .HS19_REST8
+ b .HS19_END
+.HS19_REST8:
+ li.w t1, 0
+.HS19_DST8:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS19_FILTER8:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS19_FILTER8
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS19_DST8
+ b .HS19_END
+
+.LOOP_HS19_DST4:
+ ld.w t1, a5, 0
+ ld.w t2, a5, 4
+ ld.w t3, a5, 8
+ ld.w t4, a5, 12
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ fldx.d f1, a3, t1
+ fldx.d f2, a3, t2
+ fldx.d f3, a3, t3
+ fldx.d f4, a3, t4
+ ld.w t1, a5, 16
+ ld.w t2, a5, 20
+ ld.w t3, a5, 24
+ ld.w t4, a5, 28
+ slli.w t1, t1, 1
+ slli.w t2, t2, 1
+ slli.w t3, t3, 1
+ slli.w t4, t4, 1
+ fldx.d f5, a3, t1
+ fldx.d f6, a3, t2
+ fldx.d f7, a3, t3
+ fldx.d f8, a3, t4
+ vld vr9, a4, 0
+ vld vr10, a4, 16
+ vld vr11, a4, 32
+ vld vr12, a4, 48
+ vilvl.d vr1, vr2, vr1
+ vilvl.d vr3, vr4, vr3
+ vilvl.d vr5, vr6, vr5
+ vilvl.d vr7, vr8, vr7
+ vmulwev.w.hu.h vr13, vr1, vr9
+ vmulwev.w.hu.h vr14, vr3, vr10
+ vmulwev.w.hu.h vr15, vr5, vr11
+ vmulwev.w.hu.h vr16, vr7, vr12
+ vmaddwod.w.hu.h vr13, vr1, vr9
+ vmaddwod.w.hu.h vr14, vr3, vr10
+ vmaddwod.w.hu.h vr15, vr5, vr11
+ vmaddwod.w.hu.h vr16, vr7, vr12
+ vhaddw.d.w vr13, vr13, vr13
+ vhaddw.d.w vr14, vr14, vr14
+ vhaddw.d.w vr15, vr15, vr15
+ vhaddw.d.w vr16, vr16, vr16
+ vpickev.w vr13, vr14, vr13
+ vpickev.w vr15, vr16, vr15
+ vsra.w vr13, vr13, vr0
+ vsra.w vr15, vr15, vr0
+ vmin.w vr13, vr13, vr20
+ vmin.w vr15, vr15, vr20
+
+ vst vr13, a1, 0
+ vst vr15, a1, 16
+ addi.d a1, a1, 32
+ addi.d a5, a5, 32
+ addi.d a4, a4, 64
+ addi.d a2, a2, -8
+ bge a2, t8, .LOOP_HS19_DST4
+ blt zero, a2, .HS19_REST4
+ b .HS19_END
+.HS19_REST4:
+ li.w t1, 0
+.HS19_DST4:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS19_FILTER4:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS19_FILTER4
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .HS19_DST4
+ b .HS19_END
+.END_HS19_DST4:
+
+ li.w t1, 0
+.LOOP_HS19_DST1:
+ slli.w t2, t1, 2
+ ldx.w t2, a5, t2
+ li.w t3, 0
+ li.w t8, 0
+.HS19_FILTER1:
+ add.w t4, t2, t3
+ slli.w t4, t4, 1
+ ldx.hu t5, a3, t4
+ mul.w t6, a6, t1
+ add.w t6, t6, t3
+ slli.w t7, t6, 1
+ ldx.h t7, a4, t7
+ mul.w t7, t5, t7
+ add.w t8, t8, t7
+ addi.w t3, t3, 1
+ blt t3, a6, .HS19_FILTER1
+ sra.w t8, t8, a7
+ slt t5, t8, t0
+ maskeqz t8, t8, t5
+ masknez t5, t0, t5
+ or t8, t8, t5
+ slli.w t4, t1, 2
+ stx.w t8, a1, t4
+ addi.w t1, t1, 1
+ blt t1, a2, .LOOP_HS19_DST1
+ b .HS19_END
+.HS19_END:
+
+ ld.d s0, sp, 0
+ ld.d s1, sp, 8
+ ld.d s2, sp, 16
+ ld.d s3, sp, 24
+ ld.d s4, sp, 32
+ ld.d s5, sp, 40
+ ld.d s6, sp, 48
+ ld.d s7, sp, 56
+ ld.d s8, sp, 64
+ addi.d sp, sp, 72
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 97fe947e2e..c13a1662ec 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -27,8 +27,33 @@
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_lsx(cpu_flags)) {
+ ff_sws_init_output_lsx(c);
+ if (c->srcBpc == 8) {
+ if (c->dstBpc <= 14) {
+ c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
+ } else {
+ c->hyScale = c->hcScale = ff_hscale_8_to_19_lsx;
+ }
+ } else {
+ c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
+ : ff_hscale_16_to_15_lsx;
+ }
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_GBRAP:
+ case AV_PIX_FMT_GBRP:
+ {
+ c->readChrPlanar = planar_rgb_to_uv_lsx;
+ c->readLumPlanar = planar_rgb_to_y_lsx;
+ }
+ break;
+ }
+ if (c->dstBpc == 8)
+ c->yuv2planeX = ff_yuv2planeX_8_lsx;
+ }
+#if HAVE_LASX
if (have_lasx(cpu_flags)) {
- ff_sws_init_output_loongarch(c);
+ ff_sws_init_output_lasx(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -51,17 +76,21 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
if (c->dstBpc == 8)
c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
+#endif // #if HAVE_LASX
}
av_cold void rgb2rgb_init_loongarch(void)
{
+#if HAVE_LASX
int cpu_flags = av_get_cpu_flags();
if (have_lasx(cpu_flags))
interleaveBytes = ff_interleave_bytes_lasx;
+#endif // #if HAVE_LASX
}
av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
{
+#if HAVE_LASX
int cpu_flags = av_get_cpu_flags();
if (have_lasx(cpu_flags)) {
switch (c->dstFormat) {
@@ -91,5 +120,6 @@ av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
return yuv420_abgr32_lasx;
}
}
+#endif // #if HAVE_LASX
return NULL;
}
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index c52eb1016b..bc29913ac6 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -24,7 +24,45 @@
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
+#include "config.h"
+void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize, int sh);
+
+void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize, int sh);
+
+void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+ int width, int32_t *rgb2yuv, void *opq);
+
+void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
+ int32_t *rgb2yuv, void *opq);
+
+void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c);
+
+#if HAVE_LASX
void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
@@ -69,10 +107,11 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
-av_cold void ff_sws_init_output_loongarch(SwsContext *c);
-
void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
+av_cold void ff_sws_init_output_lasx(SwsContext *c);
+#endif // #if HAVE_LASX
+
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/loongarch/swscale_lsx.c b/libswscale/loongarch/swscale_lsx.c
new file mode 100644
index 0000000000..da8eabfca3
--- /dev/null
+++ b/libswscale/loongarch/swscale_lsx.c
@@ -0,0 +1,57 @@
+/*
+ * Loongson LSX optimized swscale
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+
+void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+ int sh = desc->comp[0].depth - 1;
+
+ if (sh < 15) {
+ sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
+ (desc->comp[0].depth - 1);
+ } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
+ sh = 15;
+ }
+ ff_hscale_16_to_15_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
+}
+
+void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+ int bits = desc->comp[0].depth - 1;
+ int sh = bits - 4;
+
+ if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) {
+ sh = 9;
+ } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
+ sh = 16 - 1 - 4;
+ }
+ ff_hscale_16_to_19_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
+}
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 925c536bf1..b02e6cdc64 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -653,7 +653,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
filterAlign = 1;
}
- if (have_lasx(cpu_flags)) {
+ if (have_lasx(cpu_flags) || have_lsx(cpu_flags)) {
int reNum = minFilterSize & (0x07);
if (minFilterSize < 5)
@@ -1806,6 +1806,7 @@ static av_cold int sws_init_single_context(SwsContext *c, SwsFilter *srcFilter,
const int filterAlign = X86_MMX(cpu_flags) ? 4 :
PPC_ALTIVEC(cpu_flags) ? 8 :
have_neon(cpu_flags) ? 4 :
+ have_lsx(cpu_flags) ? 8 :
have_lasx(cpu_flags) ? 8 : 1;
if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
--
2.20.1
* [FFmpeg-devel] [PATCH v5 6/7] swscale/la: Add following builtin optimized functions
2023-05-25 7:24 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
` (4 preceding siblings ...)
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 5/7] swscale/la: Optimize the functions of the swscale series with lsx Hao Chen
@ 2023-05-25 7:24 ` Hao Chen
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 7/7] avutil/la: Add function performance testing Hao Chen
6 siblings, 0 replies; 17+ messages in thread
From: Hao Chen @ 2023-05-25 7:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Jin Bo
From: Jin Bo <jinbo@loongson.cn>
yuv420_rgb24_lsx
yuv420_bgr24_lsx
yuv420_rgba32_lsx
yuv420_argb32_lsx
yuv420_bgra32_lsx
yuv420_abgr32_lsx
./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo
-pix_fmt rgb24 -y /dev/null -an
before: 184fps
after: 207fps
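
The converters above are the unscaled YUV420P-to-packed-RGB fast paths
selected through ff_yuv2rgb_init_loongarch(). As a rough sketch of how one
of them ends up being exercised via the public API (buffer setup and sizes
here are illustrative only, not part of this patch):

    #include <libswscale/swscale.h>
    #include <libavutil/error.h>

    /* Convert one 1920x1080 YUV420P frame to RGB24. Source and destination
     * sizes are equal, so an unscaled yuv420_rgb24 converter can be chosen
     * when the context is created. */
    static int convert_frame(const uint8_t *const src_data[4],
                             const int src_linesize[4],
                             uint8_t *const dst_data[4],
                             const int dst_linesize[4])
    {
        struct SwsContext *sws = sws_getContext(1920, 1080, AV_PIX_FMT_YUV420P,
                                                1920, 1080, AV_PIX_FMT_RGB24,
                                                SWS_BILINEAR, NULL, NULL, NULL);
        int ret;

        if (!sws)
            return AVERROR(EINVAL);
        ret = sws_scale(sws, src_data, src_linesize, 0, 1080,
                        dst_data, dst_linesize);
        sws_freeContext(sws);
        return ret;
    }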
---
libswscale/loongarch/Makefile | 3 +-
libswscale/loongarch/swscale_init_loongarch.c | 30 +-
libswscale/loongarch/swscale_loongarch.h | 18 +
libswscale/loongarch/yuv2rgb_lsx.c | 361 ++++++++++++++++++
4 files changed, 410 insertions(+), 2 deletions(-)
create mode 100644 libswscale/loongarch/yuv2rgb_lsx.c
diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index c0b6a449c0..c35ba309a4 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -8,4 +8,5 @@ LSX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale.o \
loongarch/swscale_lsx.o \
loongarch/input.o \
loongarch/output.o \
- loongarch/output_lsx.o
+ loongarch/output_lsx.o \
+ loongarch/yuv2rgb_lsx.o
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index c13a1662ec..53e4f970b6 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -90,8 +90,8 @@ av_cold void rgb2rgb_init_loongarch(void)
av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
{
-#if HAVE_LASX
int cpu_flags = av_get_cpu_flags();
+#if HAVE_LASX
if (have_lasx(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
@@ -121,5 +121,33 @@ av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
}
}
#endif // #if HAVE_LASX
+ if (have_lsx(cpu_flags)) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB24:
+ return yuv420_rgb24_lsx;
+ case AV_PIX_FMT_BGR24:
+ return yuv420_bgr24_lsx;
+ case AV_PIX_FMT_RGBA:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+ break;
+ } else
+ return yuv420_rgba32_lsx;
+ case AV_PIX_FMT_ARGB:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+ break;
+ } else
+ return yuv420_argb32_lsx;
+ case AV_PIX_FMT_BGRA:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+ break;
+ } else
+ return yuv420_bgra32_lsx;
+ case AV_PIX_FMT_ABGR:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+ break;
+ } else
+ return yuv420_abgr32_lsx;
+ }
+ }
return NULL;
}
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index bc29913ac6..0514abae21 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -62,6 +62,24 @@ void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
av_cold void ff_sws_init_output_lsx(SwsContext *c);
+int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgr24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_rgba32_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgra32_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_argb32_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_abgr32_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
#if HAVE_LASX
void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
diff --git a/libswscale/loongarch/yuv2rgb_lsx.c b/libswscale/loongarch/yuv2rgb_lsx.c
new file mode 100644
index 0000000000..11cd2f79d9
--- /dev/null
+++ b/libswscale/loongarch/yuv2rgb_lsx.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) 2023 Loongson Technology Co. Ltd.
+ * Contributed by Bo Jin(jinbo@loongson.cn)
+ * All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+#define YUV2RGB_LOAD_COE \
+ /* Load x_offset */ \
+ __m128i y_offset = __lsx_vreplgr2vr_d(c->yOffset); \
+ __m128i u_offset = __lsx_vreplgr2vr_d(c->uOffset); \
+ __m128i v_offset = __lsx_vreplgr2vr_d(c->vOffset); \
+ /* Load x_coeff */ \
+ __m128i ug_coeff = __lsx_vreplgr2vr_d(c->ugCoeff); \
+ __m128i vg_coeff = __lsx_vreplgr2vr_d(c->vgCoeff); \
+ __m128i y_coeff = __lsx_vreplgr2vr_d(c->yCoeff); \
+ __m128i ub_coeff = __lsx_vreplgr2vr_d(c->ubCoeff); \
+ __m128i vr_coeff = __lsx_vreplgr2vr_d(c->vrCoeff); \
+
+#define LOAD_YUV_16 \
+ m_y1 = __lsx_vld(py_1, 0); \
+ m_y2 = __lsx_vld(py_2, 0); \
+ m_u = __lsx_vldrepl_d(pu, 0); \
+ m_v = __lsx_vldrepl_d(pv, 0); \
+ DUP2_ARG2(__lsx_vilvl_b, m_u, m_u, m_v, m_v, m_u, m_v); \
+ DUP2_ARG2(__lsx_vilvh_b, zero, m_u, zero, m_v, m_u_h, m_v_h); \
+ DUP2_ARG2(__lsx_vilvl_b, zero, m_u, zero, m_v, m_u, m_v); \
+ DUP2_ARG2(__lsx_vilvh_b, zero, m_y1, zero, m_y2, m_y1_h, m_y2_h); \
+ DUP2_ARG2(__lsx_vilvl_b, zero, m_y1, zero, m_y2, m_y1, m_y2); \
+
+/* YUV2RGB method
+ * The conversion method is as follows:
+ * R = Y' * y_coeff + V' * vr_coeff
+ * G = Y' * y_coeff + V' * vg_coeff + U' * ug_coeff
+ * B = Y' * y_coeff + U' * ub_coeff
+ *
+ * where X' = X * 8 - x_offset
+ *
+ */
+
+#define YUV2RGB(y1, y2, u, v, r1, g1, b1, r2, g2, b2) \
+{ \
+ y1 = __lsx_vslli_h(y1, 3); \
+ y2 = __lsx_vslli_h(y2, 3); \
+ u = __lsx_vslli_h(u, 3); \
+ v = __lsx_vslli_h(v, 3); \
+ y1 = __lsx_vsub_h(y1, y_offset); \
+ y2 = __lsx_vsub_h(y2, y_offset); \
+ u = __lsx_vsub_h(u, u_offset); \
+ v = __lsx_vsub_h(v, v_offset); \
+ y_1 = __lsx_vmuh_h(y1, y_coeff); \
+ y_2 = __lsx_vmuh_h(y2, y_coeff); \
+ u2g = __lsx_vmuh_h(u, ug_coeff); \
+ u2b = __lsx_vmuh_h(u, ub_coeff); \
+ v2r = __lsx_vmuh_h(v, vr_coeff); \
+ v2g = __lsx_vmuh_h(v, vg_coeff); \
+ r1 = __lsx_vsadd_h(y_1, v2r); \
+ v2g = __lsx_vsadd_h(v2g, u2g); \
+ g1 = __lsx_vsadd_h(y_1, v2g); \
+ b1 = __lsx_vsadd_h(y_1, u2b); \
+ r2 = __lsx_vsadd_h(y_2, v2r); \
+ g2 = __lsx_vsadd_h(y_2, v2g); \
+ b2 = __lsx_vsadd_h(y_2, u2b); \
+ DUP4_ARG1(__lsx_vclip255_h, r1, g1, b1, r2, r1, g1, b1, r2); \
+ DUP2_ARG1(__lsx_vclip255_h, g2, b2, g2, b2); \
+}
+
+#define RGB_PACK(r, g, b, rgb_l, rgb_h) \
+{ \
+ __m128i rg; \
+ rg = __lsx_vpackev_b(g, r); \
+ DUP2_ARG3(__lsx_vshuf_b, b, rg, shuf2, b, rg, shuf3, rgb_l, rgb_h); \
+}
+
+#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h) \
+{ \
+ __m128i ra, bg; \
+ ra = __lsx_vpackev_b(r, a); \
+ bg = __lsx_vpackev_b(b, g); \
+ rgb_l = __lsx_vilvl_h(bg, ra); \
+ rgb_h = __lsx_vilvh_h(bg, ra); \
+}
+
+#define RGB_STORE(rgb_l, rgb_h, image) \
+{ \
+ __lsx_vstelm_d(rgb_l, image, 0, 0); \
+ __lsx_vstelm_d(rgb_l, image, 8, 1); \
+ __lsx_vstelm_d(rgb_h, image, 16, 0); \
+}
+
+#define RGB32_STORE(rgb_l, rgb_h, image) \
+{ \
+ __lsx_vst(rgb_l, image, 0); \
+ __lsx_vst(rgb_h, image, 16); \
+}
+
+#define YUV2RGBFUNC(func_name, dst_type, alpha) \
+ int func_name(SwsContext *c, const uint8_t *src[], \
+ int srcStride[], int srcSliceY, int srcSliceH, \
+ uint8_t *dst[], int dstStride[]) \
+{ \
+ int x, y, h_size, vshift, res; \
+ __m128i m_y1, m_y2, m_u, m_v; \
+ __m128i m_y1_h, m_y2_h, m_u_h, m_v_h; \
+ __m128i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h; \
+ __m128i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2; \
+ __m128i shuf2 = {0x0504120302100100, 0x0A18090816070614}; \
+ __m128i shuf3 = {0x1E0F0E1C0D0C1A0B, 0x0101010101010101}; \
+ __m128i zero = __lsx_vldi(0); \
+ \
+ YUV2RGB_LOAD_COE \
+ \
+ h_size = c->dstW >> 4; \
+ res = (c->dstW & 15) >> 1; \
+ vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
+ for (y = 0; y < srcSliceH; y += 2) { \
+ dst_type av_unused *r, *g, *b; \
+ dst_type *image1 = (dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]);\
+ dst_type *image2 = (dst_type *)(image1 + dstStride[0]);\
+ const uint8_t *py_1 = src[0] + y * srcStride[0]; \
+ const uint8_t *py_2 = py_1 + srcStride[0]; \
+ const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
+ for(x = 0; x < h_size; x++) { \
+
+#define YUV2RGBFUNC32(func_name, dst_type, alpha) \
+ int func_name(SwsContext *c, const uint8_t *src[], \
+ int srcStride[], int srcSliceY, int srcSliceH, \
+ uint8_t *dst[], int dstStride[]) \
+{ \
+ int x, y, h_size, vshift, res; \
+ __m128i m_y1, m_y2, m_u, m_v; \
+ __m128i m_y1_h, m_y2_h, m_u_h, m_v_h; \
+ __m128i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h; \
+ __m128i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2; \
+ __m128i a = __lsx_vldi(0xFF); \
+ __m128i zero = __lsx_vldi(0); \
+ \
+ YUV2RGB_LOAD_COE \
+ \
+ h_size = c->dstW >> 4; \
+ res = (c->dstW & 15) >> 1; \
+ vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
+ for (y = 0; y < srcSliceH; y += 2) { \
+ int yd = y + srcSliceY; \
+ dst_type av_unused *r, *g, *b; \
+ dst_type *image1 = (dst_type *)(dst[0] + (yd) * dstStride[0]); \
+ dst_type *image2 = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]); \
+ const uint8_t *py_1 = src[0] + y * srcStride[0]; \
+ const uint8_t *py_2 = py_1 + srcStride[0]; \
+ const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
+ for(x = 0; x < h_size; x++) { \
+
+#define DEALYUV2RGBREMAIN \
+ py_1 += 16; \
+ py_2 += 16; \
+ pu += 8; \
+ pv += 8; \
+ image1 += 48; \
+ image2 += 48; \
+ } \
+ for (x = 0; x < res; x++) { \
+ int av_unused U, V, Y; \
+ U = pu[0]; \
+ V = pv[0]; \
+ r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \
+ g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] \
+ + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \
+ b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
+
+#define DEALYUV2RGBREMAIN32 \
+ py_1 += 16; \
+ py_2 += 16; \
+ pu += 8; \
+ pv += 8; \
+ image1 += 16; \
+ image2 += 16; \
+ } \
+ for (x = 0; x < res; x++) { \
+ int av_unused U, V, Y; \
+ U = pu[0]; \
+ V = pv[0]; \
+ r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \
+ g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] \
+ + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \
+ b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM]; \
+
+#define PUTRGB24(dst, src) \
+ Y = src[0]; \
+ dst[0] = r[Y]; \
+ dst[1] = g[Y]; \
+ dst[2] = b[Y]; \
+ Y = src[1]; \
+ dst[3] = r[Y]; \
+ dst[4] = g[Y]; \
+ dst[5] = b[Y];
+
+#define PUTBGR24(dst, src) \
+ Y = src[0]; \
+ dst[0] = b[Y]; \
+ dst[1] = g[Y]; \
+ dst[2] = r[Y]; \
+ Y = src[1]; \
+ dst[3] = b[Y]; \
+ dst[4] = g[Y]; \
+ dst[5] = r[Y];
+
+#define PUTRGB(dst, src) \
+ Y = src[0]; \
+ dst[0] = r[Y] + g[Y] + b[Y]; \
+ Y = src[1]; \
+ dst[1] = r[Y] + g[Y] + b[Y]; \
+
+#define ENDRES \
+ pu += 1; \
+ pv += 1; \
+ py_1 += 2; \
+ py_2 += 2; \
+ image1 += 6; \
+ image2 += 6; \
+
+#define ENDRES32 \
+ pu += 1; \
+ pv += 1; \
+ py_1 += 2; \
+ py_2 += 2; \
+ image1 += 2; \
+ image2 += 2; \
+
+#define END_FUNC() \
+ } \
+ } \
+ return srcSliceH; \
+}
+
+YUV2RGBFUNC(yuv420_rgb24_lsx, uint8_t, 0)
+ LOAD_YUV_16
+ YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+ RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+ RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
+ RGB_STORE(rgb1_l, rgb1_h, image1);
+ RGB_STORE(rgb2_l, rgb2_h, image2);
+ YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+ RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+ RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
+ RGB_STORE(rgb1_l, rgb1_h, image1 + 24);
+ RGB_STORE(rgb2_l, rgb2_h, image2 + 24);
+ DEALYUV2RGBREMAIN
+ PUTRGB24(image1, py_1);
+ PUTRGB24(image2, py_2);
+ ENDRES
+ END_FUNC()
+
+YUV2RGBFUNC(yuv420_bgr24_lsx, uint8_t, 0)
+ LOAD_YUV_16
+ YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+ RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+ RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
+ RGB_STORE(rgb1_l, rgb1_h, image1);
+ RGB_STORE(rgb2_l, rgb2_h, image2);
+ YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+ RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+ RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
+ RGB_STORE(rgb1_l, rgb1_h, image1 + 24);
+ RGB_STORE(rgb2_l, rgb2_h, image2 + 24);
+ DEALYUV2RGBREMAIN
+ PUTBGR24(image1, py_1);
+ PUTBGR24(image2, py_2);
+ ENDRES
+ END_FUNC()
+
+YUV2RGBFUNC32(yuv420_rgba32_lsx, uint32_t, 0)
+ LOAD_YUV_16
+ YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+ RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+ RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1);
+ RGB32_STORE(rgb2_l, rgb2_h, image2);
+ YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+ RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+ RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1 + 8);
+ RGB32_STORE(rgb2_l, rgb2_h, image2 + 8);
+ DEALYUV2RGBREMAIN32
+ PUTRGB(image1, py_1);
+ PUTRGB(image2, py_2);
+ ENDRES32
+ END_FUNC()
+
+YUV2RGBFUNC32(yuv420_bgra32_lsx, uint32_t, 0)
+ LOAD_YUV_16
+ YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+ RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+ RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1);
+ RGB32_STORE(rgb2_l, rgb2_h, image2);
+ YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+ RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+ RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1 + 8);
+ RGB32_STORE(rgb2_l, rgb2_h, image2 + 8);
+ DEALYUV2RGBREMAIN32
+ PUTRGB(image1, py_1);
+ PUTRGB(image2, py_2);
+ ENDRES32
+ END_FUNC()
+
+YUV2RGBFUNC32(yuv420_argb32_lsx, uint32_t, 0)
+ LOAD_YUV_16
+ YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+ RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+ RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1);
+ RGB32_STORE(rgb2_l, rgb2_h, image2);
+ YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+ RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+ RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1 + 8);
+ RGB32_STORE(rgb2_l, rgb2_h, image2 + 8);
+ DEALYUV2RGBREMAIN32
+ PUTRGB(image1, py_1);
+ PUTRGB(image2, py_2);
+ ENDRES32
+ END_FUNC()
+
+YUV2RGBFUNC32(yuv420_abgr32_lsx, uint32_t, 0)
+ LOAD_YUV_16
+ YUV2RGB(m_y1, m_y2, m_u, m_v, r1, g1, b1, r2, g2, b2);
+ RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+ RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1);
+ RGB32_STORE(rgb2_l, rgb2_h, image2);
+ YUV2RGB(m_y1_h, m_y2_h, m_u_h, m_v_h, r1, g1, b1, r2, g2, b2);
+ RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+ RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1 + 8);
+ RGB32_STORE(rgb2_l, rgb2_h, image2 + 8);
+ DEALYUV2RGBREMAIN32
+ PUTRGB(image1, py_1);
+ PUTRGB(image2, py_2);
+ ENDRES32
+ END_FUNC()
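For readers less familiar with the LSX intrinsics, the arithmetic in the YUV2RGB macro follows the formula given in the comment near the top of the file: each sample is scaled by 8, the per-plane offset is subtracted, and __lsx_vmuh_h keeps the high 16 bits of the 16x16-bit product, i.e. a multiply followed by an arithmetic shift right by 16. A scalar sketch of the same per-pixel computation (the offset/coefficient parameters mirror the SwsContext fields loaded in YUV2RGB_LOAD_COE, treated here as plain 16-bit values; the helpers are local to the example):

/* Scalar sketch of the per-pixel math in the YUV2RGB macro above. */
#include <stdint.h>

static uint8_t clip255(int v)            /* analogue of __lsx_vclip255_h */
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static int mulh16(int16_t a, int16_t b)  /* analogue of __lsx_vmuh_h */
{
    return ((int32_t)a * b) >> 16;
}

static void yuv2rgb_pixel(uint8_t y, uint8_t u, uint8_t v,
                          int16_t y_off, int16_t u_off, int16_t v_off,
                          int16_t y_coeff, int16_t vr_coeff, int16_t vg_coeff,
                          int16_t ug_coeff, int16_t ub_coeff,
                          uint8_t *r, uint8_t *g, uint8_t *b)
{
    int16_t yp = y * 8 - y_off;          /* X' = X * 8 - x_offset */
    int16_t up = u * 8 - u_off;
    int16_t vp = v * 8 - v_off;
    int     yl = mulh16(yp, y_coeff);

    *r = clip255(yl + mulh16(vp, vr_coeff));
    *g = clip255(yl + mulh16(vp, vg_coeff) + mulh16(up, ug_coeff));
    *b = clip255(yl + mulh16(up, ub_coeff));
}

The vector version performs the additions with saturating 16-bit adds (__lsx_vsadd_h) before clipping; the wider intermediates in this sketch differ from that only where a 16-bit sum would overflow.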
--
2.20.1
* [FFmpeg-devel] [PATCH v5 7/7] avutil/la: Add function performance testing
2023-05-25 7:24 [FFmpeg-devel] Add LSX optimization in avcodec and swscale Hao Chen
` (5 preceding siblings ...)
2023-05-25 7:24 ` [FFmpeg-devel] [PATCH v5 6/7] swscale/la: Add following builtin optimized functions Hao Chen
@ 2023-05-25 7:24 ` Hao Chen
6 siblings, 0 replies; 17+ messages in thread
From: Hao Chen @ 2023-05-25 7:24 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: yuanhecai
From: yuanhecai <yuanhecai@loongson.cn>
This patch supports the use of the "checkasm --bench" testing feature
on the LoongArch platform.
Change-Id: I42790388d057c9ade0dfa38a19d9c1fd44ca0bc3
---
libavutil/loongarch/timer.h | 48 +++++++++++++++++++++++++++++++++++++
libavutil/timer.h | 2 ++
2 files changed, 50 insertions(+)
create mode 100644 libavutil/loongarch/timer.h
diff --git a/libavutil/loongarch/timer.h b/libavutil/loongarch/timer.h
new file mode 100644
index 0000000000..2f998aaba0
--- /dev/null
+++ b/libavutil/loongarch/timer.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_LOONGARCH_TIMER_H
+#define AVUTIL_LOONGARCH_TIMER_H
+
+#include <stdint.h>
+#include "config.h"
+
+#if HAVE_INLINE_ASM
+
+#define AV_READ_TIME read_time
+
+static inline uint64_t read_time(void)
+{
+
+#if ARCH_LOONGARCH64
+ uint64_t a, id;
+ __asm__ volatile ( "rdtime.d %0, %1" : "=r"(a), "=r"(id) :: );
+ return a;
+#else
+ uint32_t a, id;
+ __asm__ volatile ( "rdtimel.w %0, %1" : "=r"(a), "=r"(id) :: );
+ return (uint64_t)a;
+#endif
+}
+
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVUTIL_LOONGARCH_TIMER_H */
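With this header in place, AV_READ_TIME resolves to the LoongArch stable counter, which is what lets "checkasm --bench" (and libavutil's timing macros) report per-function timings on this platform. A standalone sketch of timing a loop with the same rdtime.d instruction, assuming a LoongArch64 target with GCC/Clang inline asm:

/* Standalone sketch; assumes a LoongArch64 target and GCC/Clang inline asm.
 * Mirrors the rdtime.d-based read_time() added in the header above. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static inline uint64_t read_time(void)
{
    uint64_t a, id;
    __asm__ volatile ("rdtime.d %0, %1" : "=r"(a), "=r"(id) ::);
    return a;
}

int main(void)
{
    volatile unsigned sink = 0;
    uint64_t t0 = read_time();
    for (int i = 0; i < 1000000; i++)
        sink += i;                       /* dummy workload to measure */
    uint64_t t1 = read_time();
    printf("elapsed: %" PRIu64 " stable-counter ticks\n", t1 - t0);
    return 0;
}

Note that rdtime.d reads the constant-frequency stable counter rather than a core cycle counter, so the reported numbers are ticks of that counter.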
diff --git a/libavutil/timer.h b/libavutil/timer.h
index d3db5a27ef..861ba7e9d7 100644
--- a/libavutil/timer.h
+++ b/libavutil/timer.h
@@ -61,6 +61,8 @@
# include "riscv/timer.h"
#elif ARCH_X86
# include "x86/timer.h"
+#elif ARCH_LOONGARCH
+# include "loongarch/timer.h"
#endif
#if !defined(AV_READ_TIME)
--
2.20.1