* [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib.
@ 2022-09-09 9:00 Hao Chen
From: Hao Chen @ 2022-09-09 9:00 UTC
To: ffmpeg-devel
v2: Address the reviewers' comments.
v3: Update the patches and re-run the CI tests.
v4: Fix the build warning.
v5: Re-trigger the Patchwork test.
[PATCH v5 1/3] swscale/la: Optimize hscale functions with lasx.
[PATCH v5 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c
[PATCH v5 3/3] swscale/la: Add output_lasx.c file.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH v5 1/3] swscale/la: Optimize hscale functions with lasx.
From: Hao Chen @ 2022-09-09 9:00 UTC
To: ffmpeg-devel
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -y /dev/null -an
before: 101fps
after: 138fps
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
libswscale/loongarch/Makefile | 3 +
libswscale/loongarch/input_lasx.c | 202 ++++
libswscale/loongarch/swscale_init_loongarch.c | 50 +
libswscale/loongarch/swscale_lasx.c | 972 ++++++++++++++++++
libswscale/loongarch/swscale_loongarch.h | 50 +
libswscale/swscale.c | 2 +
libswscale/swscale_internal.h | 2 +
libswscale/utils.c | 13 +-
8 files changed, 1293 insertions(+), 1 deletion(-)
create mode 100644 libswscale/loongarch/Makefile
create mode 100644 libswscale/loongarch/input_lasx.c
create mode 100644 libswscale/loongarch/swscale_init_loongarch.c
create mode 100644 libswscale/loongarch/swscale_lasx.c
create mode 100644 libswscale/loongarch/swscale_loongarch.h
diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
new file mode 100644
index 0000000000..586a1717b6
--- /dev/null
+++ b/libswscale/loongarch/Makefile
@@ -0,0 +1,3 @@
+OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_init_loongarch.o
+LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
+ loongarch/input_lasx.o \
diff --git a/libswscale/loongarch/input_lasx.c b/libswscale/loongarch/input_lasx.c
new file mode 100644
index 0000000000..4830072eaf
--- /dev/null
+++ b/libswscale/loongarch/input_lasx.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen (chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+ int width, int32_t *rgb2yuv, void *opq)
+{
+ int i;
+ uint16_t *dstU = (uint16_t *)_dstU;
+ uint16_t *dstV = (uint16_t *)_dstV;
+ int set = 0x4001 << (RGB2YUV_SHIFT - 7);
+ int len = width - 15;
+ int32_t tem_ru = rgb2yuv[RU_IDX], tem_gu = rgb2yuv[GU_IDX];
+ int32_t tem_bu = rgb2yuv[BU_IDX], tem_rv = rgb2yuv[RV_IDX];
+ int32_t tem_gv = rgb2yuv[GV_IDX], tem_bv = rgb2yuv[BV_IDX];
+ int shift = RGB2YUV_SHIFT - 6;
+ const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
+ __m256i ru, gu, bu, rv, gv, bv;
+ __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
+ 0x0D0C090805040100, 0x1D1C191815141110};
+ __m256i temp = __lasx_xvreplgr2vr_w(set);
+ __m256i sra = __lasx_xvreplgr2vr_w(shift);
+
+ ru = __lasx_xvreplgr2vr_w(tem_ru);
+ gu = __lasx_xvreplgr2vr_w(tem_gu);
+ bu = __lasx_xvreplgr2vr_w(tem_bu);
+ rv = __lasx_xvreplgr2vr_w(tem_rv);
+ gv = __lasx_xvreplgr2vr_w(tem_gv);
+ bv = __lasx_xvreplgr2vr_w(tem_bv);
+ for (i = 0; i < len; i += 16) {
+ __m256i _g, _b, _r;
+ __m256i g_l, g_h, b_l, b_h, r_l, r_h;
+ __m256i v_l, v_h, u_l, u_h, u_lh, v_lh;
+
+ _g = __lasx_xvldx(src0, i);
+ _b = __lasx_xvldx(src1, i);
+ _r = __lasx_xvldx(src2, i);
+ g_l = __lasx_vext2xv_wu_bu(_g);
+ b_l = __lasx_vext2xv_wu_bu(_b);
+ r_l = __lasx_vext2xv_wu_bu(_r);
+ _g = __lasx_xvpermi_d(_g, 0x01);
+ _b = __lasx_xvpermi_d(_b, 0x01);
+ _r = __lasx_xvpermi_d(_r, 0x01);
+ g_h = __lasx_vext2xv_wu_bu(_g);
+ b_h = __lasx_vext2xv_wu_bu(_b);
+ r_h = __lasx_vext2xv_wu_bu(_r);
+ u_l = __lasx_xvmadd_w(temp, ru, r_l);
+ u_h = __lasx_xvmadd_w(temp, ru, r_h);
+ v_l = __lasx_xvmadd_w(temp, rv, r_l);
+ v_h = __lasx_xvmadd_w(temp, rv, r_h);
+ u_l = __lasx_xvmadd_w(u_l, gu, g_l);
+ u_l = __lasx_xvmadd_w(u_l, bu, b_l);
+ u_h = __lasx_xvmadd_w(u_h, gu, g_h);
+ u_h = __lasx_xvmadd_w(u_h, bu, b_h);
+ v_l = __lasx_xvmadd_w(v_l, gv, g_l);
+ v_l = __lasx_xvmadd_w(v_l, bv, b_l);
+ v_h = __lasx_xvmadd_w(v_h, gv, g_h);
+ v_h = __lasx_xvmadd_w(v_h, bv, b_h);
+ u_l = __lasx_xvsra_w(u_l, sra);
+ u_h = __lasx_xvsra_w(u_h, sra);
+ v_l = __lasx_xvsra_w(v_l, sra);
+ v_h = __lasx_xvsra_w(v_h, sra);
+ u_lh = __lasx_xvshuf_b(u_h, u_l, mask);
+ v_lh = __lasx_xvshuf_b(v_h, v_l, mask);
+ u_lh = __lasx_xvpermi_d(u_lh, 0xD8);
+ v_lh = __lasx_xvpermi_d(v_lh, 0xD8);
+ __lasx_xvst(u_lh, (dstU + i), 0);
+ __lasx_xvst(v_lh, (dstV + i), 0);
+ }
+ if (width - i >= 8) {
+ __m256i _g, _b, _r;
+ __m256i g_l, b_l, r_l;
+ __m256i v_l, u_l, u, v;
+
+ _g = __lasx_xvldrepl_d((src0 + i), 0);
+ _b = __lasx_xvldrepl_d((src1 + i), 0);
+ _r = __lasx_xvldrepl_d((src2 + i), 0);
+ g_l = __lasx_vext2xv_wu_bu(_g);
+ b_l = __lasx_vext2xv_wu_bu(_b);
+ r_l = __lasx_vext2xv_wu_bu(_r);
+ u_l = __lasx_xvmadd_w(temp, ru, r_l);
+ v_l = __lasx_xvmadd_w(temp, rv, r_l);
+ u_l = __lasx_xvmadd_w(u_l, gu, g_l);
+ u_l = __lasx_xvmadd_w(u_l, bu, b_l);
+ v_l = __lasx_xvmadd_w(v_l, gv, g_l);
+ v_l = __lasx_xvmadd_w(v_l, bv, b_l);
+ u_l = __lasx_xvsra_w(u_l, sra);
+ v_l = __lasx_xvsra_w(v_l, sra);
+ u = __lasx_xvshuf_b(u_l, u_l, mask);
+ v = __lasx_xvshuf_b(v_l, v_l, mask);
+ __lasx_xvstelm_d(u, (dstU + i), 0, 0);
+ __lasx_xvstelm_d(u, (dstU + i), 8, 2);
+ __lasx_xvstelm_d(v, (dstV + i), 0, 0);
+ __lasx_xvstelm_d(v, (dstV + i), 8, 2);
+ i += 8;
+ }
+ for (; i < width; i++) {
+ int g = src[0][i];
+ int b = src[1][i];
+ int r = src[2][i];
+
+ dstU[i] = (tem_ru * r + tem_gu * g + tem_bu * b + set) >> shift;
+ dstV[i] = (tem_rv * r + tem_gv * g + tem_bv * b + set) >> shift;
+ }
+}
+
+void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
+ int32_t *rgb2yuv, void *opq)
+{
+ int i;
+ int shift = (RGB2YUV_SHIFT - 6);
+ int set = 0x801 << (RGB2YUV_SHIFT - 7);
+ int len = width - 15;
+ uint16_t *dst = (uint16_t *)_dst;
+ int32_t tem_ry = rgb2yuv[RY_IDX], tem_gy = rgb2yuv[GY_IDX];
+ int32_t tem_by = rgb2yuv[BY_IDX];
+ const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
+ __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
+ 0x0D0C090805040100, 0x1D1C191815141110};
+ __m256i temp = __lasx_xvreplgr2vr_w(set);
+ __m256i sra = __lasx_xvreplgr2vr_w(shift);
+ __m256i ry = __lasx_xvreplgr2vr_w(tem_ry);
+ __m256i gy = __lasx_xvreplgr2vr_w(tem_gy);
+ __m256i by = __lasx_xvreplgr2vr_w(tem_by);
+
+ for (i = 0; i < len; i += 16) {
+ __m256i _g, _b, _r;
+ __m256i g_l, g_h, b_l, b_h, r_l, r_h;
+ __m256i y_l, y_h, y_lh;
+
+ _g = __lasx_xvldx(src0, i);
+ _b = __lasx_xvldx(src1, i);
+ _r = __lasx_xvldx(src2, i);
+ g_l = __lasx_vext2xv_wu_bu(_g);
+ b_l = __lasx_vext2xv_wu_bu(_b);
+ r_l = __lasx_vext2xv_wu_bu(_r);
+ _g = __lasx_xvpermi_d(_g, 0x01);
+ _b = __lasx_xvpermi_d(_b, 0x01);
+ _r = __lasx_xvpermi_d(_r, 0x01);
+ g_h = __lasx_vext2xv_wu_bu(_g);
+ b_h = __lasx_vext2xv_wu_bu(_b);
+ r_h = __lasx_vext2xv_wu_bu(_r);
+ y_l = __lasx_xvmadd_w(temp, ry, r_l);
+ y_h = __lasx_xvmadd_w(temp, ry, r_h);
+ y_l = __lasx_xvmadd_w(y_l, gy, g_l);
+ y_l = __lasx_xvmadd_w(y_l, by, b_l);
+ y_h = __lasx_xvmadd_w(y_h, gy, g_h);
+ y_h = __lasx_xvmadd_w(y_h, by, b_h);
+ y_l = __lasx_xvsra_w(y_l, sra);
+ y_h = __lasx_xvsra_w(y_h, sra);
+ y_lh = __lasx_xvshuf_b(y_h, y_l, mask);
+ y_lh = __lasx_xvpermi_d(y_lh, 0xD8);
+ __lasx_xvst(y_lh, (dst + i), 0);
+ }
+ if (width - i >= 8) {
+ __m256i _g, _b, _r;
+ __m256i g_l, b_l, r_l;
+ __m256i y_l, y;
+
+ _g = __lasx_xvldrepl_d((src0 + i), 0);
+ _b = __lasx_xvldrepl_d((src1 + i), 0);
+ _r = __lasx_xvldrepl_d((src2 + i), 0);
+ g_l = __lasx_vext2xv_wu_bu(_g);
+ b_l = __lasx_vext2xv_wu_bu(_b);
+ r_l = __lasx_vext2xv_wu_bu(_r);
+ y_l = __lasx_xvmadd_w(temp, ry, r_l);
+ y_l = __lasx_xvmadd_w(y_l, gy, g_l);
+ y_l = __lasx_xvmadd_w(y_l, by, b_l);
+ y_l = __lasx_xvsra_w(y_l, sra);
+ y = __lasx_xvshuf_b(y_l, y_l, mask);
+ __lasx_xvstelm_d(y, (dst + i), 0, 0);
+ __lasx_xvstelm_d(y, (dst + i), 8, 2);
+ i += 8;
+ }
+ for (; i < width; i++) {
+ int g = src[0][i];
+ int b = src[1][i];
+ int r = src[2][i];
+
+ dst[i] = (tem_ry * r + tem_gy * g + tem_by * b + set) >> shift;
+ }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
new file mode 100644
index 0000000000..197dc6e1e7
--- /dev/null
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen (chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/loongarch/cpu.h"
+
+av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (have_lasx(cpu_flags)) {
+ if (c->srcBpc == 8) {
+ if (c->dstBpc <= 14) {
+ c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
+ } else {
+ c->hyScale = c->hcScale = ff_hscale_8_to_19_lasx;
+ }
+ } else {
+ c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lasx
+ : ff_hscale_16_to_15_lasx;
+ }
+ switch (c->srcFormat) {
+ case AV_PIX_FMT_GBRAP:
+ case AV_PIX_FMT_GBRP:
+ {
+ c->readChrPlanar = planar_rgb_to_uv_lasx;
+ c->readLumPlanar = planar_rgb_to_y_lasx;
+ }
+ break;
+ }
+ }
+}
diff --git a/libswscale/loongarch/swscale_lasx.c b/libswscale/loongarch/swscale_lasx.c
new file mode 100644
index 0000000000..3e0bae2cc2
--- /dev/null
+++ b/libswscale/loongarch/swscale_lasx.c
@@ -0,0 +1,972 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen (chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "libavutil/intreadwrite.h"
+
+#define SCALE_8_16(_sh) \
+{ \
+ src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
+ src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
+ src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
+ src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
+ src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
+ src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
+ src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
+ src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
+ src8 = __lasx_xvldrepl_d(src + filterPos[8], 0); \
+ src9 = __lasx_xvldrepl_d(src + filterPos[9], 0); \
+ src10 = __lasx_xvldrepl_d(src + filterPos[10], 0); \
+ src11 = __lasx_xvldrepl_d(src + filterPos[11], 0); \
+ src12 = __lasx_xvldrepl_d(src + filterPos[12], 0); \
+ src13 = __lasx_xvldrepl_d(src + filterPos[13], 0); \
+ src14 = __lasx_xvldrepl_d(src + filterPos[14], 0); \
+ src15 = __lasx_xvldrepl_d(src + filterPos[15], 0); \
+ DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
+ filter, 96, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lasx_xvld, filter, 128, filter, 160, \
+ filter, 192, filter, 224, filter4, \
+ filter5, filter6, filter7); \
+ DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
+ src5, src4, src7, src6, src0, src2, src4, src6); \
+ DUP4_ARG2(__lasx_xvilvl_d, src9, src8, src11, src10, \
+ src13, src12, src15, src14, src8, src10, src12, src14); \
+ DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
+ src0, src2, src4, src6); \
+ DUP4_ARG1(__lasx_vext2xv_hu_bu, src8, src10, src12, \
+ src14, src8, src10, src12, src14); \
+ DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
+ filter2, src4, filter3, src6, src0, src1, src2, src3); \
+ DUP4_ARG2(__lasx_xvdp2_w_h, filter4, src8, filter5, src10, \
+ filter6, src12, filter7, src14, src4, src5, src6, src7);\
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src1 = __lasx_xvhaddw_d_w(src1, src1); \
+ src2 = __lasx_xvhaddw_d_w(src2, src2); \
+ src3 = __lasx_xvhaddw_d_w(src3, src3); \
+ src4 = __lasx_xvhaddw_d_w(src4, src4); \
+ src5 = __lasx_xvhaddw_d_w(src5, src5); \
+ src6 = __lasx_xvhaddw_d_w(src6, src6); \
+ src7 = __lasx_xvhaddw_d_w(src7, src7); \
+ DUP4_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, \
+ src5, src4, src7, src6, src0, src1, src2, src3); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src1 = __lasx_xvhaddw_d_w(src1, src1); \
+ src2 = __lasx_xvhaddw_d_w(src2, src2); \
+ src3 = __lasx_xvhaddw_d_w(src3, src3); \
+ src0 = __lasx_xvpickev_w(src1, src0); \
+ src1 = __lasx_xvpickev_w(src3, src2); \
+ src0 = __lasx_xvsrai_w(src0, _sh); \
+ src1 = __lasx_xvsrai_w(src1, _sh); \
+ src0 = __lasx_xvmin_w(src0, vmax); \
+ src1 = __lasx_xvmin_w(src1, vmax); \
+ src0 = __lasx_xvperm_w(src0, shuf); \
+ src1 = __lasx_xvperm_w(src1, shuf); \
+ src0 = __lasx_xvpickev_h(src1, src0); \
+ src0 = __lasx_xvpermi_d(src0, 0xd8); \
+ __lasx_xvst(src0, dst, 0); \
+ filterPos += 16; \
+ filter += 128; \
+ dst += 16; \
+}
+
+#define SCALE_8_8(_sh) \
+{ \
+ src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
+ src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
+ src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
+ src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
+ src4 = __lasx_xvldrepl_d(src + filterPos[4], 0); \
+ src5 = __lasx_xvldrepl_d(src + filterPos[5], 0); \
+ src6 = __lasx_xvldrepl_d(src + filterPos[6], 0); \
+ src7 = __lasx_xvldrepl_d(src + filterPos[7], 0); \
+ DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
+ filter, 96, filter0, filter1, filter2, filter3); \
+ filterPos += 8; \
+ filter += 64; \
+ DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, \
+ src5, src4, src7, src6, src0, src2, src4, src6); \
+ DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6, \
+ src0, src2, src4, src6); \
+ DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, \
+ filter2, src4, filter3, src6, src0, src1, src2, src3); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src1 = __lasx_xvhaddw_d_w(src1, src1); \
+ src2 = __lasx_xvhaddw_d_w(src2, src2); \
+ src3 = __lasx_xvhaddw_d_w(src3, src3); \
+ src0 = __lasx_xvpickev_w(src1, src0); \
+ src1 = __lasx_xvpickev_w(src3, src2); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src1 = __lasx_xvhaddw_d_w(src1, src1); \
+ src0 = __lasx_xvpickev_w(src1, src0); \
+ src0 = __lasx_xvsrai_w(src0, _sh); \
+ src0 = __lasx_xvmin_w(src0, vmax); \
+ src0 = __lasx_xvperm_w(src0, shuf); \
+}
+
+#define SCALE_8_4(_sh) \
+{ \
+ src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
+ src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
+ src2 = __lasx_xvldrepl_d(src + filterPos[2], 0); \
+ src3 = __lasx_xvldrepl_d(src + filterPos[3], 0); \
+ filter0 = __lasx_xvld(filter, 0); \
+ filter1 = __lasx_xvld(filter, 32); \
+ filterPos += 4; \
+ filter += 32; \
+ src0 = __lasx_xvilvl_d(src1, src0); \
+ src2 = __lasx_xvilvl_d(src3, src2); \
+ src0 = __lasx_vext2xv_hu_bu(src0); \
+ src2 = __lasx_vext2xv_hu_bu(src2); \
+ src0 = __lasx_xvdp2_w_h(src0, filter0); \
+ src1 = __lasx_xvdp2_w_h(src2, filter1); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src1 = __lasx_xvhaddw_d_w(src1, src1); \
+ src0 = __lasx_xvpickev_w(src1, src0); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src0 = __lasx_xvpickev_w(src0, src0); \
+ src0 = __lasx_xvsrai_w(src0, _sh); \
+ src0 = __lasx_xvmin_w(src0, vmax); \
+ src0 = __lasx_xvperm_w(src0, shuf); \
+}
+
+#define SCALE_8_2(_sh) \
+{ \
+ src0 = __lasx_xvldrepl_d(src + filterPos[0], 0); \
+ src1 = __lasx_xvldrepl_d(src + filterPos[1], 0); \
+ filter0 = __lasx_xvld(filter, 0); \
+ src0 = __lasx_xvilvl_d(src1, src0); \
+ src0 = __lasx_vext2xv_hu_bu(src0); \
+ src0 = __lasx_xvdp2_w_h(filter0, src0); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src0 = __lasx_xvhaddw_q_d(src0, src0); \
+ src0 = __lasx_xvsrai_w(src0, _sh); \
+ src0 = __lasx_xvmin_w(src0, vmax); \
+ dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
+ dst[1] = __lasx_xvpickve2gr_w(src0, 4); \
+ filterPos += 2; \
+ filter += 16; \
+ dst += 2; \
+}
+
+#define SCALE_4_16(_sh) \
+{ \
+ src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
+ src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
+ src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
+ src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
+ src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
+ src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
+ src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
+ src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
+ src8 = __lasx_xvldrepl_w(src + filterPos[8], 0); \
+ src9 = __lasx_xvldrepl_w(src + filterPos[9], 0); \
+ src10 = __lasx_xvldrepl_w(src + filterPos[10], 0); \
+ src11 = __lasx_xvldrepl_w(src + filterPos[11], 0); \
+ src12 = __lasx_xvldrepl_w(src + filterPos[12], 0); \
+ src13 = __lasx_xvldrepl_w(src + filterPos[13], 0); \
+ src14 = __lasx_xvldrepl_w(src + filterPos[14], 0); \
+ src15 = __lasx_xvldrepl_w(src + filterPos[15], 0); \
+ DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64, \
+ filter, 96, filter0, filter1, filter2, filter3); \
+ DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
+ src4, src7, src6, src0, src2, src4, src6); \
+ DUP4_ARG2(__lasx_xvilvl_w, src9, src8, src11, src10, src13, \
+ src12, src15, src14, src8, src10, src12, src14); \
+ DUP4_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src10, \
+ src8, src14, src12, src0, src1, src2, src3); \
+ DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src2, src3, \
+ src0, src1, src2, src3); \
+ DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1, \
+ filter2, src2, filter3, src3, src0, src1, src2, src3); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src1 = __lasx_xvhaddw_d_w(src1, src1); \
+ src2 = __lasx_xvhaddw_d_w(src2, src2); \
+ src3 = __lasx_xvhaddw_d_w(src3, src3); \
+ src0 = __lasx_xvpickev_w(src1, src0); \
+ src1 = __lasx_xvpickev_w(src3, src2); \
+ src0 = __lasx_xvsrai_w(src0, _sh); \
+ src1 = __lasx_xvsrai_w(src1, _sh); \
+ src0 = __lasx_xvmin_w(src0, vmax); \
+ src1 = __lasx_xvmin_w(src1, vmax); \
+ src0 = __lasx_xvpickev_h(src1, src0); \
+ src0 = __lasx_xvperm_w(src0, shuf); \
+ __lasx_xvst(src0, dst, 0); \
+ filterPos += 16; \
+ filter += 64; \
+ dst += 16; \
+}
+
+#define SCALE_4_8(_sh) \
+{ \
+ src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
+ src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
+ src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
+ src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
+ src4 = __lasx_xvldrepl_w(src + filterPos[4], 0); \
+ src5 = __lasx_xvldrepl_w(src + filterPos[5], 0); \
+ src6 = __lasx_xvldrepl_w(src + filterPos[6], 0); \
+ src7 = __lasx_xvldrepl_w(src + filterPos[7], 0); \
+ filter0 = __lasx_xvld(filter, 0); \
+ filter1 = __lasx_xvld(filter, 32); \
+ filterPos += 8; \
+ filter += 32; \
+ DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5, \
+ src4, src7, src6, src0, src2, src4, src6); \
+ src0 = __lasx_xvilvl_d(src2, src0); \
+ src1 = __lasx_xvilvl_d(src6, src4); \
+ \
+ src0 = __lasx_vext2xv_hu_bu(src0); \
+ src1 = __lasx_vext2xv_hu_bu(src1); \
+ src0 = __lasx_xvdp2_w_h(filter0, src0); \
+ src1 = __lasx_xvdp2_w_h(filter1, src1); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src1 = __lasx_xvhaddw_d_w(src1, src1); \
+ src0 = __lasx_xvpickev_w(src1, src0); \
+ src0 = __lasx_xvsrai_w(src0, _sh); \
+ src0 = __lasx_xvmin_w(src0, vmax); \
+}
+
+#define SCALE_4_4(_sh) \
+{ \
+ src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
+ src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
+ src2 = __lasx_xvldrepl_w(src + filterPos[2], 0); \
+ src3 = __lasx_xvldrepl_w(src + filterPos[3], 0); \
+ filter0 = __lasx_xvld(filter, 0); \
+ filterPos += 4; \
+ filter += 16; \
+ src0 = __lasx_xvilvl_w(src1, src0); \
+ src1 = __lasx_xvilvl_w(src3, src2); \
+ \
+ src0 = __lasx_xvilvl_d(src1, src0); \
+ src0 = __lasx_vext2xv_hu_bu(src0); \
+ src0 = __lasx_xvdp2_w_h(filter0, src0); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src0 = __lasx_xvsrai_w(src0, _sh); \
+ src0 = __lasx_xvmin_w(src0, vmax); \
+ src0 = __lasx_xvpickev_w(src0, src0); \
+ src0 = __lasx_xvpermi_d(src0, 0xd8); \
+}
+
+#define SCALE_4_2(_sh) \
+{ \
+ src0 = __lasx_xvldrepl_w(src + filterPos[0], 0); \
+ src1 = __lasx_xvldrepl_w(src + filterPos[1], 0); \
+ filter0 = __lasx_xvld(filter, 0); \
+ src0 = __lasx_xvilvl_w(src1, src0); \
+ src0 = __lasx_vext2xv_hu_bu(src0); \
+ src0 = __lasx_xvdp2_w_h(filter0, src0); \
+ src0 = __lasx_xvhaddw_d_w(src0, src0); \
+ src0 = __lasx_xvsrai_w(src0, _sh); \
+ src0 = __lasx_xvmin_w(src0, vmax); \
+ dst[0] = __lasx_xvpickve2gr_w(src0, 0); \
+ dst[1] = __lasx_xvpickve2gr_w(src0, 2); \
+ filterPos += 2; \
+ filter += 8; \
+ dst += 2; \
+}
+
+#define SCALE_16 \
+{ \
+ int dex = j << 1; \
+ src0 = __lasx_xvldrepl_d((srcPos1 + j), 0); \
+ src1 = __lasx_xvldrepl_d((srcPos2 + j), 0); \
+ src2 = __lasx_xvldrepl_d((srcPos3 + j), 0); \
+ src3 = __lasx_xvldrepl_d((srcPos4 + j), 0); \
+ DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
+ filterStart3, dex, filterStart4, dex, filter0, \
+ filter1, filter2, filter3); \
+ src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
+ src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
+ filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
+ filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
+ src0 = __lasx_xvilvl_b(zero, src0); \
+ src1 = __lasx_xvilvl_b(zero, src1); \
+ out0 = __lasx_xvdp2_w_h(filter0, src0); \
+ out1 = __lasx_xvdp2_w_h(filter1, src1); \
+ src0 = __lasx_xvhaddw_d_w(out0, out0); \
+ src1 = __lasx_xvhaddw_d_w(out1, out1); \
+ out0 = __lasx_xvpackev_d(src1, src0); \
+ out1 = __lasx_xvpackod_d(src1, src0); \
+ out0 = __lasx_xvadd_w(out0, out1); \
+ out = __lasx_xvadd_w(out, out0); \
+}
+
+void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ int max = (1 << 15) - 1;
+
+ if (filterSize == 8) {
+ __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m256i src8, src9, src10, src11, src12, src13, src14, src15;
+ __m256i filter0, filter1, filter2, filter3;
+ __m256i filter4, filter5, filter6, filter7;
+ __m256i vmax = __lasx_xvreplgr2vr_w(max);
+ __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+ 0x0000000600000002, 0x0000000700000003};
+ int len = dstW >> 4;
+ int res = dstW & 15;
+ while (len--) {
+ SCALE_8_16(7);
+ }
+ if (res & 8) {
+ SCALE_8_8(7);
+ src0 = __lasx_xvpickev_h(src0, src0);
+ __lasx_xvstelm_d(src0, dst, 0, 0);
+ __lasx_xvstelm_d(src0, dst, 8, 2);
+ dst += 8;
+ }
+ if (res & 4) {
+ SCALE_8_4(7);
+ src0 = __lasx_xvpickev_h(src0, src0);
+ __lasx_xvstelm_d(src0, dst, 0, 0);
+ dst += 4;
+ }
+ if (res & 2) {
+ SCALE_8_2(7);
+ }
+ if (res & 1) {
+ int val = 0;
+ src0 = __lasx_xvldrepl_d(src + filterPos[0], 0);
+ filter0 = __lasx_xvld(filter, 0);
+ src0 = __lasx_vext2xv_hu_bu(src0);
+ src0 = __lasx_xvdp2_w_h(filter0, src0);
+ src0 = __lasx_xvhaddw_d_w(src0, src0);
+ src0 = __lasx_xvhaddw_q_d(src0, src0);
+ val = __lasx_xvpickve2gr_w(src0, 0);
+ dst[0] = FFMIN(val >> 7, max);
+ }
+ } else if (filterSize == 4) {
+ __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m256i src8, src9, src10, src11, src12, src13, src14, src15;
+ __m256i filter0, filter1, filter2, filter3;
+ __m256i vmax = __lasx_xvreplgr2vr_w(max);
+ __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+ 0x0000000600000002, 0x0000000700000003};
+ int len = dstW >> 4;
+ int res = dstW & 15;
+ while (len--) {
+ SCALE_4_16(7);
+ }
+ if (res & 8) {
+ SCALE_4_8(7);
+ src0 = __lasx_xvpickev_h(src1, src0);
+ src0 = __lasx_xvperm_w(src0, shuf);
+ __lasx_xvstelm_d(src0, dst, 0, 0);
+ __lasx_xvstelm_d(src0, dst, 8, 1);
+ dst += 8;
+ }
+ if (res & 4) {
+ SCALE_4_4(7);
+ src0 = __lasx_xvpickev_h(src0, src0);
+ __lasx_xvstelm_d(src0, dst, 0, 0);
+ dst += 4;
+ }
+ if (res & 2) {
+ SCALE_4_2(7);
+ }
+ if (res & 1) {
+ int val = 0;
+ const uint8_t *srcPos = src + filterPos[0];
+
+ for (int j = 0; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[0] = FFMIN(val >> 7, max);
+ }
+ } else if (filterSize > 8) {
+ int filterlen = filterSize - 7;
+ int len = dstW >> 2;
+ int res = dstW & 3;
+ __m256i zero = __lasx_xvldi(0);
+
+ while (len--) {
+ __m256i src0, src1, src2, src3;
+ __m256i filter0, filter1, filter2, filter3, out0, out1;
+ __m256i out = zero;
+ const uint8_t *srcPos1 = src + filterPos[0];
+ const uint8_t *srcPos2 = src + filterPos[1];
+ const uint8_t *srcPos3 = src + filterPos[2];
+ const uint8_t *srcPos4 = src + filterPos[3];
+ const int16_t *filterStart1 = filter;
+ const int16_t *filterStart2 = filterStart1 + filterSize;
+ const int16_t *filterStart3 = filterStart2 + filterSize;
+ const int16_t *filterStart4 = filterStart3 + filterSize;
+ int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+ for (j = 0; j < filterlen; j += 8) {
+ SCALE_16
+ }
+ val1 = __lasx_xvpickve2gr_w(out, 0);
+ val2 = __lasx_xvpickve2gr_w(out, 4);
+ val3 = __lasx_xvpickve2gr_w(out, 2);
+ val4 = __lasx_xvpickve2gr_w(out, 6);
+ for (; j < filterSize; j++) {
+ val1 += ((int)srcPos1[j]) * filterStart1[j];
+ val2 += ((int)srcPos2[j]) * filterStart2[j];
+ val3 += ((int)srcPos3[j]) * filterStart3[j];
+ val4 += ((int)srcPos4[j]) * filterStart4[j];
+ }
+ dst[0] = FFMIN(val1 >> 7, max);
+ dst[1] = FFMIN(val2 >> 7, max);
+ dst[2] = FFMIN(val3 >> 7, max);
+ dst[3] = FFMIN(val4 >> 7, max);
+ dst += 4;
+ filterPos += 4;
+ filter = filterStart4 + filterSize;
+ }
+ for (i = 0; i < res; i++) {
+ int j, val = 0;
+ const uint8_t *srcPos = src + filterPos[i];
+ __m256i src1, filter0, out0;
+
+ for (j = 0; j < filterlen; j += 8) {
+ src1 = __lasx_xvldrepl_d((srcPos + j), 0);
+ filter0 = __lasx_xvld(filter + j, 0);
+ src1 = __lasx_xvilvl_b(zero, src1);
+ out0 = __lasx_xvdp2_w_h(filter0, src1);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvhaddw_q_d(out0, out0);
+ val += __lasx_xvpickve2gr_w(out0, 0);
+ }
+ for (; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> 7, max);
+ filter += filterSize;
+ }
+ } else {
+ for (i = 0; i < dstW; i++) {
+ int val = 0;
+ const uint8_t *srcPos = src + filterPos[i];
+
+ for (int j = 0; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> 7, max);
+ filter += filterSize;
+ }
+ }
+}
+
+void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ int max = (1 << 19) - 1;
+ int32_t *dst = (int32_t *) _dst;
+
+ if (filterSize == 8) {
+ __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m256i filter0, filter1, filter2, filter3;
+ __m256i vmax = __lasx_xvreplgr2vr_w(max);
+ __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+ 0x0000000600000002, 0x0000000700000003};
+ int len = dstW >> 3;
+ int res = dstW & 7;
+ while (len--) {
+ SCALE_8_8(3);
+ __lasx_xvst(src0, dst, 0);
+ dst += 8;
+ }
+ if (res & 4) {
+ SCALE_8_4(3);
+ __lasx_xvstelm_d(src0, dst, 0, 0);
+ __lasx_xvstelm_d(src0, dst, 8, 1);
+ dst += 4;
+ }
+ if (res & 2) {
+ SCALE_8_2(3);
+ }
+ if (res & 1) {
+ int val = 0;
+ __m256i src0, filter0, out0;
+
+ src0 = __lasx_xvldrepl_d(src + filterPos[0], 0);
+ filter0 = __lasx_xvld(filter, 0);
+ src0 = __lasx_vext2xv_hu_bu(src0);
+ out0 = __lasx_xvdp2_w_h(filter0, src0);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvhaddw_q_d(out0, out0);
+ val = __lasx_xvpickve2gr_w(out0, 0);
+ dst[0] = FFMIN(val >> 3, max);
+ }
+ } else if (filterSize == 4) {
+ __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m256i filter0, filter1;
+ __m256i vmax = __lasx_xvreplgr2vr_w(max);
+ __m256i shuf = {0x0000000100000000, 0x0000000500000004,
+ 0x0000000300000002, 0x0000000700000006};
+ int len = dstW >> 3;
+ int res = dstW & 7;
+ while (len--) {
+ SCALE_4_8(3);
+ src0 = __lasx_xvperm_w(src0, shuf);
+ __lasx_xvst(src0, dst, 0);
+ dst += 8;
+ }
+ if (res & 4) {
+ SCALE_4_4(3);
+ __lasx_xvstelm_d(src0, dst, 0, 0);
+ __lasx_xvstelm_d(src0, dst, 8, 1);
+ dst += 4;
+ }
+ if (res & 2) {
+ SCALE_4_2(3);
+ }
+ if (res & 1) {
+ int val = 0;
+ const uint8_t *srcPos = src + filterPos[0];
+
+ for (int j = 0; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[0] = FFMIN(val >> 3, max);
+ }
+ } else if (filterSize > 8) {
+ int len = dstW >> 2;
+ int res = dstW & 3;
+ int filterlen = filterSize - 7;
+ __m256i zero = __lasx_xvldi(0);
+
+ while (len--) {
+ __m256i src0, src1, src2, src3;
+ __m256i filter0, filter1, filter2, filter3, out0, out1;
+ __m256i out = zero;
+ const uint8_t *srcPos1 = src + filterPos[0];
+ const uint8_t *srcPos2 = src + filterPos[1];
+ const uint8_t *srcPos3 = src + filterPos[2];
+ const uint8_t *srcPos4 = src + filterPos[3];
+ const int16_t *filterStart1 = filter;
+ const int16_t *filterStart2 = filterStart1 + filterSize;
+ const int16_t *filterStart3 = filterStart2 + filterSize;
+ const int16_t *filterStart4 = filterStart3 + filterSize;
+ int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+ for (j = 0; j < filterlen; j += 8) {
+ SCALE_16
+ }
+ val1 = __lasx_xvpickve2gr_w(out, 0);
+ val2 = __lasx_xvpickve2gr_w(out, 4);
+ val3 = __lasx_xvpickve2gr_w(out, 2);
+ val4 = __lasx_xvpickve2gr_w(out, 6);
+ for (; j < filterSize; j++) {
+ val1 += ((int)srcPos1[j]) * filterStart1[j];
+ val2 += ((int)srcPos2[j]) * filterStart2[j];
+ val3 += ((int)srcPos3[j]) * filterStart3[j];
+ val4 += ((int)srcPos4[j]) * filterStart4[j];
+ }
+ dst[0] = FFMIN(val1 >> 3, max);
+ dst[1] = FFMIN(val2 >> 3, max);
+ dst[2] = FFMIN(val3 >> 3, max);
+ dst[3] = FFMIN(val4 >> 3, max);
+ dst += 4;
+ filterPos += 4;
+ filter = filterStart4 + filterSize;
+ }
+ for (i = 0; i < res; i++) {
+ int j, val = 0;
+ const uint8_t *srcPos = src + filterPos[i];
+ __m256i src1, filter0, out0;
+
+ for (j = 0; j < filterlen; j += 8) {
+ src1 = __lasx_xvldrepl_d((srcPos + j), 0);
+ filter0 = __lasx_xvld(filter + j, 0);
+ src1 = __lasx_xvilvl_b(zero, src1);
+ out0 = __lasx_xvdp2_w_h(filter0, src1);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvhaddw_q_d(out0, out0);
+ val += __lasx_xvpickve2gr_w(out0, 0);
+ }
+ for (; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> 3, max);
+ filter += filterSize;
+ }
+ } else {
+ for (i = 0; i < dstW; i++) {
+ int val = 0;
+ const uint8_t *srcPos = src + filterPos[i];
+
+ for (int j = 0; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> 3, max);
+ filter += filterSize;
+ }
+ }
+}
+
+#undef SCALE_16
+
+#define SCALE_8 \
+{ \
+ __m256i src0, src1, src2, src3, filter0, filter1, out0, out1; \
+ DUP4_ARG2(__lasx_xvld, src + filterPos[0], 0, src + filterPos[1], 0, \
+ src + filterPos[2], 0, src + filterPos[3], 0, src0, src1, src2,\
+ src3); \
+ filter0 = __lasx_xvld(filter, 0); \
+ filter1 = __lasx_xvld(filter, 32); \
+ src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
+ src2 = __lasx_xvpermi_q(src2, src3, 0x02); \
+ out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
+ out1 = __lasx_xvdp2_w_hu_h(src2, filter1); \
+ src0 = __lasx_xvhaddw_d_w(out0, out0); \
+ src1 = __lasx_xvhaddw_d_w(out1, out1); \
+ out0 = __lasx_xvpackev_d(src1, src0); \
+ out1 = __lasx_xvpackod_d(src1, src0); \
+ out0 = __lasx_xvadd_w(out0, out1); \
+ out0 = __lasx_xvsra_w(out0, shift); \
+ out0 = __lasx_xvmin_w(out0, v_max); \
+ dst[0] = __lasx_xvpickve2gr_w(out0, 0); \
+ dst[1] = __lasx_xvpickve2gr_w(out0, 4); \
+ dst[2] = __lasx_xvpickve2gr_w(out0, 2); \
+ dst[3] = __lasx_xvpickve2gr_w(out0, 6); \
+ filterPos += 4; \
+ filter += 32; \
+ dst += 4; \
+}
+
+#define SCALE_16 \
+{ \
+ int dex = j << 1; \
+ DUP4_ARG2(__lasx_xvldx, srcPos1, dex, srcPos2, dex, srcPos3, dex, \
+ srcPos4, dex, src0, src1, src2, src3); \
+ DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex, \
+ filterStart3, dex, filterStart4, dex, filter0, \
+ filter1, filter2, filter3); \
+ src0 = __lasx_xvpermi_q(src0, src1, 0x02); \
+ src1 = __lasx_xvpermi_q(src2, src3, 0x02); \
+ filter0 = __lasx_xvpermi_q(filter0, filter1, 0x02); \
+ filter1 = __lasx_xvpermi_q(filter2, filter3, 0x02); \
+ out0 = __lasx_xvdp2_w_hu_h(src0, filter0); \
+ out1 = __lasx_xvdp2_w_hu_h(src1, filter1); \
+ src0 = __lasx_xvhaddw_d_w(out0, out0); \
+ src1 = __lasx_xvhaddw_d_w(out1, out1); \
+ out0 = __lasx_xvpackev_d(src1, src0); \
+ out1 = __lasx_xvpackod_d(src1, src0); \
+ out0 = __lasx_xvadd_w(out0, out1); \
+ out = __lasx_xvadd_w(out, out0); \
+}
+
+void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+ int i;
+ const uint16_t *src = (const uint16_t *) _src;
+ int sh = desc->comp[0].depth - 1;
+ int max = (1 << 15) - 1;
+ int len = dstW >> 2;
+ int res = dstW & 3;
+ __m256i shift;
+ __m256i zero = __lasx_xvldi(0);
+
+ if (sh < 15) {
+ sh = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8 ? 13 :
+ (desc->comp[0].depth - 1);
+ } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
+ sh = 15;
+ }
+ shift = __lasx_xvreplgr2vr_w(sh);
+
+ if (filterSize == 8) {
+ __m256i v_max = __lasx_xvreplgr2vr_w(max);
+ for (i = 0; i < len; i++) {
+ SCALE_8
+ }
+ for (i = 0; i < res; i++) {
+ int val = 0;
+ __m256i src0, filter0, out0;
+
+ src0 = __lasx_xvld(src + filterPos[i], 0);
+ filter0 = __lasx_xvld(filter, 0);
+ out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvhaddw_q_d(out0, out0);
+ val = __lasx_xvpickve2gr_w(out0, 0);
+ dst[i] = FFMIN(val >> sh, max);
+ filter += 8;
+ }
+ } else if (filterSize == 4) {
+ __m256i v_max = __lasx_xvreplgr2vr_w(max);
+ for (i = 0; i < len; i++) {
+ __m256i src1, src2, src3, src4, src0, filter0, out0;
+
+ src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
+ src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
+ src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
+ src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
+ filter0 = __lasx_xvld(filter, 0);
+ src1 = __lasx_xvextrins_d(src1, src2, 0x10);
+ src3 = __lasx_xvextrins_d(src3, src4, 0x10);
+ src0 = __lasx_xvpermi_q(src1, src3, 0x02);
+ out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvsra_w(out0, shift);
+ out0 = __lasx_xvmin_w(out0, v_max);
+ dst[0] = __lasx_xvpickve2gr_w(out0, 0);
+ dst[1] = __lasx_xvpickve2gr_w(out0, 2);
+ dst[2] = __lasx_xvpickve2gr_w(out0, 4);
+ dst[3] = __lasx_xvpickve2gr_w(out0, 6);
+ dst += 4;
+ filterPos += 4;
+ filter += 16;
+ }
+ for (i = 0; i < res; i++) {
+ int val = 0;
+ const uint16_t *srcPos = src + filterPos[i];
+
+ for (int j = 0; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> sh, max);
+ filter += 4;
+ }
+ } else if (filterSize > 8) {
+ int filterlen = filterSize - 7;
+
+ for (i = 0; i < len; i++) {
+ __m256i src0, src1, src2, src3;
+ __m256i filter0, filter1, filter2, filter3, out0, out1;
+ __m256i out = zero;
+ const uint16_t *srcPos1 = src + filterPos[0];
+ const uint16_t *srcPos2 = src + filterPos[1];
+ const uint16_t *srcPos3 = src + filterPos[2];
+ const uint16_t *srcPos4 = src + filterPos[3];
+ const int16_t *filterStart1 = filter;
+ const int16_t *filterStart2 = filterStart1 + filterSize;
+ const int16_t *filterStart3 = filterStart2 + filterSize;
+ const int16_t *filterStart4 = filterStart3 + filterSize;
+ int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+ for (j = 0; j < filterlen; j += 8) {
+ SCALE_16
+ }
+ val1 = __lasx_xvpickve2gr_w(out, 0);
+ val2 = __lasx_xvpickve2gr_w(out, 4);
+ val3 = __lasx_xvpickve2gr_w(out, 2);
+ val4 = __lasx_xvpickve2gr_w(out, 6);
+ for (; j < filterSize; j++) {
+ val1 += ((int)srcPos1[j]) * filterStart1[j];
+ val2 += ((int)srcPos2[j]) * filterStart2[j];
+ val3 += ((int)srcPos3[j]) * filterStart3[j];
+ val4 += ((int)srcPos4[j]) * filterStart4[j];
+ }
+ dst[0] = FFMIN(val1 >> sh, max);
+ dst[1] = FFMIN(val2 >> sh, max);
+ dst[2] = FFMIN(val3 >> sh, max);
+ dst[3] = FFMIN(val4 >> sh, max);
+ dst += 4;
+ filterPos += 4;
+ filter = filterStart4 + filterSize;
+ }
+ for (i = 0; i < res; i++) {
+ int j, val = 0;
+ const uint16_t *srcPos = src + filterPos[i];
+ __m256i src0, filter0, out0;
+
+ for (j = 0; j < filterlen; j += 8) {
+ int dex = j << 1;
+ src0 = __lasx_xvldx(srcPos, dex);
+ filter0 = __lasx_xvldx(filter, dex);
+ out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvhaddw_q_d(out0, out0);
+ val += __lasx_xvpickve2gr_w(out0, 0);
+ }
+ for (; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> sh, max);
+ filter += filterSize;
+ }
+ } else {
+ for (i = 0; i < dstW; i++) {
+ int val = 0;
+ const uint16_t *srcPos = src + filterPos[i];
+
+ for (int j = 0; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> sh, max);
+ filter += filterSize;
+ }
+ }
+}
+
+void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ const uint16_t *src = (const uint16_t *) _src;
+ int sh = desc->comp[0].depth - 5;
+ int max = (1 << 19) - 1;
+ int len = dstW >> 2;
+ int res = dstW & 3;
+ __m256i shift;
+ __m256i zero = __lasx_xvldi(0);
+
+ if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8)
+ && desc->comp[0].depth < 16) {
+ sh = 9;
+ } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
+ sh = 11;
+ }
+ shift = __lasx_xvreplgr2vr_w(sh);
+
+ if (filterSize == 8) {
+ __m256i v_max = __lasx_xvreplgr2vr_w(max);
+ for (i = 0; i < len; i++) {
+ SCALE_8
+ }
+ for (i = 0; i < res; i++) {
+ int val = 0;
+ __m256i src0, filter0, out0;
+
+ src0 = __lasx_xvld(src + filterPos[i], 0);
+ filter0 = __lasx_xvld(filter, 0);
+ out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvhaddw_q_d(out0, out0);
+ val = __lasx_xvpickve2gr_w(out0, 0);
+ dst[i] = FFMIN(val >> sh, max);
+ filter += 8;
+ }
+ } else if (filterSize == 4) {
+ __m256i v_max = __lasx_xvreplgr2vr_w(max);
+ for (i = 0; i < len; i++) {
+ __m256i src1, src2, src3, src4, src0, filter0, out0;
+
+ src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
+ src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
+ src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
+ src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
+ filter0 = __lasx_xvld(filter, 0);
+ src1 = __lasx_xvextrins_d(src1, src2, 0x10);
+ src3 = __lasx_xvextrins_d(src3, src4, 0x10);
+ src0 = __lasx_xvpermi_q(src1, src3, 0x02);
+ out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvsra_w(out0, shift);
+ out0 = __lasx_xvmin_w(out0, v_max);
+ dst[0] = __lasx_xvpickve2gr_w(out0, 0);
+ dst[1] = __lasx_xvpickve2gr_w(out0, 2);
+ dst[2] = __lasx_xvpickve2gr_w(out0, 4);
+ dst[3] = __lasx_xvpickve2gr_w(out0, 6);
+ dst += 4;
+ filterPos += 4;
+ filter += 16;
+ }
+ for (i = 0; i < res; i++) {
+ int val = 0;
+ const uint16_t *srcPos = src + filterPos[i];
+
+ for (int j = 0; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> sh, max);
+ filter += 4;
+ }
+ } else if (filterSize > 8) {
+ int filterlen = filterSize - 7;
+
+ for (i = 0; i < len; i++) {
+ __m256i src0, src1, src2, src3;
+ __m256i filter0, filter1, filter2, filter3, out0, out1;
+ __m256i out = zero;
+ const uint16_t *srcPos1 = src + filterPos[0];
+ const uint16_t *srcPos2 = src + filterPos[1];
+ const uint16_t *srcPos3 = src + filterPos[2];
+ const uint16_t *srcPos4 = src + filterPos[3];
+ const int16_t *filterStart1 = filter;
+ const int16_t *filterStart2 = filterStart1 + filterSize;
+ const int16_t *filterStart3 = filterStart2 + filterSize;
+ const int16_t *filterStart4 = filterStart3 + filterSize;
+ int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+ for (j = 0; j < filterlen; j += 8) {
+ SCALE_16
+ }
+ val1 = __lasx_xvpickve2gr_w(out, 0);
+ val2 = __lasx_xvpickve2gr_w(out, 4);
+ val3 = __lasx_xvpickve2gr_w(out, 2);
+ val4 = __lasx_xvpickve2gr_w(out, 6);
+ for (; j < filterSize; j++) {
+ val1 += ((int)srcPos1[j]) * filterStart1[j];
+ val2 += ((int)srcPos2[j]) * filterStart2[j];
+ val3 += ((int)srcPos3[j]) * filterStart3[j];
+ val4 += ((int)srcPos4[j]) * filterStart4[j];
+ }
+ dst[0] = FFMIN(val1 >> sh, max);
+ dst[1] = FFMIN(val2 >> sh, max);
+ dst[2] = FFMIN(val3 >> sh, max);
+ dst[3] = FFMIN(val4 >> sh, max);
+ dst += 4;
+ filterPos += 4;
+ filter = filterStart4 + filterSize;
+ }
+ for (i = 0; i < res; i++) {
+ int j, val = 0;
+ const uint16_t *srcPos = src + filterPos[i];
+ __m256i src0, filter0, out0;
+
+ for (j = 0; j < filterlen; j += 8) {
+ int dex = j << 1;
+ src0 = __lasx_xvldx(srcPos, dex);
+ filter0 = __lasx_xvldx(filter, dex);
+ out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+ out0 = __lasx_xvhaddw_d_w(out0, out0);
+ out0 = __lasx_xvhaddw_q_d(out0, out0);
+ val += __lasx_xvpickve2gr_w(out0, 0);
+ }
+ for (; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> sh, max);
+ filter += filterSize;
+ }
+ } else {
+ for (i = 0; i < dstW; i++) {
+ int val = 0;
+ const uint16_t *srcPos = src + filterPos[i];
+
+ for (int j = 0; j < filterSize; j++) {
+ val += ((int)srcPos[j]) * filter[j];
+ }
+ dst[i] = FFMIN(val >> sh, max);
+ filter += filterSize;
+ }
+ }
+}
+
+#undef SCALE_8
+#undef SCALE_16
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
new file mode 100644
index 0000000000..790304a01c
--- /dev/null
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H
+#define SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+ int width, int32_t *rgb2yuv, void *opq);
+
+void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
+ int32_t *rgb2yuv, void *opq);
+
+#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 7b40f49da4..367d045a02 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -598,6 +598,8 @@ void ff_sws_init_scale(SwsContext *c)
ff_sws_init_swscale_aarch64(c);
#elif ARCH_ARM
ff_sws_init_swscale_arm(c);
+#elif ARCH_LOONGARCH64
+ ff_sws_init_swscale_loongarch(c);
#endif
}
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 6c14ce8536..abeebbb002 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ av_cold void ff_sws_init_range_convert(SwsContext *c);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
+SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c);
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
{
@@ -983,6 +984,7 @@ void ff_sws_init_swscale_vsx(SwsContext *c);
void ff_sws_init_swscale_x86(SwsContext *c);
void ff_sws_init_swscale_aarch64(SwsContext *c);
void ff_sws_init_swscale_arm(SwsContext *c);
+void ff_sws_init_swscale_loongarch(SwsContext *c);
void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index a5a9bc589a..0dde54b65b 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -53,6 +53,7 @@
#include "libavutil/ppc/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
+#include "libavutil/loongarch/cpu.h"
#include "rgb2rgb.h"
#include "swscale.h"
@@ -659,6 +660,15 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
filterAlign = 1;
}
+ if (have_lasx(cpu_flags)) {
+ int reNum = minFilterSize & (0x07);
+
+ if (minFilterSize < 5)
+ filterAlign = 4;
+ if (reNum < 3)
+ filterAlign = 1;
+ }
+
av_assert0(minFilterSize > 0);
filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
av_assert0(filterSize > 0);
@@ -1844,7 +1854,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
{
const int filterAlign = X86_MMX(cpu_flags) ? 4 :
PPC_ALTIVEC(cpu_flags) ? 8 :
- have_neon(cpu_flags) ? 4 : 1;
+ have_neon(cpu_flags) ? 4 :
+ have_lasx(cpu_flags) ? 8 : 1;
if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
&c->hLumFilterSize, c->lumXInc,
--
2.20.1
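For context, every LASX branch in this patch reduces to the same scalar tail loop that recurs throughout the file: a per-pixel dot product over filterSize taps, shifted and clamped. A minimal standalone sketch of the 8-bit to 19-bit variant (the function name and the values below are illustrative, not part of the patch):

```c
#include <stdint.h>

#define FFMIN(a, b) ((a) < (b) ? (a) : (b))

/* Scalar model of the horizontal scale the LASX code vectorizes: each
 * output sample is a dot product of filterSize source bytes with 16-bit
 * filter taps, shifted right by 3 and clamped to 19 bits, matching the
 * scalar tail loops in the patch. */
static void hscale_8_to_19_ref(int32_t *dst, int dstW, const uint8_t *src,
                               const int16_t *filter,
                               const int32_t *filterPos, int filterSize)
{
    const int max = (1 << 19) - 1;
    for (int i = 0; i < dstW; i++) {
        const uint8_t *srcPos = src + filterPos[i];
        int val = 0;
        for (int j = 0; j < filterSize; j++)
            val += srcPos[j] * filter[j];
        dst[i] = FFMIN(val >> 3, max);
        filter += filterSize; /* one filter row per output pixel */
    }
}
```

The SIMD paths compute the same sums for several output pixels at once using `__lasx_xvdp2_w_h` dot products followed by horizontal adds.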
* [FFmpeg-devel] [PATCH v5 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c files
2022-09-09 9:00 [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib Hao Chen
2022-09-09 9:00 ` [FFmpeg-devel] [PATCH v5 1/3] swscale/la: Optimize hscale functions with lasx Hao Chen
@ 2022-09-09 9:00 ` Hao Chen
2022-09-09 9:00 ` [FFmpeg-devel] [PATCH v5 3/3] swscale/la: Add output_lasx.c file Hao Chen
2022-09-09 9:43 ` [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib yinshiyou-hf
3 siblings, 0 replies; 6+ messages in thread
From: Hao Chen @ 2022-09-09 9:00 UTC (permalink / raw)
To: ffmpeg-devel
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -pix_fmt rgb24 -y /dev/null -an
before: 178fps
after: 210fps
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
libswscale/loongarch/Makefile | 2 +
libswscale/loongarch/rgb2rgb_lasx.c | 52 +++
libswscale/loongarch/swscale_init_loongarch.c | 42 +++
libswscale/loongarch/swscale_loongarch.h | 22 ++
libswscale/loongarch/yuv2rgb_lasx.c | 321 ++++++++++++++++++
libswscale/rgb2rgb.c | 2 +
libswscale/rgb2rgb.h | 1 +
libswscale/yuv2rgb.c | 2 +
8 files changed, 444 insertions(+)
create mode 100644 libswscale/loongarch/rgb2rgb_lasx.c
create mode 100644 libswscale/loongarch/yuv2rgb_lasx.c
diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index 586a1717b6..4345971514 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -1,3 +1,5 @@
OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_init_loongarch.o
LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
loongarch/input_lasx.o \
+ loongarch/yuv2rgb_lasx.o \
+ loongarch/rgb2rgb_lasx.o
diff --git a/libswscale/loongarch/rgb2rgb_lasx.c b/libswscale/loongarch/rgb2rgb_lasx.c
new file mode 100644
index 0000000000..1b6be90217
--- /dev/null
+++ b/libswscale/loongarch/rgb2rgb_lasx.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int width, int height,
+ int src1Stride, int src2Stride, int dstStride)
+{
+ int h;
+ int len = width & (0xFFFFFFF0);
+
+ for (h = 0; h < height; h++) {
+ int w, index = 0;
+ __m256i src_1, src_2, dst;
+
+ for (w = 0; w < len; w += 16) {
+ DUP2_ARG2(__lasx_xvld, src1 + w, 0, src2 + w, 0, src_1, src_2);
+ src_1 = __lasx_xvpermi_d(src_1, 0xD8);
+ src_2 = __lasx_xvpermi_d(src_2, 0xD8);
+ dst = __lasx_xvilvl_b(src_2, src_1);
+ __lasx_xvst(dst, dest + index, 0);
+ index += 32;
+ }
+ for (; w < width; w++) {
+ dest[(w << 1) + 0] = src1[w];
+ dest[(w << 1) + 1] = src2[w];
+ }
+ dest += dstStride;
+ src1 += src1Stride;
+ src2 += src2Stride;
+ }
+}
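The scalar semantics of interleaveBytes, which the vector loop above performs 16 source pixels (32 output bytes) per iteration, are simply alternating bytes from the two input planes. A plain-C reference sketch (name illustrative, not part of the patch):

```c
#include <stdint.h>

/* Scalar model of interleaveBytes: merge two byte planes into one plane
 * of alternating bytes, row by row with independent strides. */
static void interleave_bytes_ref(const uint8_t *src1, const uint8_t *src2,
                                 uint8_t *dest, int width, int height,
                                 int src1Stride, int src2Stride,
                                 int dstStride)
{
    for (int h = 0; h < height; h++) {
        for (int w = 0; w < width; w++) {
            dest[2 * w]     = src1[w];
            dest[2 * w + 1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
}
```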
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 197dc6e1e7..1e0bb1b116 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -21,6 +21,7 @@
#include "swscale_loongarch.h"
#include "libswscale/swscale_internal.h"
+#include "libswscale/rgb2rgb.h"
#include "libavutil/loongarch/cpu.h"
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
@@ -48,3 +49,44 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
}
}
}
+
+av_cold void rgb2rgb_init_loongarch(void)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (have_lasx(cpu_flags))
+ interleaveBytes = ff_interleave_bytes_lasx;
+}
+
+av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (have_lasx(cpu_flags)) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB24:
+ return yuv420_rgb24_lasx;
+ case AV_PIX_FMT_BGR24:
+ return yuv420_bgr24_lasx;
+ case AV_PIX_FMT_RGBA:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+ break;
+ } else
+ return yuv420_rgba32_lasx;
+ case AV_PIX_FMT_ARGB:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+ break;
+ } else
+ return yuv420_argb32_lasx;
+ case AV_PIX_FMT_BGRA:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+ break;
+ } else
+ return yuv420_bgra32_lasx;
+ case AV_PIX_FMT_ABGR:
+ if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) {
+ break;
+ } else
+ return yuv420_abgr32_lasx;
+ }
+ }
+ return NULL;
+}
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 790304a01c..f5afbd7633 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -47,4 +47,26 @@ void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4]
void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
int32_t *rgb2yuv, void *opq);
+int yuv420_rgb24_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgr24_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_rgba32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgra32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_argb32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_abgr32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+ int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int width, int height,
+ int src1Stride, int src2Stride, int dstStride);
+
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/loongarch/yuv2rgb_lasx.c b/libswscale/loongarch/yuv2rgb_lasx.c
new file mode 100644
index 0000000000..64e434f50c
--- /dev/null
+++ b/libswscale/loongarch/yuv2rgb_lasx.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+#define YUV2RGB_LOAD_COE \
+ /* Load x_offset */ \
+ __m256i y_offset = __lasx_xvreplgr2vr_d(c->yOffset); \
+ __m256i u_offset = __lasx_xvreplgr2vr_d(c->uOffset); \
+ __m256i v_offset = __lasx_xvreplgr2vr_d(c->vOffset); \
+ /* Load x_coeff */ \
+ __m256i ug_coeff = __lasx_xvreplgr2vr_d(c->ugCoeff); \
+ __m256i vg_coeff = __lasx_xvreplgr2vr_d(c->vgCoeff); \
+ __m256i y_coeff = __lasx_xvreplgr2vr_d(c->yCoeff); \
+ __m256i ub_coeff = __lasx_xvreplgr2vr_d(c->ubCoeff); \
+ __m256i vr_coeff = __lasx_xvreplgr2vr_d(c->vrCoeff); \
+
+#define LOAD_YUV_16 \
+ m_y1 = __lasx_xvld(py_1, 0); \
+ m_y2 = __lasx_xvld(py_2, 0); \
+ m_u = __lasx_xvldrepl_d(pu, 0); \
+ m_v = __lasx_xvldrepl_d(pv, 0); \
+ m_u = __lasx_xvilvl_b(m_u, m_u); \
+ m_v = __lasx_xvilvl_b(m_v, m_v); \
+ DUP4_ARG1(__lasx_vext2xv_hu_bu, m_y1, m_y2, m_u, m_v, \
+ m_y1, m_y2, m_u, m_v); \
+
+/* YUV2RGB method
+ * The conversion method is as follows:
+ * R = Y' * y_coeff + V' * vr_coeff
+ * G = Y' * y_coeff + V' * vg_coeff + U' * ug_coeff
+ * B = Y' * y_coeff + U' * ub_coeff
+ *
+ * where X' = X * 8 - x_offset
+ *
+ */
+
+#define YUV2RGB \
+ m_y1 = __lasx_xvslli_h(m_y1, 3); \
+ m_y2 = __lasx_xvslli_h(m_y2, 3); \
+ m_u = __lasx_xvslli_h(m_u, 3); \
+ m_v = __lasx_xvslli_h(m_v, 3); \
+ m_y1 = __lasx_xvsub_h(m_y1, y_offset); \
+ m_y2 = __lasx_xvsub_h(m_y2, y_offset); \
+ m_u = __lasx_xvsub_h(m_u, u_offset); \
+ m_v = __lasx_xvsub_h(m_v, v_offset); \
+ y_1 = __lasx_xvmuh_h(m_y1, y_coeff); \
+ y_2 = __lasx_xvmuh_h(m_y2, y_coeff); \
+ u2g = __lasx_xvmuh_h(m_u, ug_coeff); \
+ u2b = __lasx_xvmuh_h(m_u, ub_coeff); \
+ v2r = __lasx_xvmuh_h(m_v, vr_coeff); \
+ v2g = __lasx_xvmuh_h(m_v, vg_coeff); \
+ r1 = __lasx_xvsadd_h(y_1, v2r); \
+ v2g = __lasx_xvsadd_h(v2g, u2g); \
+ g1 = __lasx_xvsadd_h(y_1, v2g); \
+ b1 = __lasx_xvsadd_h(y_1, u2b); \
+ r2 = __lasx_xvsadd_h(y_2, v2r); \
+ g2 = __lasx_xvsadd_h(y_2, v2g); \
+ b2 = __lasx_xvsadd_h(y_2, u2b); \
+ DUP4_ARG1(__lasx_xvclip255_h, r1, g1, b1, r2, r1, g1, b1, r2); \
+ DUP2_ARG1(__lasx_xvclip255_h, g2, b2, g2, b2); \
+
+#define YUV2RGB_RES \
+ m_y1 = __lasx_xvldrepl_d(py_1, 0); \
+ m_y2 = __lasx_xvldrepl_d(py_2, 0); \
+ m_u = __lasx_xvldrepl_d(pu, 0); \
+ m_v = __lasx_xvldrepl_d(pv, 0); \
+ m_y1 = __lasx_xvilvl_d(m_y2, m_y1); \
+ m_u = __lasx_xvilvl_b(m_u, m_u); \
+ m_v = __lasx_xvilvl_b(m_v, m_v); \
+ m_y1 = __lasx_vext2xv_hu_bu(m_y1); \
+ m_u = __lasx_vext2xv_hu_bu(m_u); \
+ m_v = __lasx_vext2xv_hu_bu(m_v); \
+ m_y1 = __lasx_xvslli_h(m_y1, 3); \
+ m_u = __lasx_xvslli_h(m_u, 3); \
+ m_v = __lasx_xvslli_h(m_v, 3); \
+ m_y1 = __lasx_xvsub_h(m_y1, y_offset); \
+ m_u = __lasx_xvsub_h(m_u, u_offset); \
+ m_v = __lasx_xvsub_h(m_v, v_offset); \
+ y_1 = __lasx_xvmuh_h(m_y1, y_coeff); \
+ u2g = __lasx_xvmuh_h(m_u, ug_coeff); \
+ u2b = __lasx_xvmuh_h(m_u, ub_coeff); \
+ v2r = __lasx_xvmuh_h(m_v, vr_coeff); \
+ v2g = __lasx_xvmuh_h(m_v, vg_coeff); \
+ r1 = __lasx_xvsadd_h(y_1, v2r); \
+ v2g = __lasx_xvsadd_h(v2g, u2g); \
+ g1 = __lasx_xvsadd_h(y_1, v2g); \
+ b1 = __lasx_xvsadd_h(y_1, u2b); \
+ r1 = __lasx_xvclip255_h(r1); \
+ g1 = __lasx_xvclip255_h(g1); \
+ b1 = __lasx_xvclip255_h(b1); \
+
+#define RGB_PACK(r, g, b, rgb_l, rgb_h) \
+{ \
+ __m256i rg; \
+ rg = __lasx_xvpackev_b(g, r); \
+ DUP2_ARG3(__lasx_xvshuf_b, b, rg, shuf2, b, rg, shuf3, rgb_l, rgb_h); \
+}
+
+#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h) \
+{ \
+ __m256i ra, bg, tmp0, tmp1; \
+ ra = __lasx_xvpackev_b(r, a); \
+ bg = __lasx_xvpackev_b(b, g); \
+ tmp0 = __lasx_xvilvl_h(bg, ra); \
+ tmp1 = __lasx_xvilvh_h(bg, ra); \
+ rgb_l = __lasx_xvpermi_q(tmp1, tmp0, 0x20); \
+ rgb_h = __lasx_xvpermi_q(tmp1, tmp0, 0x31); \
+}
+
+#define RGB_STORE_RES(rgb_l, rgb_h, image_1, image_2) \
+{ \
+ __lasx_xvstelm_d(rgb_l, image_1, 0, 0); \
+ __lasx_xvstelm_d(rgb_l, image_1, 8, 1); \
+ __lasx_xvstelm_d(rgb_h, image_1, 16, 0); \
+ __lasx_xvstelm_d(rgb_l, image_2, 0, 2); \
+ __lasx_xvstelm_d(rgb_l, image_2, 8, 3); \
+ __lasx_xvstelm_d(rgb_h, image_2, 16, 2); \
+}
+
+#define RGB_STORE(rgb_l, rgb_h, image) \
+{ \
+ __lasx_xvstelm_d(rgb_l, image, 0, 0); \
+ __lasx_xvstelm_d(rgb_l, image, 8, 1); \
+ __lasx_xvstelm_d(rgb_h, image, 16, 0); \
+ __lasx_xvstelm_d(rgb_l, image, 24, 2); \
+ __lasx_xvstelm_d(rgb_l, image, 32, 3); \
+ __lasx_xvstelm_d(rgb_h, image, 40, 2); \
+}
+
+#define RGB32_STORE(rgb_l, rgb_h, image) \
+{ \
+ __lasx_xvst(rgb_l, image, 0); \
+ __lasx_xvst(rgb_h, image, 32); \
+}
+
+#define RGB32_STORE_RES(rgb_l, rgb_h, image_1, image_2) \
+{ \
+ __lasx_xvst(rgb_l, image_1, 0); \
+ __lasx_xvst(rgb_h, image_2, 0); \
+}
+
+#define YUV2RGBFUNC(func_name, dst_type, alpha) \
+ int func_name(SwsContext *c, const uint8_t *src[], \
+ int srcStride[], int srcSliceY, int srcSliceH, \
+ uint8_t *dst[], int dstStride[]) \
+{ \
+ int x, y, h_size, vshift, res; \
+ __m256i m_y1, m_y2, m_u, m_v; \
+ __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h; \
+ __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2; \
+ __m256i shuf2 = {0x0504120302100100, 0x0A18090816070614, \
+ 0x0504120302100100, 0x0A18090816070614}; \
+ __m256i shuf3 = {0x1E0F0E1C0D0C1A0B, 0x0101010101010101, \
+ 0x1E0F0E1C0D0C1A0B, 0x0101010101010101}; \
+ YUV2RGB_LOAD_COE \
+ y = (c->dstW + 7) & ~7; \
+ h_size = y >> 4; \
+ res = y & 15; \
+ \
+ vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
+ for (y = 0; y < srcSliceH; y += 2) { \
+ dst_type *image1 = (dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]);\
+ dst_type *image2 = (dst_type *)(image1 + dstStride[0]);\
+ const uint8_t *py_1 = src[0] + y * srcStride[0]; \
+ const uint8_t *py_2 = py_1 + srcStride[0]; \
+ const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
+ for (x = 0; x < h_size; x++) { \
+
+#define YUV2RGBFUNC32(func_name, dst_type, alpha) \
+ int func_name(SwsContext *c, const uint8_t *src[], \
+ int srcStride[], int srcSliceY, int srcSliceH, \
+ uint8_t *dst[], int dstStride[]) \
+{ \
+ int x, y, h_size, vshift, res; \
+ __m256i m_y1, m_y2, m_u, m_v; \
+ __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h; \
+ __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2; \
+ __m256i a = __lasx_xvldi(0xFF); \
+ \
+ YUV2RGB_LOAD_COE \
+ y = (c->dstW + 7) & ~7; \
+ h_size = y >> 4; \
+ res = y & 15; \
+ \
+ vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
+ for (y = 0; y < srcSliceH; y += 2) { \
+ int yd = y + srcSliceY; \
+ dst_type av_unused *r, *g, *b; \
+ dst_type *image1 = (dst_type *)(dst[0] + (yd) * dstStride[0]); \
+ dst_type *image2 = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]); \
+ const uint8_t *py_1 = src[0] + y * srcStride[0]; \
+ const uint8_t *py_2 = py_1 + srcStride[0]; \
+ const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
+ for (x = 0; x < h_size; x++) { \
+
+#define DEALYUV2RGBREMAIN \
+ py_1 += 16; \
+ py_2 += 16; \
+ pu += 8; \
+ pv += 8; \
+ image1 += 48; \
+ image2 += 48; \
+ } \
+ if (res) { \
+
+#define DEALYUV2RGBREMAIN32 \
+ py_1 += 16; \
+ py_2 += 16; \
+ pu += 8; \
+ pv += 8; \
+ image1 += 16; \
+ image2 += 16; \
+ } \
+ if (res) { \
+
+
+#define END_FUNC() \
+ } \
+ } \
+ return srcSliceH; \
+}
+
+YUV2RGBFUNC(yuv420_rgb24_lasx, uint8_t, 0)
+ LOAD_YUV_16
+ YUV2RGB
+ RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+ RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
+ RGB_STORE(rgb1_l, rgb1_h, image1);
+ RGB_STORE(rgb2_l, rgb2_h, image2);
+ DEALYUV2RGBREMAIN
+ YUV2RGB_RES
+ RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+ RGB_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+ END_FUNC()
+
+YUV2RGBFUNC(yuv420_bgr24_lasx, uint8_t, 0)
+ LOAD_YUV_16
+ YUV2RGB
+ RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+ RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
+ RGB_STORE(rgb1_l, rgb1_h, image1);
+ RGB_STORE(rgb2_l, rgb2_h, image2);
+ DEALYUV2RGBREMAIN
+ YUV2RGB_RES
+ RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+ RGB_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+ END_FUNC()
+
+YUV2RGBFUNC32(yuv420_rgba32_lasx, uint32_t, 0)
+ LOAD_YUV_16
+ YUV2RGB
+ RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+ RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1);
+ RGB32_STORE(rgb2_l, rgb2_h, image2);
+ DEALYUV2RGBREMAIN32
+ YUV2RGB_RES
+ RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+ RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+ END_FUNC()
+
+YUV2RGBFUNC32(yuv420_bgra32_lasx, uint32_t, 0)
+ LOAD_YUV_16
+ YUV2RGB
+ RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+ RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1);
+ RGB32_STORE(rgb2_l, rgb2_h, image2);
+ DEALYUV2RGBREMAIN32
+ YUV2RGB_RES
+ RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+ RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+ END_FUNC()
+
+YUV2RGBFUNC32(yuv420_argb32_lasx, uint32_t, 0)
+ LOAD_YUV_16
+ YUV2RGB
+ RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+ RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1);
+ RGB32_STORE(rgb2_l, rgb2_h, image2);
+ DEALYUV2RGBREMAIN32
+ YUV2RGB_RES
+ RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+ RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+ END_FUNC()
+
+YUV2RGBFUNC32(yuv420_abgr32_lasx, uint32_t, 0)
+ LOAD_YUV_16
+ YUV2RGB
+ RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+ RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
+ RGB32_STORE(rgb1_l, rgb1_h, image1);
+ RGB32_STORE(rgb2_l, rgb2_h, image2);
+ DEALYUV2RGBREMAIN32
+ YUV2RGB_RES
+ RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+ RGB32_STORE_RES(rgb1_l, rgb1_h, image1, image2);
+ END_FUNC()
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 4f1ac9c465..3af775b389 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -141,6 +141,8 @@ av_cold void ff_sws_rgb2rgb_init(void)
rgb2rgb_init_aarch64();
#elif ARCH_X86
rgb2rgb_init_x86();
+#elif ARCH_LOONGARCH64
+ rgb2rgb_init_loongarch();
#endif
}
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 7272e98c57..db85bfc42f 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -168,5 +168,6 @@ void ff_sws_rgb2rgb_init(void);
void rgb2rgb_init_aarch64(void);
void rgb2rgb_init_x86(void);
+void rgb2rgb_init_loongarch(void);
#endif /* SWSCALE_RGB2RGB_H */
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 6ee483d12a..9c3f5e23c6 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -683,6 +683,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
t = ff_yuv2rgb_init_ppc(c);
#elif ARCH_X86
t = ff_yuv2rgb_init_x86(c);
+#elif ARCH_LOONGARCH64
+ t = ff_yuv2rgb_init_loongarch(c);
#endif
if (t)
--
2.20.1
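[Not part of the patch: a standalone scalar sketch of the integer YUV-to-RGB arithmetic that the table_rV/table_gU/table_gV/table_bU lookups used by the yuv420_*_lasx functions above bake into per-component tables. The Q8 limited-range BT.601 coefficients (298/409/100/208/516) are an assumption for illustration; swscale's real tables depend on the negotiated colorspace and range.]

```c
#include <stdint.h>

static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* One limited-range BT.601 pixel in Q8 fixed point: a scalar model of what
 * the LASX yuv2rgb paths compute 16 pixels at a time via table lookups. */
static void yuv2rgb_pixel(int y, int u, int v, uint8_t rgb[3])
{
    int c = y - 16, d = u - 128, e = v - 128;
    rgb[0] = clip8((298 * c + 409 * e + 128) >> 8);           /* R */
    rgb[1] = clip8((298 * c - 100 * d - 208 * e + 128) >> 8); /* G */
    rgb[2] = clip8((298 * c + 516 * d + 128) >> 8);           /* B */
}
```

Nominal white (Y=235, U=V=128) maps to 255,255,255 and nominal black (Y=16) to 0,0,0, which is a quick sanity check for any such table setup.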
* [FFmpeg-devel] [PATCH v5 3/3] swscale/la: Add output_lasx.c file.
@ 2022-09-09 9:00 ` Hao Chen
2022-09-09 9:43 ` [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib yinshiyou-hf
3 siblings, 0 replies; 6+ messages in thread
From: Hao Chen @ 2022-09-09 9:00 UTC (permalink / raw)
To: ffmpeg-devel
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt rgb24 -y /dev/null -an
before: 150fps
after: 183fps
Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
libswscale/loongarch/Makefile | 3 +-
libswscale/loongarch/output_lasx.c | 1982 +++++++++++++++++
libswscale/loongarch/swscale_init_loongarch.c | 3 +
libswscale/loongarch/swscale_loongarch.h | 6 +
4 files changed, 1993 insertions(+), 1 deletion(-)
create mode 100644 libswscale/loongarch/output_lasx.c
diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index 4345971514..8e665e826c 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -2,4 +2,5 @@ OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_init_loongarch.o
LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
loongarch/input_lasx.o \
loongarch/yuv2rgb_lasx.o \
- loongarch/rgb2rgb_lasx.o
+ loongarch/rgb2rgb_lasx.o \
+ loongarch/output_lasx.o
diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
new file mode 100644
index 0000000000..36a4c4503b
--- /dev/null
+++ b/libswscale/loongarch/output_lasx.c
@@ -0,0 +1,1982 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ int i;
+ int len = dstW - 15;
+ __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400,
+ 0x1C0C180814041000, 0x1C1814100C080400};
+ __m256i val1, val2, val3;
+ uint8_t dither0 = dither[offset & 7];
+ uint8_t dither1 = dither[(offset + 1) & 7];
+ uint8_t dither2 = dither[(offset + 2) & 7];
+ uint8_t dither3 = dither[(offset + 3) & 7];
+ uint8_t dither4 = dither[(offset + 4) & 7];
+ uint8_t dither5 = dither[(offset + 5) & 7];
+ uint8_t dither6 = dither[(offset + 6) & 7];
+ uint8_t dither7 = dither[(offset + 7) & 7];
+ int val_1[8] = {dither0, dither2, dither4, dither6,
+ dither0, dither2, dither4, dither6};
+ int val_2[8] = {dither1, dither3, dither5, dither7,
+ dither1, dither3, dither5, dither7};
+ int val_3[8] = {dither0, dither1, dither2, dither3,
+ dither4, dither5, dither6, dither7};
+
+ DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
+ val3 = __lasx_xvld(val_3, 0);
+
+ for (i = 0; i < len; i += 16) {
+ int j;
+ __m256i src0, filter0, val;
+ __m256i val_ev, val_od;
+
+ val_ev = __lasx_xvslli_w(val1, 12);
+ val_od = __lasx_xvslli_w(val2, 12);
+
+ for (j = 0; j < filterSize; j++) {
+ src0 = __lasx_xvld(src[j] + i, 0);
+ filter0 = __lasx_xvldrepl_h((filter + j), 0);
+ val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0);
+ val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0);
+ }
+ val_ev = __lasx_xvsrai_w(val_ev, 19);
+ val_od = __lasx_xvsrai_w(val_od, 19);
+ val_ev = __lasx_xvclip255_w(val_ev);
+ val_od = __lasx_xvclip255_w(val_od);
+ val = __lasx_xvshuf_b(val_od, val_ev, mask);
+ __lasx_xvstelm_d(val, (dest + i), 0, 0);
+ __lasx_xvstelm_d(val, (dest + i), 8, 2);
+ }
+ if (dstW - i >= 8) {
+ int j;
+ __m256i src0, filter0, val_h;
+ __m256i val_l;
+
+ val_l = __lasx_xvslli_w(val3, 12);
+
+ for (j = 0; j < filterSize; j++) {
+ src0 = __lasx_xvld(src[j] + i, 0);
+ src0 = __lasx_vext2xv_w_h(src0);
+ filter0 = __lasx_xvldrepl_h((filter + j), 0);
+ filter0 = __lasx_vext2xv_w_h(filter0);
+ val_l = __lasx_xvmadd_w(val_l, src0, filter0);
+ }
+ val_l = __lasx_xvsrai_w(val_l, 19);
+ val_l = __lasx_xvclip255_w(val_l);
+ val_h = __lasx_xvpermi_d(val_l, 0x4E);
+ val_l = __lasx_xvshuf_b(val_h, val_l, mask);
+ __lasx_xvstelm_d(val_l, (dest + i), 0, 1);
+ i += 8;
+ }
+ for (; i < dstW; i++) {
+ int val = dither[(i + offset) & 7] << 12;
+ int j;
+ for (j = 0; j < filterSize; j++)
+ val += src[j][i] * filter[j];
+
+ dest[i] = av_clip_uint8(val >> 19);
+ }
+}
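[Reviewer note, not part of the patch: the whole of ff_yuv2planeX_8_lasx is a vectorization of the scalar tail loop it ends with. A self-contained reference version of that loop, useful for checkasm-style comparison against the LASX output (the Q12 filter/dither scaling and the >>19 are taken from the code above):]

```c
#include <stdint.h>

static uint8_t clip_uint8_ref(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* Scalar reference for the vertical scaler: per output pixel, accumulate
 * filterSize Q12 taps over 16-bit intermediate planes, add the Q12 dither
 * value, and shift the Q19 sum back down to 8 bits. */
static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        dest[i] = clip_uint8_ref(val >> 19);
    }
}
```

With a single unity tap (4096 in Q12) and zero dither, an input of 255<<7 comes out as exactly 255, which pins down the fixed-point scaling.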
+
+/* Copied from libswscale/output.c */
+static av_always_inline void
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
+ unsigned A1, unsigned A2,
+ const void *_r, const void *_g, const void *_b, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
+ target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
+ uint32_t *dest = (uint32_t *) _dest;
+ const uint32_t *r = (const uint32_t *) _r;
+ const uint32_t *g = (const uint32_t *) _g;
+ const uint32_t *b = (const uint32_t *) _b;
+
+#if CONFIG_SMALL
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#else
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
+ int sh = (target == AV_PIX_FMT_RGB32_1 ||
+ target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
+ av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
+#endif
+ dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+ dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#endif
+ } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
+ uint8_t *dest = (uint8_t *) _dest;
+ const uint8_t *r = (const uint8_t *) _r;
+ const uint8_t *g = (const uint8_t *) _g;
+ const uint8_t *b = (const uint8_t *) _b;
+
+#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
+#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
+
+ dest[i * 6 + 0] = r_b[Y1];
+ dest[i * 6 + 1] = g[Y1];
+ dest[i * 6 + 2] = b_r[Y1];
+ dest[i * 6 + 3] = r_b[Y2];
+ dest[i * 6 + 4] = g[Y2];
+ dest[i * 6 + 5] = b_r[Y2];
+#undef r_b
+#undef b_r
+ } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
+ target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
+ target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
+ uint16_t *dest = (uint16_t *) _dest;
+ const uint16_t *r = (const uint16_t *) _r;
+ const uint16_t *g = (const uint16_t *) _g;
+ const uint16_t *b = (const uint16_t *) _b;
+ int dr1, dg1, db1, dr2, dg2, db2;
+
+ if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
+ dr1 = ff_dither_2x2_8[ y & 1 ][0];
+ dg1 = ff_dither_2x2_4[ y & 1 ][0];
+ db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+ dr2 = ff_dither_2x2_8[ y & 1 ][1];
+ dg2 = ff_dither_2x2_4[ y & 1 ][1];
+ db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+ } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
+ dr1 = ff_dither_2x2_8[ y & 1 ][0];
+ dg1 = ff_dither_2x2_8[ y & 1 ][1];
+ db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+ dr2 = ff_dither_2x2_8[ y & 1 ][1];
+ dg2 = ff_dither_2x2_8[ y & 1 ][0];
+ db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+ } else {
+ dr1 = ff_dither_4x4_16[ y & 3 ][0];
+ dg1 = ff_dither_4x4_16[ y & 3 ][1];
+ db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
+ dr2 = ff_dither_4x4_16[ y & 3 ][1];
+ dg2 = ff_dither_4x4_16[ y & 3 ][0];
+ db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
+ }
+
+ dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+ dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+ } else /* 8/4 bits */ {
+ uint8_t *dest = (uint8_t *) _dest;
+ const uint8_t *r = (const uint8_t *) _r;
+ const uint8_t *g = (const uint8_t *) _g;
+ const uint8_t *b = (const uint8_t *) _b;
+ int dr1, dg1, db1, dr2, dg2, db2;
+
+ if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
+ const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
+ const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
+ dr1 = dg1 = d32[(i * 2 + 0) & 7];
+ db1 = d64[(i * 2 + 0) & 7];
+ dr2 = dg2 = d32[(i * 2 + 1) & 7];
+ db2 = d64[(i * 2 + 1) & 7];
+ } else {
+ const uint8_t * const d64 = ff_dither_8x8_73 [y & 7];
+ const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
+ dr1 = db1 = d128[(i * 2 + 0) & 7];
+ dg1 = d64[(i * 2 + 0) & 7];
+ dr2 = db2 = d128[(i * 2 + 1) & 7];
+ dg2 = d64[(i * 2 + 1) & 7];
+ }
+
+ if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
+ dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
+ ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
+ } else {
+ dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+ dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+ }
+ }
+}
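[Not part of the patch: the 16-bit branches of yuv2rgb_write fold channel packing and dithering into the table lookups. For readers unfamiliar with the layout, a plain RGB565 packer (hypothetical helper, no dithering) shows what a single table entry ultimately encodes:]

```c
#include <stdint.h>

/* Pack 8-bit R,G,B into RGB565: R in bits 15..11, G in 10..5, B in 4..0.
 * swscale's table_rV/table_gU+gV/table_bU entries are pre-packed so that a
 * simple add of the three lookups yields this value (plus dither offsets). */
static uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
```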
+
+#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
+{ \
+ Y1 = __lasx_xvpickve2gr_w(vec_y1, t1); \
+ Y2 = __lasx_xvpickve2gr_w(vec_y2, t2); \
+ U = __lasx_xvpickve2gr_w(vec_u, t3); \
+ V = __lasx_xvpickve2gr_w(vec_v, t4); \
+ r = c->table_rV[V]; \
+ g = (c->table_gU[U] + c->table_gV[V]); \
+ b = c->table_bU[U]; \
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0, \
+ r, g, b, y, target, 0); \
+ count++; \
+}
+
+static void
+yuv2rgb_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest, int dstW,
+ int y, enum AVPixelFormat target, int hasAlpha)
+{
+ int i, j;
+ int count = 0;
+ int t = 1 << 18;
+ int len = dstW >> 6;
+ int res = dstW & 63;
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m256i headroom = __lasx_xvreplgr2vr_w(head);
+
+ for (i = 0; i < len; i++) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
+ __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
+ __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;
+
+ yl1_ev = __lasx_xvldrepl_w(&t, 0);
+ yl1_od = yl1_ev;
+ yh1_ev = yl1_ev;
+ yh1_od = yl1_ev;
+ u1_ev = yl1_ev;
+ v1_ev = yl1_ev;
+ u1_od = yl1_ev;
+ v1_od = yl1_ev;
+ yl2_ev = yl1_ev;
+ yl2_od = yl1_ev;
+ yh2_ev = yl1_ev;
+ yh2_od = yl1_ev;
+ u2_ev = yl1_ev;
+ v2_ev = yl1_ev;
+ u2_od = yl1_ev;
+ v2_od = yl1_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ const int16_t *src_lum = lumSrc[j] + count_lum;
+ temp = __lasx_xvldrepl_h((lumFilter + j), 0);
+ DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
+ src_lum, 96, l_src1, l_src2, l_src3, l_src4);
+
+ yl1_ev = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1);
+ yl1_od = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1);
+ yh1_ev = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
+ yh1_od = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
+ yl2_ev = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
+ yl2_od = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
+ yh2_ev = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
+ yh2_od = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
+ u_src1, u_src2);
+ DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
+ v_src1, v_src2);
+ temp = __lasx_xvldrepl_h((chrFilter + j), 0);
+ u1_ev = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
+ u1_od = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
+ v1_ev = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
+ v1_od = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
+ u2_ev = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
+ u2_od = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
+ v2_ev = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
+ v2_od = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);
+ }
+ yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
+ yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
+ yl1_od = __lasx_xvsrai_w(yl1_od, 19);
+ yh1_od = __lasx_xvsrai_w(yh1_od, 19);
+ u1_ev = __lasx_xvsrai_w(u1_ev, 19);
+ v1_ev = __lasx_xvsrai_w(v1_ev, 19);
+ u1_od = __lasx_xvsrai_w(u1_od, 19);
+ v1_od = __lasx_xvsrai_w(v1_od, 19);
+ yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
+ yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
+ yl2_od = __lasx_xvsrai_w(yl2_od, 19);
+ yh2_od = __lasx_xvsrai_w(yh2_od, 19);
+ u2_ev = __lasx_xvsrai_w(u2_ev, 19);
+ v2_ev = __lasx_xvsrai_w(v2_ev, 19);
+ u2_od = __lasx_xvsrai_w(u2_od, 19);
+ v2_od = __lasx_xvsrai_w(v2_od, 19);
+ u1_ev = __lasx_xvadd_w(u1_ev, headroom);
+ v1_ev = __lasx_xvadd_w(v1_ev, headroom);
+ u1_od = __lasx_xvadd_w(u1_od, headroom);
+ v1_od = __lasx_xvadd_w(v1_od, headroom);
+ u2_ev = __lasx_xvadd_w(u2_ev, headroom);
+ v2_ev = __lasx_xvadd_w(v2_ev, headroom);
+ u2_od = __lasx_xvadd_w(u2_od, headroom);
+ v2_od = __lasx_xvadd_w(v2_od, headroom);
+ WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0);
+ WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0);
+ WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1);
+ WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1);
+ WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2);
+ WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2);
+ WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3);
+ WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3);
+ WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4);
+ WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4);
+ WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5);
+ WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5);
+ WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6);
+ WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6);
+ WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7);
+ WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7);
+ WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0);
+ WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0);
+ WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1);
+ WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1);
+ WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2);
+ WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2);
+ WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3);
+ WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3);
+ WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4);
+ WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4);
+ WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5);
+ WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5);
+ WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6);
+ WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6);
+ WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7);
+ WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7);
+ }
+ if (res >= 32) {
+ int Y1, Y2, U, V, count_lum = count << 1;
+ __m256i l_src1, l_src2, u_src, v_src;
+ __m256i yl_ev, yl_od, yh_ev, yh_od;
+ __m256i u_ev, u_od, v_ev, v_od, temp;
+
+ yl_ev = __lasx_xvldrepl_w(&t, 0);
+ yl_od = yl_ev;
+ yh_ev = yl_ev;
+ yh_od = yl_ev;
+ u_ev = yl_ev;
+ v_ev = yl_ev;
+ u_od = yl_ev;
+ v_od = yl_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lasx_xvldrepl_h((lumFilter + j), 0);
+ DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
+ 32, l_src1, l_src2);
+ yl_ev = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
+ yl_od = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
+ yh_ev = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
+ yh_od = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+ u_src, v_src);
+ temp = __lasx_xvldrepl_h((chrFilter + j), 0);
+ u_ev = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
+ u_od = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
+ v_ev = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
+ v_od = __lasx_xvmaddwod_w_h(v_od, temp, v_src);
+ }
+ yl_ev = __lasx_xvsrai_w(yl_ev, 19);
+ yh_ev = __lasx_xvsrai_w(yh_ev, 19);
+ yl_od = __lasx_xvsrai_w(yl_od, 19);
+ yh_od = __lasx_xvsrai_w(yh_od, 19);
+ u_ev = __lasx_xvsrai_w(u_ev, 19);
+ v_ev = __lasx_xvsrai_w(v_ev, 19);
+ u_od = __lasx_xvsrai_w(u_od, 19);
+ v_od = __lasx_xvsrai_w(v_od, 19);
+ u_ev = __lasx_xvadd_w(u_ev, headroom);
+ v_ev = __lasx_xvadd_w(v_ev, headroom);
+ u_od = __lasx_xvadd_w(u_od, headroom);
+ v_od = __lasx_xvadd_w(v_od, headroom);
+ WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+ WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
+ WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
+ WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
+ WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2);
+ WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2);
+ WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3);
+ WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3);
+ WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4);
+ WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4);
+ WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5);
+ WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5);
+ WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6);
+ WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6);
+ WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7);
+ WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7);
+ res -= 32;
+ }
+ if (res >= 16) {
+ int Y1, Y2, U, V;
+ int count_lum = count << 1;
+ __m256i l_src, u_src, v_src;
+ __m256i y_ev, y_od, u, v, temp;
+
+ y_ev = __lasx_xvldrepl_w(&t, 0);
+ y_od = y_ev;
+ u = y_ev;
+ v = y_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lasx_xvldrepl_h((lumFilter + j), 0);
+ l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
+ y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
+ y_od = __lasx_xvmaddwod_w_h(y_od, temp, l_src);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
+ 0, u_src, v_src);
+ temp = __lasx_xvldrepl_h((chrFilter + j), 0);
+ u_src = __lasx_vext2xv_w_h(u_src);
+ v_src = __lasx_vext2xv_w_h(v_src);
+ u = __lasx_xvmaddwev_w_h(u, temp, u_src);
+ v = __lasx_xvmaddwev_w_h(v, temp, v_src);
+ }
+ y_ev = __lasx_xvsrai_w(y_ev, 19);
+ y_od = __lasx_xvsrai_w(y_od, 19);
+ u = __lasx_xvsrai_w(u, 19);
+ v = __lasx_xvsrai_w(v, 19);
+ u = __lasx_xvadd_w(u, headroom);
+ v = __lasx_xvadd_w(v, headroom);
+ WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0);
+ WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1);
+ WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2);
+ WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3);
+ WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4);
+ WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5);
+ WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6);
+ WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7);
+ res -= 16;
+ }
+ if (res >= 8) {
+ int Y1, Y2, U, V;
+ int count_lum = count << 1;
+ __m256i l_src, u_src, v_src;
+ __m256i y_ev, uv, temp;
+
+ y_ev = __lasx_xvldrepl_w(&t, 0);
+ uv = y_ev;
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lasx_xvldrepl_h((lumFilter + j), 0);
+ l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
+ l_src = __lasx_vext2xv_w_h(l_src);
+ y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
+ v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
+ temp = __lasx_xvldrepl_h((chrFilter + j), 0);
+ u_src = __lasx_xvilvl_d(v_src, u_src);
+ u_src = __lasx_vext2xv_w_h(u_src);
+ uv = __lasx_xvmaddwev_w_h(uv, temp, u_src);
+ }
+ y_ev = __lasx_xvsrai_w(y_ev, 19);
+ uv = __lasx_xvsrai_w(uv, 19);
+ uv = __lasx_xvadd_w(uv, headroom);
+ WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4);
+ WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5);
+ WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6);
+ WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7);
+ }
+ for (; count < len_count; count++) {
+ int Y1 = 1 << 18;
+ int Y2 = Y1;
+ int U = Y1;
+ int V = Y1;
+
+ for (j = 0; j < lumFilterSize; j++) {
+ Y1 += lumSrc[j][count * 2] * lumFilter[j];
+ Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][count] * chrFilter[j];
+ V += chrVSrc[j][count] * chrFilter[j];
+ }
+ Y1 >>= 19;
+ Y2 >>= 19;
+ U >>= 19;
+ V >>= 19;
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+}
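[Not part of the patch: the template above leans on the xvmaddwev/xvmaddwod pair, which widen-multiply-accumulate the even and odd 16-bit lanes into two separate 32-bit accumulators, so adjacent luma samples Y1/Y2 end up deinterleaved for free. A scalar model of that lane split, on a 4-lane slice for brevity:]

```c
#include <stdint.h>

/* Models __lasx_xvmaddwev_w_h / __lasx_xvmaddwod_w_h on one 4-lane slice:
 * products of even 16-bit lanes accumulate into acc_ev, odd lanes into
 * acc_od, each widened to 32 bits before the add. */
static void maddw_evod(int32_t acc_ev[2], int32_t acc_od[2],
                       const int16_t a[4], const int16_t b[4])
{
    for (int k = 0; k < 2; k++) {
        acc_ev[k] += (int32_t)a[2 * k]     * b[2 * k];
        acc_od[k] += (int32_t)a[2 * k + 1] * b[2 * k + 1];
    }
}
```

This is why the template keeps parallel *_ev/*_od accumulators and why WRITE_YUV2RGB picks Y1 from the even vector and Y2 from the odd one at the same element index.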
+
+static void
+yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int i, count = 0;
+ int len = dstW - 15;
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
+ __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
+ __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
+ __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
+ __m256i headroom = __lasx_xvreplgr2vr_w(head);
+
+ for (i = 0; i < len; i += 16) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m256i y0_h, y0_l, y0, u0, v0;
+ __m256i y1_h, y1_l, y1, u1, v1;
+ __m256i y_l, y_h, u, v;
+
+ DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+ buf1, i_dex, y0, u0, v0, y1);
+ DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
+ DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
+ DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
+ DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
+ y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
+ y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
+ u0 = __lasx_xvmul_w(u0, v_uvalpha1);
+ v0 = __lasx_xvmul_w(v0, v_uvalpha1);
+ y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
+ y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
+ u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
+ v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
+ y_l = __lasx_xvsrai_w(y_l, 19);
+ y_h = __lasx_xvsrai_w(y_h, 19);
+ u = __lasx_xvsrai_w(u, 19);
+ v = __lasx_xvsrai_w(v, 19);
+ u = __lasx_xvadd_w(u, headroom);
+ v = __lasx_xvadd_w(v, headroom);
+ WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
+ WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
+ WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
+ WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
+ WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
+ WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
+ WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
+ }
+ if (dstW - i >= 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m256i y0_l, y0, u0, v0;
+ __m256i y1_l, y1, u1, v1;
+ __m256i y_l, u, v;
+
+ y0 = __lasx_xvldx(buf0, i_dex);
+ u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
+ v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
+ y1 = __lasx_xvldx(buf1, i_dex);
+ u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
+ v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
+ DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
+ DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
+ y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
+ u0 = __lasx_xvmul_w(u0, v_uvalpha1);
+ v0 = __lasx_xvmul_w(v0, v_uvalpha1);
+ y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
+ u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
+ v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
+ y_l = __lasx_xvsrai_w(y_l, 19);
+ u = __lasx_xvsrai_w(u, 19);
+ v = __lasx_xvsrai_w(v, 19);
+ u = __lasx_xvadd_w(u, headroom);
+ v = __lasx_xvadd_w(v, headroom);
+ WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
+ WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
+ WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 2, 2);
+ WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 3, 3);
+ i += 8;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2] * yalpha1 +
+ buf1[count * 2] * yalpha) >> 19;
+ int Y2 = (buf0[count * 2 + 1] * yalpha1 +
+ buf1[count * 2 + 1] * yalpha) >> 19;
+ int U = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
+ int V = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+}
+
+static void
+yuv2rgb_1_template_lasx(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i;
+ int len = (dstW - 15);
+ int len_count = (dstW + 1) >> 1;
+ const void *r, *g, *b;
+
+ if (uvalpha < 2048) {
+ int count = 0;
+ int head = YUVRGB_TABLE_HEADROOM;
+ __m256i headroom = __lasx_xvreplgr2vr_h(head);
+
+ for (i = 0; i < len; i += 16) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m256i src_y, src_u, src_v;
+ __m256i u, v, y_l, y_h;
+
+ DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
+ src_v = __lasx_xvldx(vbuf0, c_dex);
+ src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
+ src_y = __lasx_xvsrari_h(src_y, 7);
+ src_u = __lasx_xvsrari_h(src_u, 7);
+ y_l = __lasx_xvsllwil_w_h(src_y, 0);
+ y_h = __lasx_xvexth_w_h(src_y);
+ u = __lasx_xvaddwev_w_h(src_u, headroom);
+ v = __lasx_xvaddwod_w_h(src_u, headroom);
+ WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
+ WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
+ WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
+ WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
+ WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
+ WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
+ WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
+ WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
+ }
+ if (dstW - i >= 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m256i src_y, src_u, src_v;
+ __m256i y_l, uv;
+
+ src_y = __lasx_xvldx(buf0, i_dex);
+ src_u = __lasx_xvldrepl_d((ubuf0 + count), 0);
+ src_v = __lasx_xvldrepl_d((vbuf0 + count), 0);
+ src_u = __lasx_xvilvl_d(src_v, src_u);
+ y_l = __lasx_xvsrari_h(src_y, 7);
+ uv = __lasx_xvsrari_h(src_u, 7);
+ y_l = __lasx_vext2xv_w_h(y_l);
+ uv = __lasx_vext2xv_w_h(uv);
+ uv = __lasx_xvaddwev_w_h(uv, headroom);
+ WRITE_YUV2RGB(y_l, y_l, uv, uv, 0, 1, 0, 4);
+ WRITE_YUV2RGB(y_l, y_l, uv, uv, 2, 3, 1, 5);
+ WRITE_YUV2RGB(y_l, y_l, uv, uv, 4, 5, 2, 6);
+ WRITE_YUV2RGB(y_l, y_l, uv, uv, 6, 7, 3, 7);
+ i += 8;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2 ] + 64) >> 7;
+ int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[count] + 64) >> 7;
+ int V = (vbuf0[count] + 64) >> 7;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ int count = 0;
+ int HEADROOM = YUVRGB_TABLE_HEADROOM;
+ __m256i headroom = __lasx_xvreplgr2vr_w(HEADROOM);
+
+ for (i = 0; i < len; i += 16) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ int c_dex = count << 1;
+ __m256i src_y, src_u0, src_v0, src_u1, src_v1;
+ __m256i y_l, y_h, u, v;
+
+ DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+ ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
+ src_v1 = __lasx_xvldx(vbuf1, c_dex);
+ src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
+ src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
+ src_y = __lasx_xvsrari_h(src_y, 7);
+ u = __lasx_xvaddwev_w_h(src_u0, src_u1);
+ v = __lasx_xvaddwod_w_h(src_u0, src_u1);
+ y_l = __lasx_xvsllwil_w_h(src_y, 0);
+ y_h = __lasx_xvexth_w_h(src_y);
+ u = __lasx_xvsrari_w(u, 8);
+ v = __lasx_xvsrari_w(v, 8);
+ u = __lasx_xvadd_w(u, headroom);
+ v = __lasx_xvadd_w(v, headroom);
+ WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
+ WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
+ WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
+ WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
+ WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
+ WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
+ WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
+ WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
+ }
+ if (dstW - i >= 8) {
+ int Y1, Y2, U, V;
+ int i_dex = i << 1;
+ __m256i src_y, src_u0, src_v0, src_u1, src_v1;
+ __m256i uv;
+
+ src_y = __lasx_xvldx(buf0, i_dex);
+ src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
+ src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
+ src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
+ src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
+
+ src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
+ src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
+ src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
+ src_y = __lasx_xvsrari_h(src_y, 7);
+ uv = __lasx_xvhaddw_w_h(src_u0, src_u0);
+ src_y = __lasx_vext2xv_w_h(src_y);
+ uv = __lasx_xvsrari_w(uv, 8);
+ uv = __lasx_xvadd_w(uv, headroom);
+ WRITE_YUV2RGB(src_y, src_y, uv, uv, 0, 1, 0, 4);
+ WRITE_YUV2RGB(src_y, src_y, uv, uv, 2, 3, 1, 5);
+ WRITE_YUV2RGB(src_y, src_y, uv, uv, 4, 5, 2, 6);
+ WRITE_YUV2RGB(src_y, src_y, uv, uv, 6, 7, 3, 7);
+ i += 8;
+ }
+ for (; count < len_count; count++) {
+ int Y1 = (buf0[count * 2 ] + 64) >> 7;
+ int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
+ int U = (ubuf0[count] + ubuf1[count] + 128) >> 8;
+ int V = (vbuf0[count] + vbuf1[count] + 128) >> 8;
+
+ r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+ g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+ c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+ b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+ yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+ r, g, b, y, target, 0);
+ }
+ }
+}
+
+#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _X_lasx(SwsContext *c, const int16_t *lumFilter, \
+ const int16_t **lumSrc, int lumFilterSize, \
+ const int16_t *chrFilter, const int16_t **chrUSrc, \
+ const int16_t **chrVSrc, int chrFilterSize, \
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \
+ int y) \
+{ \
+ name ## base ## _X_template_lasx(c, lumFilter, lumSrc, lumFilterSize, \
+ chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+ alpSrc, dest, dstW, y, fmt, hasAlpha); \
+}
+
+#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
+YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _2_lasx(SwsContext *c, const int16_t *buf[2], \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \
+ int yalpha, int uvalpha, int y) \
+{ \
+ name ## base ## _2_template_lasx(c, buf, ubuf, vbuf, abuf, dest, \
+ dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
+}
+
+#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
+YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
+static void name ## ext ## _1_lasx(SwsContext *c, const int16_t *buf0, \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf0, uint8_t *dest, int dstW, \
+ int uvalpha, int y) \
+{ \
+ name ## base ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0, dest, \
+ dstW, uvalpha, y, fmt, hasAlpha); \
+}
+
+#if CONFIG_SMALL
+#else
+YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1, 0)
+YUV2RGBWRAPPER(yuv2rgb,, x32, AV_PIX_FMT_RGB32, 0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
+YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 16, AV_PIX_FMT_RGB565, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 15, AV_PIX_FMT_RGB555, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 12, AV_PIX_FMT_RGB444, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 8, AV_PIX_FMT_RGB8, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 4, AV_PIX_FMT_RGB4, 0)
+YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0)
+
+// This function is copied from libswscale/output.c
+static av_always_inline void yuv2rgb_write_full(SwsContext *c,
+ uint8_t *dest, int i, int R, int A, int G, int B,
+ int y, enum AVPixelFormat target, int hasAlpha, int err[4])
+{
+ int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
+
+ if ((R | G | B) & 0xC0000000) {
+ R = av_clip_uintp2(R, 30);
+ G = av_clip_uintp2(G, 30);
+ B = av_clip_uintp2(B, 30);
+ }
+
+ switch(target) {
+ case AV_PIX_FMT_ARGB:
+ dest[0] = hasAlpha ? A : 255;
+ dest[1] = R >> 22;
+ dest[2] = G >> 22;
+ dest[3] = B >> 22;
+ break;
+ case AV_PIX_FMT_RGB24:
+ dest[0] = R >> 22;
+ dest[1] = G >> 22;
+ dest[2] = B >> 22;
+ break;
+ case AV_PIX_FMT_RGBA:
+ dest[0] = R >> 22;
+ dest[1] = G >> 22;
+ dest[2] = B >> 22;
+ dest[3] = hasAlpha ? A : 255;
+ break;
+ case AV_PIX_FMT_ABGR:
+ dest[0] = hasAlpha ? A : 255;
+ dest[1] = B >> 22;
+ dest[2] = G >> 22;
+ dest[3] = R >> 22;
+ break;
+ case AV_PIX_FMT_BGR24:
+ dest[0] = B >> 22;
+ dest[1] = G >> 22;
+ dest[2] = R >> 22;
+ break;
+ case AV_PIX_FMT_BGRA:
+ dest[0] = B >> 22;
+ dest[1] = G >> 22;
+ dest[2] = R >> 22;
+ dest[3] = hasAlpha ? A : 255;
+ break;
+ case AV_PIX_FMT_BGR4_BYTE:
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR8:
+ case AV_PIX_FMT_RGB8:
+ {
+ int r,g,b;
+
+ switch (c->dither) {
+ default:
+ case SWS_DITHER_AUTO:
+ case SWS_DITHER_ED:
+ R >>= 22;
+ G >>= 22;
+ B >>= 22;
+ R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
+ G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
+ B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+ r = R >> (isrgb8 ? 5 : 7);
+ g = G >> (isrgb8 ? 5 : 6);
+ b = B >> (isrgb8 ? 6 : 7);
+ r = av_clip(r, 0, isrgb8 ? 7 : 1);
+ g = av_clip(g, 0, isrgb8 ? 7 : 3);
+ b = av_clip(b, 0, isrgb8 ? 3 : 1);
+ err[0] = R - r*(isrgb8 ? 36 : 255);
+ err[1] = G - g*(isrgb8 ? 36 : 85);
+ err[2] = B - b*(isrgb8 ? 85 : 255);
+ break;
+ case SWS_DITHER_A_DITHER:
+ if (isrgb8) {
+ /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
+ r = (((R >> 19) + A_DITHER(i,y) -96)>>8);
+ g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
+ b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
+ r = av_clip_uintp2(r, 3);
+ g = av_clip_uintp2(g, 3);
+ b = av_clip_uintp2(b, 2);
+ } else {
+ r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
+ g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
+ b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
+ r = av_clip_uintp2(r, 1);
+ g = av_clip_uintp2(g, 2);
+ b = av_clip_uintp2(b, 1);
+ }
+ break;
+ case SWS_DITHER_X_DITHER:
+ if (isrgb8) {
+ /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
+ r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
+ g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
+ b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
+ r = av_clip_uintp2(r, 3);
+ g = av_clip_uintp2(g, 3);
+ b = av_clip_uintp2(b, 2);
+ } else {
+ r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
+ g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
+ b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
+ r = av_clip_uintp2(r, 1);
+ g = av_clip_uintp2(g, 2);
+ b = av_clip_uintp2(b, 1);
+ }
+                break;
+ }
+
+ if(target == AV_PIX_FMT_BGR4_BYTE) {
+ dest[0] = r + 2*g + 8*b;
+ } else if(target == AV_PIX_FMT_RGB4_BYTE) {
+ dest[0] = b + 2*g + 8*r;
+ } else if(target == AV_PIX_FMT_BGR8) {
+ dest[0] = r + 8*g + 64*b;
+ } else if(target == AV_PIX_FMT_RGB8) {
+ dest[0] = b + 4*g + 32*r;
+ } else
+ av_assert2(0);
+ break; }
+ }
+}
+
+#define YUV2RGB_SETUP \
+ int y_offset = c->yuv2rgb_y_offset; \
+ int y_coeff = c->yuv2rgb_y_coeff; \
+ int v2r_coe = c->yuv2rgb_v2r_coeff; \
+ int v2g_coe = c->yuv2rgb_v2g_coeff; \
+ int u2g_coe = c->yuv2rgb_u2g_coeff; \
+ int u2b_coe = c->yuv2rgb_u2b_coeff; \
+ __m256i offset = __lasx_xvreplgr2vr_w(y_offset); \
+ __m256i coeff = __lasx_xvreplgr2vr_w(y_coeff); \
+ __m256i v2r = __lasx_xvreplgr2vr_w(v2r_coe); \
+ __m256i v2g = __lasx_xvreplgr2vr_w(v2g_coe); \
+ __m256i u2g = __lasx_xvreplgr2vr_w(u2g_coe); \
+    __m256i u2b    = __lasx_xvreplgr2vr_w(u2b_coe);
+
+#define YUV2RGB(y, u, v, R, G, B, offset, coeff, \
+ y_temp, v2r, v2g, u2g, u2b) \
+{ \
+ y = __lasx_xvsub_w(y, offset); \
+ y = __lasx_xvmul_w(y, coeff); \
+ y = __lasx_xvadd_w(y, y_temp); \
+ R = __lasx_xvmadd_w(y, v, v2r); \
+ v = __lasx_xvmadd_w(y, v, v2g); \
+ G = __lasx_xvmadd_w(v, u, u2g); \
+ B = __lasx_xvmadd_w(y, u, u2b); \
+}
+
+#define WRITE_FULL_A(r, g, b, a, t1, s) \
+{ \
+ R = __lasx_xvpickve2gr_w(r, t1); \
+ G = __lasx_xvpickve2gr_w(g, t1); \
+ B = __lasx_xvpickve2gr_w(b, t1); \
+ A = __lasx_xvpickve2gr_w(a, t1); \
+ if (A & 0x100) \
+ A = av_clip_uint8(A); \
+ yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\
+ dest += step; \
+}
+
+#define WRITE_FULL(r, g, b, t1, s) \
+{ \
+ R = __lasx_xvpickve2gr_w(r, t1); \
+ G = __lasx_xvpickve2gr_w(g, t1); \
+ B = __lasx_xvpickve2gr_w(b, t1); \
+ yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
+ dest += step; \
+}
+
+static void
+yuv2rgb_full_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest,
+ int dstW, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ int i, j, B, G, R, A;
+ int step = (target == AV_PIX_FMT_RGB24 ||
+ target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ int err[4] = {0};
+ int a_temp = 1 << 18;
+ int templ = 1 << 9;
+ int tempc = templ - (128 << 19);
+ int ytemp = 1 << 21;
+ int len = dstW - 15;
+ __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
+ YUV2RGB_SETUP
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
+
+ for (i = 0; i < len; i += 16) {
+ __m256i l_src, u_src, v_src;
+ __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
+ __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+ int n = i << 1;
+
+ y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
+ u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lasx_xvldrepl_h((lumFilter + j), 0);
+ l_src = __lasx_xvldx(lumSrc[j], n);
+ y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
+ y_od = __lasx_xvmaddwod_w_h(y_od, l_src, temp);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ temp = __lasx_xvldrepl_h((chrFilter + j), 0);
+ DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
+ u_src, v_src);
+ DUP2_ARG3(__lasx_xvmaddwev_w_h, u_ev, u_src, temp, v_ev,
+ v_src, temp, u_ev, v_ev);
+ DUP2_ARG3(__lasx_xvmaddwod_w_h, u_od, u_src, temp, v_od,
+ v_src, temp, u_od, v_od);
+ }
+ y_ev = __lasx_xvsrai_w(y_ev, 10);
+ y_od = __lasx_xvsrai_w(y_od, 10);
+ u_ev = __lasx_xvsrai_w(u_ev, 10);
+ u_od = __lasx_xvsrai_w(u_od, 10);
+ v_ev = __lasx_xvsrai_w(v_ev, 10);
+ v_od = __lasx_xvsrai_w(v_od, 10);
+ YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUV2RGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m256i a_src, a_ev, a_od;
+
+ a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lasx_xvldrepl_h(lumFilter + j, 0);
+ a_src = __lasx_xvldx(alpSrc[j], n);
+ a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
+ a_od = __lasx_xvmaddwod_w_h(a_od, a_src, temp);
+ }
+ a_ev = __lasx_xvsrai_w(a_ev, 19);
+ a_od = __lasx_xvsrai_w(a_od, 19);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
+ } else {
+ WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL(R_od, G_od, B_od, 0, 1);
+ WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
+ WRITE_FULL(R_od, G_od, B_od, 1, 3);
+ WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
+ WRITE_FULL(R_od, G_od, B_od, 2, 5);
+ WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
+ WRITE_FULL(R_od, G_od, B_od, 3, 7);
+ WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
+ WRITE_FULL(R_od, G_od, B_od, 4, 9);
+ WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
+ WRITE_FULL(R_od, G_od, B_od, 5, 11);
+ WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
+ WRITE_FULL(R_od, G_od, B_od, 6, 13);
+ WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
+ WRITE_FULL(R_od, G_od, B_od, 7, 15);
+ }
+ }
+ if (dstW - i >= 8) {
+ __m256i l_src, u_src, v_src;
+ __m256i y_ev, u_ev, v_ev, uv, temp;
+ __m256i R_ev, G_ev, B_ev;
+ int n = i << 1;
+
+ y_ev = __lasx_xvreplgr2vr_w(templ);
+ u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lasx_xvldrepl_h((lumFilter + j), 0);
+ l_src = __lasx_xvldx(lumSrc[j], n);
+ l_src = __lasx_xvpermi_d(l_src, 0xD8);
+ l_src = __lasx_xvilvl_h(l_src, l_src);
+ y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ temp = __lasx_xvldrepl_h((chrFilter + j), 0);
+ DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
+ u_src = __lasx_xvpermi_d(u_src, 0xD8);
+ v_src = __lasx_xvpermi_d(v_src, 0xD8);
+ uv = __lasx_xvilvl_h(v_src, u_src);
+ u_ev = __lasx_xvmaddwev_w_h(u_ev, uv, temp);
+ v_ev = __lasx_xvmaddwod_w_h(v_ev, uv, temp);
+ }
+ y_ev = __lasx_xvsrai_w(y_ev, 10);
+ u_ev = __lasx_xvsrai_w(u_ev, 10);
+ v_ev = __lasx_xvsrai_w(v_ev, 10);
+ YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m256i a_src, a_ev;
+
+ a_ev = __lasx_xvreplgr2vr_w(a_temp);
+ for (j = 0; j < lumFilterSize; j++) {
+ temp = __lasx_xvldrepl_h(lumFilter + j, 0);
+ a_src = __lasx_xvldx(alpSrc[j], n);
+ a_src = __lasx_xvpermi_d(a_src, 0xD8);
+ a_src = __lasx_xvilvl_h(a_src, a_src);
+ a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
+ }
+ a_ev = __lasx_xvsrai_w(a_ev, 19);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 1);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 2);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 3);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 4);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 5);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 6);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 7);
+ } else {
+ WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL(R_ev, G_ev, B_ev, 1, 1);
+ WRITE_FULL(R_ev, G_ev, B_ev, 2, 2);
+ WRITE_FULL(R_ev, G_ev, B_ev, 3, 3);
+ WRITE_FULL(R_ev, G_ev, B_ev, 4, 4);
+ WRITE_FULL(R_ev, G_ev, B_ev, 5, 5);
+ WRITE_FULL(R_ev, G_ev, B_ev, 6, 6);
+ WRITE_FULL(R_ev, G_ev, B_ev, 7, 7);
+ }
+ i += 8;
+ }
+ for (; i < dstW; i++) {
+ int Y = templ;
+        int U = tempc;
+        int V = tempc;
+
+ A = 0;
+ for (j = 0; j < lumFilterSize; j++) {
+ Y += lumSrc[j][i] * lumFilter[j];
+ }
+ for (j = 0; j < chrFilterSize; j++) {
+ U += chrUSrc[j][i] * chrFilter[j];
+ V += chrVSrc[j][i] * chrFilter[j];
+        }
+ Y >>= 10;
+ U >>= 10;
+ V >>= 10;
+ if (hasAlpha) {
+ A = 1 << 18;
+ for (j = 0; j < lumFilterSize; j++) {
+ A += alpSrc[j][i] * lumFilter[j];
+ }
+ A >>= 19;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_2_template_lasx(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+ const int16_t *buf0 = buf[0], *buf1 = buf[1],
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+ *abuf0 = hasAlpha ? abuf[0] : NULL,
+ *abuf1 = hasAlpha ? abuf[1] : NULL;
+ int yalpha1 = 4096 - yalpha;
+ int uvalpha1 = 4096 - uvalpha;
+ int uvtemp = 128 << 19;
+ int atemp = 1 << 18;
+ int err[4] = {0};
+ int ytemp = 1 << 21;
+ int len = dstW - 15;
+ int i, R, G, B, A;
+ int step = (target == AV_PIX_FMT_RGB24 ||
+ target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
+ __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
+ __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
+ __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
+ __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
+ __m256i a_bias = __lasx_xvreplgr2vr_w(atemp);
+ __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
+ YUV2RGB_SETUP
+
+ av_assert2(yalpha <= 4096U);
+ av_assert2(uvalpha <= 4096U);
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
+
+ for (i = 0; i < len; i += 16) {
+ __m256i b0, b1, ub0, ub1, vb0, vb1;
+ __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
+ __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
+ __m256i y_l, y_h, v_l, v_h, u_l, u_h;
+ __m256i R_l, R_h, G_l, G_h, B_l, B_h;
+ int n = i << 1;
+
+ DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
+ n, ubuf1, n, b0, b1, ub0, ub1);
+ DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
+ DUP2_ARG2(__lasx_xvsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
+ DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
+ u0_l, u1_l, v0_l, v1_l);
+ DUP2_ARG1(__lasx_xvexth_w_h, b0, b1, y0_h, y1_h);
+ DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
+ u0_h, u1_h, v0_h, v1_h);
+ y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
+ y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
+ u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
+ u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
+ v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
+ v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
+ y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
+ y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
+ u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
+ u_h = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
+ v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
+ v_h = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
+ u_l = __lasx_xvsub_w(u_l, uv);
+ u_h = __lasx_xvsub_w(u_h, uv);
+ v_l = __lasx_xvsub_w(v_l, uv);
+ v_h = __lasx_xvsub_w(v_h, uv);
+ y_l = __lasx_xvsrai_w(y_l, 10);
+ y_h = __lasx_xvsrai_w(y_h, 10);
+ u_l = __lasx_xvsrai_w(u_l, 10);
+ u_h = __lasx_xvsrai_w(u_h, 10);
+ v_l = __lasx_xvsrai_w(v_l, 10);
+ v_h = __lasx_xvsrai_w(v_h, 10);
+ YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUV2RGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m256i a0, a1, a0_l, a0_h;
+ __m256i a_l, a_h, a1_l, a1_h;
+
+ DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
+ DUP2_ARG2(__lasx_xvsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
+ DUP2_ARG1(__lasx_xvexth_w_h, a0, a1, a0_h, a1_h);
+ a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
+ a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
+ a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
+ a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
+ a_l = __lasx_xvsrai_w(a_l, 19);
+ a_h = __lasx_xvsrai_w(a_h, 19);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
+ } else {
+ WRITE_FULL(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL(R_h, G_h, B_h, 0, 4);
+ WRITE_FULL(R_h, G_h, B_h, 1, 5);
+ WRITE_FULL(R_h, G_h, B_h, 2, 6);
+ WRITE_FULL(R_h, G_h, B_h, 3, 7);
+ WRITE_FULL(R_l, G_l, B_l, 4, 8);
+ WRITE_FULL(R_l, G_l, B_l, 5, 9);
+ WRITE_FULL(R_l, G_l, B_l, 6, 10);
+ WRITE_FULL(R_l, G_l, B_l, 7, 11);
+ WRITE_FULL(R_h, G_h, B_h, 4, 12);
+ WRITE_FULL(R_h, G_h, B_h, 5, 13);
+ WRITE_FULL(R_h, G_h, B_h, 6, 14);
+ WRITE_FULL(R_h, G_h, B_h, 7, 15);
+ }
+ }
+ if (dstW - i >= 8) {
+ __m256i b0, b1, ub0, ub1, vb0, vb1;
+ __m256i y0_l, y1_l, u0_l;
+ __m256i v0_l, u1_l, v1_l;
+ __m256i y_l, u_l, v_l;
+ __m256i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
+ ubuf1, n, b0, b1, ub0, ub1);
+ DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
+ DUP2_ARG1(__lasx_vext2xv_w_h, b0, b1, y0_l, y1_l);
+ DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
+ u0_l, u1_l, v0_l, v1_l);
+ y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
+ u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
+ v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
+ y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
+ u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
+ v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
+ u_l = __lasx_xvsub_w(u_l, uv);
+ v_l = __lasx_xvsub_w(v_l, uv);
+ y_l = __lasx_xvsrai_w(y_l, 10);
+ u_l = __lasx_xvsrai_w(u_l, 10);
+ v_l = __lasx_xvsrai_w(v_l, 10);
+ YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+ if (hasAlpha) {
+ __m256i a0, a1, a0_l;
+ __m256i a_l, a1_l;
+
+ DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
+ DUP2_ARG1(__lasx_vext2xv_w_h, a0, a1, a0_l, a1_l);
+ a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
+ a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
+ a_l = __lasx_xvsrai_w(a_l, 19);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
+ } else {
+ WRITE_FULL(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL(R_l, G_l, B_l, 4, 4);
+ WRITE_FULL(R_l, G_l, B_l, 5, 5);
+ WRITE_FULL(R_l, G_l, B_l, 6, 6);
+ WRITE_FULL(R_l, G_l, B_l, 7, 7);
+ }
+ i += 8;
+ }
+    for (; i < dstW; i++) {
+        int Y = (buf0[i] * yalpha1 + buf1[i] * yalpha) >> 10;
+        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
+        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;
+
+ A = 0;
+        if (hasAlpha) {
+ A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_1_template_lasx(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ int i, B, G, R, A;
+ int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
+ int err[4] = {0};
+ int ytemp = 1 << 21;
+ int bias_int = 64;
+ int len = dstW - 15;
+ __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
+ YUV2RGB_SETUP
+
+ if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+ || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
+ step = 1;
+ if (uvalpha < 2048) {
+ int uvtemp = 128 << 7;
+ __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
+ __m256i bias = __lasx_xvreplgr2vr_w(bias_int);
+
+ for (i = 0; i < len; i += 16) {
+ __m256i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
+ __m256i y_l, y_h, u_l, u_h, v_l, v_h;
+ __m256i R_l, R_h, G_l, G_h, B_l, B_h;
+ int n = i << 1;
+
+ DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
+ vb = __lasx_xvldx(vbuf0, n);
+ y_l = __lasx_xvsllwil_w_h(b, 2);
+ y_h = __lasx_xvexth_w_h(b);
+ DUP2_ARG2(__lasx_xvsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
+ DUP2_ARG1(__lasx_xvexth_w_h, ub, vb, ub_h, vb_h);
+ y_h = __lasx_xvslli_w(y_h, 2);
+ u_l = __lasx_xvsub_w(ub_l, uv);
+ u_h = __lasx_xvsub_w(ub_h, uv);
+ v_l = __lasx_xvsub_w(vb_l, uv);
+ v_h = __lasx_xvsub_w(vb_h, uv);
+ u_l = __lasx_xvslli_w(u_l, 2);
+ u_h = __lasx_xvslli_w(u_h, 2);
+ v_l = __lasx_xvslli_w(v_l, 2);
+ v_h = __lasx_xvslli_w(v_h, 2);
+ YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUV2RGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+            if (hasAlpha) {
+ __m256i a_src;
+ __m256i a_l, a_h;
+
+ a_src = __lasx_xvld(abuf0 + i, 0);
+ a_l = __lasx_xvsllwil_w_h(a_src, 0);
+ a_h = __lasx_xvexth_w_h(a_src);
+ a_l = __lasx_xvadd_w(a_l, bias);
+ a_h = __lasx_xvadd_w(a_h, bias);
+ a_l = __lasx_xvsrai_w(a_l, 7);
+ a_h = __lasx_xvsrai_w(a_h, 7);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
+ WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
+ } else {
+ WRITE_FULL(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL(R_h, G_h, B_h, 0, 4);
+ WRITE_FULL(R_h, G_h, B_h, 1, 5);
+ WRITE_FULL(R_h, G_h, B_h, 2, 6);
+ WRITE_FULL(R_h, G_h, B_h, 3, 7);
+ WRITE_FULL(R_l, G_l, B_l, 4, 8);
+ WRITE_FULL(R_l, G_l, B_l, 5, 9);
+ WRITE_FULL(R_l, G_l, B_l, 6, 10);
+ WRITE_FULL(R_l, G_l, B_l, 7, 11);
+ WRITE_FULL(R_h, G_h, B_h, 4, 12);
+ WRITE_FULL(R_h, G_h, B_h, 5, 13);
+ WRITE_FULL(R_h, G_h, B_h, 6, 14);
+ WRITE_FULL(R_h, G_h, B_h, 7, 15);
+ }
+ }
+ if (dstW - i >= 8) {
+ __m256i b, ub, vb, ub_l, vb_l;
+ __m256i y_l, u_l, v_l;
+ __m256i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
+ vb = __lasx_xvldx(vbuf0, n);
+ y_l = __lasx_vext2xv_w_h(b);
+ DUP2_ARG1(__lasx_vext2xv_w_h, ub, vb, ub_l, vb_l);
+ y_l = __lasx_xvslli_w(y_l, 2);
+ u_l = __lasx_xvsub_w(ub_l, uv);
+ v_l = __lasx_xvsub_w(vb_l, uv);
+ u_l = __lasx_xvslli_w(u_l, 2);
+ v_l = __lasx_xvslli_w(v_l, 2);
+ YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+            if (hasAlpha) {
+ __m256i a_src, a_l;
+
+ a_src = __lasx_xvldx(abuf0, n);
+ a_src = __lasx_vext2xv_w_h(a_src);
+ a_l = __lasx_xvadd_w(bias, a_src);
+ a_l = __lasx_xvsrai_w(a_l, 7);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
+ } else {
+ WRITE_FULL(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL(R_l, G_l, B_l, 4, 4);
+ WRITE_FULL(R_l, G_l, B_l, 5, 5);
+ WRITE_FULL(R_l, G_l, B_l, 6, 6);
+ WRITE_FULL(R_l, G_l, B_l, 7, 7);
+ }
+ i += 8;
+ }
+ for (; i < dstW; i++) {
+ int Y = buf0[i] << 2;
+ int U = (ubuf0[i] - uvtemp) << 2;
+ int V = (vbuf0[i] - uvtemp) << 2;
+
+ A = 0;
+            if (hasAlpha) {
+ A = (abuf0[i] + 64) >> 7;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ int uvtemp = 128 << 8;
+ __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
+ __m256i zero = __lasx_xvldi(0);
+ __m256i bias = __lasx_xvreplgr2vr_h(bias_int);
+
+ for (i = 0; i < len; i += 16) {
+ __m256i b, ub0, ub1, vb0, vb1;
+ __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
+ __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+ int n = i << 1;
+
+ DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
+ ubuf1, n, b, ub0, vb0, ub1);
+            vb1 = __lasx_xvldx(vbuf1, n);
+ y_ev = __lasx_xvaddwev_w_h(b, zero);
+ y_od = __lasx_xvaddwod_w_h(b, zero);
+ DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
+ DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
+ DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
+ DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
+ u_ev, u_od, v_ev, v_od);
+ DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
+ u_ev, u_od, v_ev, v_od);
+ YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+ YUV2RGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+            if (hasAlpha) {
+ __m256i a_src;
+ __m256i a_ev, a_od;
+
+ a_src = __lasx_xvld(abuf0 + i, 0);
+ a_ev = __lasx_xvaddwev_w_h(bias, a_src);
+ a_od = __lasx_xvaddwod_w_h(bias, a_src);
+ a_ev = __lasx_xvsrai_w(a_ev, 7);
+ a_od = __lasx_xvsrai_w(a_od, 7);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
+ WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
+ WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
+ } else {
+ WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
+ WRITE_FULL(R_od, G_od, B_od, 0, 1);
+ WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
+ WRITE_FULL(R_od, G_od, B_od, 1, 3);
+ WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
+ WRITE_FULL(R_od, G_od, B_od, 2, 5);
+ WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
+ WRITE_FULL(R_od, G_od, B_od, 3, 7);
+ WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
+ WRITE_FULL(R_od, G_od, B_od, 4, 9);
+ WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
+ WRITE_FULL(R_od, G_od, B_od, 5, 11);
+ WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
+ WRITE_FULL(R_od, G_od, B_od, 6, 13);
+ WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
+ WRITE_FULL(R_od, G_od, B_od, 7, 15);
+ }
+ }
+ if (dstW - i >= 8) {
+ __m256i b, ub0, ub1, vb0, vb1;
+ __m256i y_l, u_l, v_l;
+ __m256i R_l, G_l, B_l;
+ int n = i << 1;
+
+ DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
+ ubuf1, n, b, ub0, vb0, ub1);
+ vb1 = __lasx_xvldx(vbuf1, n);
+ y_l = __lasx_vext2xv_w_h(b);
+ y_l = __lasx_xvslli_w(y_l, 2);
+ DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
+ ub0, vb0, ub1, vb1);
+ DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
+ u_l = __lasx_xvsub_w(u_l, uv);
+ v_l = __lasx_xvsub_w(v_l, uv);
+ u_l = __lasx_xvslli_w(u_l, 1);
+ v_l = __lasx_xvslli_w(v_l, 1);
+ YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+ y_temp, v2r, v2g, u2g, u2b);
+
+            if (hasAlpha) {
+ __m256i a_src;
+ __m256i a_l;
+
+ a_src = __lasx_xvld(abuf0 + i, 0);
+ a_src = __lasx_xvpermi_d(a_src, 0xD8);
+ a_src = __lasx_xvilvl_h(a_src, a_src);
+ a_l = __lasx_xvaddwev_w_h(bias, a_src);
+ a_l = __lasx_xvsrai_w(a_l, 7);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
+ WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
+ } else {
+ WRITE_FULL(R_l, G_l, B_l, 0, 0);
+ WRITE_FULL(R_l, G_l, B_l, 1, 1);
+ WRITE_FULL(R_l, G_l, B_l, 2, 2);
+ WRITE_FULL(R_l, G_l, B_l, 3, 3);
+ WRITE_FULL(R_l, G_l, B_l, 4, 4);
+ WRITE_FULL(R_l, G_l, B_l, 5, 5);
+ WRITE_FULL(R_l, G_l, B_l, 6, 6);
+ WRITE_FULL(R_l, G_l, B_l, 7, 7);
+ }
+ i += 8;
+ }
+ for (; i < dstW; i++) {
+ int Y = buf0[i] << 2;
+ int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
+ int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
+
+ A = 0;
+ if(hasAlpha) {
+ A = (abuf0[i] + 64) >> 7;
+ if (A & 0x100)
+ A = av_clip_uint8(A);
+ }
+ Y -= y_offset;
+ Y *= y_coeff;
+ Y += ytemp;
+ R = (unsigned)Y + V * v2r_coe;
+ G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+ B = (unsigned)Y + U * u2b_coe;
+ yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+ dest += step;
+ }
+ }
+ c->dither_error[0][i] = err[0];
+ c->dither_error[1][i] = err[1];
+ c->dither_error[2][i] = err[2];
+}
+#if CONFIG_SMALL
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
+ CONFIG_SWSCALE_ALPHA && c->needAlpha)
+#else
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
+
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
+#undef yuvTorgb
+#undef yuvTorgb_setup
+
+
+av_cold void ff_sws_init_output_loongarch(SwsContext *c)
+{
+
+ if(c->flags & SWS_FULL_CHR_H_INT) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGBA:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2rgba32_full_X_lasx;
+ c->yuv2packed2 = yuv2rgba32_full_2_lasx;
+ c->yuv2packed1 = yuv2rgba32_full_1_lasx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2rgba32_full_X_lasx;
+ c->yuv2packed2 = yuv2rgba32_full_2_lasx;
+ c->yuv2packed1 = yuv2rgba32_full_1_lasx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2rgbx32_full_X_lasx;
+ c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
+ c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_ARGB:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2argb32_full_X_lasx;
+ c->yuv2packed2 = yuv2argb32_full_2_lasx;
+ c->yuv2packed1 = yuv2argb32_full_1_lasx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2argb32_full_X_lasx;
+ c->yuv2packed2 = yuv2argb32_full_2_lasx;
+ c->yuv2packed1 = yuv2argb32_full_1_lasx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2xrgb32_full_X_lasx;
+ c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
+ c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_BGRA:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2bgra32_full_X_lasx;
+ c->yuv2packed2 = yuv2bgra32_full_2_lasx;
+ c->yuv2packed1 = yuv2bgra32_full_1_lasx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2bgra32_full_X_lasx;
+ c->yuv2packed2 = yuv2bgra32_full_2_lasx;
+ c->yuv2packed1 = yuv2bgra32_full_1_lasx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2bgrx32_full_X_lasx;
+ c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
+ c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_ABGR:
+#if CONFIG_SMALL
+ c->yuv2packedX = yuv2abgr32_full_X_lasx;
+ c->yuv2packed2 = yuv2abgr32_full_2_lasx;
+ c->yuv2packed1 = yuv2abgr32_full_1_lasx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ c->yuv2packedX = yuv2abgr32_full_X_lasx;
+ c->yuv2packed2 = yuv2abgr32_full_2_lasx;
+ c->yuv2packed1 = yuv2abgr32_full_1_lasx;
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packedX = yuv2xbgr32_full_X_lasx;
+ c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
+ c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->yuv2packedX = yuv2rgb24_full_X_lasx;
+ c->yuv2packed2 = yuv2rgb24_full_2_lasx;
+ c->yuv2packed1 = yuv2rgb24_full_1_lasx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->yuv2packedX = yuv2bgr24_full_X_lasx;
+ c->yuv2packed2 = yuv2bgr24_full_2_lasx;
+ c->yuv2packed1 = yuv2bgr24_full_1_lasx;
+ break;
+ case AV_PIX_FMT_BGR4_BYTE:
+ c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
+ c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
+ c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
+ break;
+ case AV_PIX_FMT_RGB4_BYTE:
+ c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
+ c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
+ c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
+ break;
+ case AV_PIX_FMT_BGR8:
+ c->yuv2packedX = yuv2bgr8_full_X_lasx;
+ c->yuv2packed2 = yuv2bgr8_full_2_lasx;
+ c->yuv2packed1 = yuv2bgr8_full_1_lasx;
+ break;
+ case AV_PIX_FMT_RGB8:
+ c->yuv2packedX = yuv2rgb8_full_X_lasx;
+ c->yuv2packed2 = yuv2rgb8_full_2_lasx;
+ c->yuv2packed1 = yuv2rgb8_full_1_lasx;
+ break;
+ }
+ } else {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_RGB32:
+ case AV_PIX_FMT_BGR32:
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packed1 = yuv2rgbx32_1_lasx;
+ c->yuv2packed2 = yuv2rgbx32_2_lasx;
+ c->yuv2packedX = yuv2rgbx32_X_lasx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ case AV_PIX_FMT_BGR32_1:
+#if CONFIG_SMALL
+#else
+#if CONFIG_SWSCALE_ALPHA
+ if (c->needAlpha) {
+ } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+ {
+ c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
+ c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
+ c->yuv2packedX = yuv2rgbx32_1_X_lasx;
+ }
+#endif /* !CONFIG_SMALL */
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->yuv2packed1 = yuv2rgb24_1_lasx;
+ c->yuv2packed2 = yuv2rgb24_2_lasx;
+ c->yuv2packedX = yuv2rgb24_X_lasx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->yuv2packed1 = yuv2bgr24_1_lasx;
+ c->yuv2packed2 = yuv2bgr24_2_lasx;
+ c->yuv2packedX = yuv2bgr24_X_lasx;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ case AV_PIX_FMT_RGB565BE:
+ case AV_PIX_FMT_BGR565LE:
+ case AV_PIX_FMT_BGR565BE:
+ c->yuv2packed1 = yuv2rgb16_1_lasx;
+ c->yuv2packed2 = yuv2rgb16_2_lasx;
+ c->yuv2packedX = yuv2rgb16_X_lasx;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ case AV_PIX_FMT_RGB555BE:
+ case AV_PIX_FMT_BGR555LE:
+ case AV_PIX_FMT_BGR555BE:
+ c->yuv2packed1 = yuv2rgb15_1_lasx;
+ c->yuv2packed2 = yuv2rgb15_2_lasx;
+ c->yuv2packedX = yuv2rgb15_X_lasx;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ case AV_PIX_FMT_RGB444BE:
+ case AV_PIX_FMT_BGR444LE:
+ case AV_PIX_FMT_BGR444BE:
+ c->yuv2packed1 = yuv2rgb12_1_lasx;
+ c->yuv2packed2 = yuv2rgb12_2_lasx;
+ c->yuv2packedX = yuv2rgb12_X_lasx;
+ break;
+ case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ c->yuv2packed1 = yuv2rgb8_1_lasx;
+ c->yuv2packed2 = yuv2rgb8_2_lasx;
+ c->yuv2packedX = yuv2rgb8_X_lasx;
+ break;
+ case AV_PIX_FMT_RGB4:
+ case AV_PIX_FMT_BGR4:
+ c->yuv2packed1 = yuv2rgb4_1_lasx;
+ c->yuv2packed2 = yuv2rgb4_2_lasx;
+ c->yuv2packedX = yuv2rgb4_X_lasx;
+ break;
+ case AV_PIX_FMT_RGB4_BYTE:
+ case AV_PIX_FMT_BGR4_BYTE:
+ c->yuv2packed1 = yuv2rgb4b_1_lasx;
+ c->yuv2packed2 = yuv2rgb4b_2_lasx;
+ c->yuv2packedX = yuv2rgb4b_X_lasx;
+ break;
+ }
+ }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 1e0bb1b116..97fe947e2e 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -28,6 +28,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_lasx(cpu_flags)) {
+ ff_sws_init_output_loongarch(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -47,6 +48,8 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
}
break;
}
+ if (c->dstBpc == 8)
+ c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
}
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index f5afbd7633..c52eb1016b 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -69,4 +69,10 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
+av_cold void ff_sws_init_output_loongarch(SwsContext *c);
+
+void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
--
2.20.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* Re: [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib.
2022-09-09 9:00 [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib Hao Chen
` (2 preceding siblings ...)
2022-09-09 9:00 ` [FFmpeg-devel] [PATCH v5 3/3] swscale/la: Add output_lasx.c file Hao Chen
@ 2022-09-09 9:43 ` yinshiyou-hf
2022-09-10 17:12 ` Michael Niedermayer
3 siblings, 1 reply; 6+ messages in thread
From: yinshiyou-hf @ 2022-09-09 9:43 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> -----Original Message-----
> From: "Hao Chen" <chenhao@loongson.cn>
> Sent: 2022-09-09 17:00:23 (Friday)
> To: ffmpeg-devel@ffmpeg.org
> Cc:
> Subject: [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib.
>
> v2: Some modifications were made according to the comments of the reviewers.
> v3: Update and run CI test again.
> v4: Resolve the warning for the build.
> v5: Re-trigger the Patchwork test.
>
> [PATCH v5 1/3] swscale/la: Optimize hscale functions with lasx.
> [PATCH v5 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c
> [PATCH v5 3/3] swscale/la: Add output_lasx.c file.
>
LGTM.
After updating the compiler in the Patchwork runner service,
there are no new warnings.
I have restarted the LoongArch Patchwork service.
This email and its attachments contain confidential information from Loongson Technology, which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this email in error, please notify the sender by phone or email immediately and delete it.
* Re: [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib.
2022-09-09 9:43 ` [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib yinshiyou-hf
@ 2022-09-10 17:12 ` Michael Niedermayer
0 siblings, 0 replies; 6+ messages in thread
From: Michael Niedermayer @ 2022-09-10 17:12 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, Sep 09, 2022 at 05:43:38PM +0800, yinshiyou-hf@loongson.cn wrote:
> > -----Original Message-----
> > From: "Hao Chen" <chenhao@loongson.cn>
> > Sent: 2022-09-09 17:00:23 (Friday)
> > To: ffmpeg-devel@ffmpeg.org
> > Cc:
> > Subject: [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib.
> >
> > v2: Some modifications were made according to the comments of the reviewers.
> > v3: Update and run CI test again.
> > v4: Resolve the warning for the build.
> > v5: Re-trigger the Patchwork test.
> >
> > [PATCH v5 1/3] swscale/la: Optimize hscale functions with lasx.
> > [PATCH v5 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c
> > [PATCH v5 3/3] swscale/la: Add output_lasx.c file.
> >
>
> LGTM.
will apply
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
The real ebay dictionary, page 2
"100% positive feedback" - "All either got their money back or didnt complain"
"Best seller ever, very honest" - "Seller refunded buyer after failed scam"