[FFmpeg-devel] Add loongarch SIMD optimization in swscale lib.

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] Add loongarch SIMD optimization in swscale lib.
@ 2022-08-29 11:26 Hao Chen
  2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx Hao Chen
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Hao Chen @ 2022-08-29 11:26 UTC (permalink / raw)
  To: ffmpeg-devel

[PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx.
[PATCH v1 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c files.
[PATCH v1 3/3] swscale/la: Add output_lasx.c file.

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx.
  2022-08-29 11:26 [FFmpeg-devel] Add loongarch SIMD optimization in swscale lib Hao Chen
@ 2022-08-29 11:26 ` Hao Chen
  2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c files Hao Chen
  2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file Hao Chen
  2 siblings, 0 replies; 13+ messages in thread
From: Hao Chen @ 2022-08-29 11:26 UTC (permalink / raw)
  To: ffmpeg-devel

ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -y /dev/null -an
before: 101fps
after:  138fps

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 libswscale/loongarch/Makefile                 |   3 +
 libswscale/loongarch/input_lasx.c             | 192 ++++
 libswscale/loongarch/swscale_init_loongarch.c |  50 +
 libswscale/loongarch/swscale_lasx.c           | 959 ++++++++++++++++++
 libswscale/loongarch/swscale_loongarch.h      |  50 +
 libswscale/swscale.c                          |   2 +
 libswscale/swscale_internal.h                 |   2 +
 libswscale/utils.c                            |  13 +-
 8 files changed, 1270 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/loongarch/Makefile
 create mode 100644 libswscale/loongarch/input_lasx.c
 create mode 100644 libswscale/loongarch/swscale_init_loongarch.c
 create mode 100644 libswscale/loongarch/swscale_lasx.c
 create mode 100644 libswscale/loongarch/swscale_loongarch.h

diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
new file mode 100644
index 0000000000..586a1717b6
--- /dev/null
+++ b/libswscale/loongarch/Makefile
@@ -0,0 +1,3 @@
+OBJS-$(CONFIG_SWSCALE)      += loongarch/swscale_init_loongarch.o
+LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
+                               loongarch/input_lasx.o   \
diff --git a/libswscale/loongarch/input_lasx.c b/libswscale/loongarch/input_lasx.c
new file mode 100644
index 0000000000..6fcb998bc0
--- /dev/null
+++ b/libswscale/loongarch/input_lasx.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+                           int width, int32_t *rgb2yuv)
+{   /* LASX: planes src[0]=G, src[1]=B, src[2]=R -> width U and V samples */
+    int i;
+    uint16_t *dstU   = (uint16_t *)_dstU; /* results are stored as 16-bit values */
+    uint16_t *dstV   = (uint16_t *)_dstV;
+    int set          = 0x4001 << (RGB2YUV_SHIFT - 7); /* constant added to every sum before the shift */
+    int len          = width - 15; /* the vector loop needs 16 pixels left */
+    int32_t tem_ru   = rgb2yuv[RU_IDX], tem_gu = rgb2yuv[GU_IDX];
+    int32_t tem_bu = rgb2yuv[BU_IDX], tem_rv   = rgb2yuv[RV_IDX];
+    int32_t tem_gv = rgb2yuv[GV_IDX], tem_bv = rgb2yuv[BV_IDX];
+    int shift        = RGB2YUV_SHIFT - 6;
+    const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
+    __m256i ru, gu, bu, rv, gv, bv;
+    __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
+                    0x0D0C090805040100, 0x1D1C191815141110}; /* byte indices 0-1 of every 32-bit word */
+    __m256i temp = __lasx_xvreplgr2vr_w(set);   /* bias in every 32-bit lane */
+    __m256i sra  = __lasx_xvreplgr2vr_w(shift); /* shift count in every 32-bit lane */
+
+    ru = __lasx_xvreplgr2vr_w(tem_ru);
+    gu = __lasx_xvreplgr2vr_w(tem_gu);
+    bu = __lasx_xvreplgr2vr_w(tem_bu);
+    rv = __lasx_xvreplgr2vr_w(tem_rv);
+    gv = __lasx_xvreplgr2vr_w(tem_gv);
+    bv = __lasx_xvreplgr2vr_w(tem_bv);
+    for (i = 0; i < len; i += 16) { /* 16 pixels per iteration */
+        __m256i _g, _b, _r;
+        __m256i g_l, g_h, b_l, b_h, r_l, r_h;
+        __m256i v_l, v_h, u_l, u_h, u_lh, v_lh;
+
+        DUP2_ARG2(__lasx_xvld, src0 + i, 0, src1 + i, 0, _g, _b); /* NOTE(review): a full 256-bit load, only 16 bytes are used - confirm row padding */
+        _r   = __lasx_xvld(src2 + i, 0);
+        g_l = __lasx_vext2xv_wu_bu(_g); /* widen pixels 0..7 to 32 bit */
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
+        _g   = __lasx_xvpermi_d(_g, 0x01); /* bring bytes 8..15 into the low doubleword */
+        _b   = __lasx_xvpermi_d(_b, 0x01);
+        _r   = __lasx_xvpermi_d(_r, 0x01);
+        g_h = __lasx_vext2xv_wu_bu(_g); /* widen pixels 8..15 to 32 bit */
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_h, r_h);
+        u_l  = __lasx_xvmadd_w(temp, ru, r_l); /* start from the bias, add coefficient * R */
+        u_h  = __lasx_xvmadd_w(temp, ru, r_h);
+        v_l  = __lasx_xvmadd_w(temp, rv, r_l);
+        v_h  = __lasx_xvmadd_w(temp, rv, r_h);
+        u_l  = __lasx_xvmadd_w(u_l, gu, g_l);
+        u_l  = __lasx_xvmadd_w(u_l, bu, b_l);
+        u_h  = __lasx_xvmadd_w(u_h, gu, g_h);
+        u_h  = __lasx_xvmadd_w(u_h, bu, b_h);
+        v_l  = __lasx_xvmadd_w(v_l, gv, g_l);
+        v_l  = __lasx_xvmadd_w(v_l, bv, b_l);
+        v_h  = __lasx_xvmadd_w(v_h, gv, g_h);
+        v_h  = __lasx_xvmadd_w(v_h, bv, b_h);
+        u_l  = __lasx_xvsra_w(u_l, sra); /* arithmetic right shift by RGB2YUV_SHIFT - 6 */
+        u_h  = __lasx_xvsra_w(u_h, sra);
+        v_l  = __lasx_xvsra_w(v_l, sra);
+        v_h  = __lasx_xvsra_w(v_h, sra);
+        DUP2_ARG3(__lasx_xvshuf_b, u_h, u_l, mask, v_h, v_l, mask, u_lh, v_lh); /* keep the low 16 bits of every word */
+        u_lh = __lasx_xvpermi_d(u_lh, 0xD8); /* doubleword order 0,2,1,3 puts the pixels back in sequence */
+        v_lh = __lasx_xvpermi_d(v_lh, 0xD8);
+        __lasx_xvst(u_lh, (dstU + i), 0);
+        __lasx_xvst(v_lh, (dstV + i), 0);
+    }
+    if (width - i >= 8) { /* at most one 8-pixel step */
+        __m256i _g, _b, _r;
+        __m256i g_l, b_l, r_l;
+        __m256i v_l, u_l, u, v;
+
+        _g  = __lasx_xvldrepl_d((src0 + i), 0); /* 8 bytes, replicated into every doubleword */
+        _b  = __lasx_xvldrepl_d((src1 + i), 0);
+        _r  = __lasx_xvldrepl_d((src2 + i), 0);
+        g_l = __lasx_vext2xv_wu_bu(_g);
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
+        u_l = __lasx_xvmadd_w(temp, ru, r_l);
+        v_l = __lasx_xvmadd_w(temp, rv, r_l);
+        u_l = __lasx_xvmadd_w(u_l, gu, g_l);
+        u_l = __lasx_xvmadd_w(u_l, bu, b_l);
+        v_l = __lasx_xvmadd_w(v_l, gv, g_l);
+        v_l = __lasx_xvmadd_w(v_l, bv, b_l);
+        u_l = __lasx_xvsra_w(u_l, sra);
+        v_l = __lasx_xvsra_w(v_l, sra);
+        DUP2_ARG3(__lasx_xvshuf_b, u_l, u_l, mask, v_l, v_l, mask, u, v);
+        __lasx_xvstelm_d(u, (dstU + i), 0, 0); /* doubleword 0 -> pixels i..i+3 */
+        __lasx_xvstelm_d(u, (dstU + i), 8, 2); /* doubleword 2 at byte offset 8 -> pixels i+4..i+7 */
+        __lasx_xvstelm_d(v, (dstV + i), 0, 0);
+        __lasx_xvstelm_d(v, (dstV + i), 8, 2);
+        i += 8;
+    }
+    for (; i < width; i++) { /* scalar tail for the remaining pixels */
+        int g = src[0][i];
+        int b = src[1][i];
+        int r = src[2][i];
+
+        dstU[i] = (tem_ru * r + tem_gu * g + tem_bu * b + set) >> shift;
+        dstV[i] = (tem_rv * r + tem_gv * g + tem_bv * b + set) >> shift;
+    }
+}
+
+void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
+                          int32_t *rgb2yuv)
+{   /* LASX: planes src[0]=G, src[1]=B, src[2]=R -> width luma samples */
+    int i;
+    int shift        = (RGB2YUV_SHIFT - 6);
+    int set          = 0x801 << (RGB2YUV_SHIFT - 7); /* constant added to every sum before the shift */
+    int len          = width - 15; /* the vector loop needs 16 pixels left */
+    uint16_t *dst    = (uint16_t *)_dst; /* results are stored as 16-bit values */
+    int32_t tem_ry   = rgb2yuv[RY_IDX], tem_gy = rgb2yuv[GY_IDX];
+    int32_t tem_by   = rgb2yuv[BY_IDX];
+    const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
+    __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
+                    0x0D0C090805040100, 0x1D1C191815141110}; /* byte indices 0-1 of every 32-bit word */
+    __m256i temp = __lasx_xvreplgr2vr_w(set);   /* bias in every 32-bit lane */
+    __m256i sra  = __lasx_xvreplgr2vr_w(shift); /* shift count in every 32-bit lane */
+    __m256i ry   = __lasx_xvreplgr2vr_w(tem_ry);
+    __m256i gy   = __lasx_xvreplgr2vr_w(tem_gy);
+    __m256i by   = __lasx_xvreplgr2vr_w(tem_by);
+
+    for (i = 0; i < len; i += 16) { /* 16 pixels per iteration */
+        __m256i _g, _b, _r;
+        __m256i g_l, g_h, b_l, b_h, r_l, r_h;
+        __m256i y_l, y_h, y_lh;
+
+        DUP2_ARG2(__lasx_xvld, src0 + i, 0, src1 + i, 0, _g, _b); /* NOTE(review): a full 256-bit load, only 16 bytes are used - confirm row padding */
+        _r   = __lasx_xvld(src2 + i, 0);
+        g_l = __lasx_vext2xv_wu_bu(_g); /* widen pixels 0..7 to 32 bit */
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
+        _g   = __lasx_xvpermi_d(_g, 0x01); /* bring bytes 8..15 into the low doubleword */
+        _b   = __lasx_xvpermi_d(_b, 0x01);
+        _r   = __lasx_xvpermi_d(_r, 0x01);
+        g_h  = __lasx_vext2xv_wu_bu(_g); /* widen pixels 8..15 to 32 bit */
+        DUP2_ARG1(__lasx_vext2xv_wu_bu,_b, _r, b_h, r_h);
+        y_l  = __lasx_xvmadd_w(temp, ry, r_l); /* start from the bias, add coefficient * R */
+        y_h  = __lasx_xvmadd_w(temp, ry, r_h);
+        y_l  = __lasx_xvmadd_w(y_l, gy, g_l);
+        y_l  = __lasx_xvmadd_w(y_l, by, b_l);
+        y_h  = __lasx_xvmadd_w(y_h, gy, g_h);
+        y_h  = __lasx_xvmadd_w(y_h, by, b_h);
+        y_l  = __lasx_xvsra_w(y_l, sra); /* arithmetic right shift by RGB2YUV_SHIFT - 6 */
+        y_h  = __lasx_xvsra_w(y_h, sra);
+        y_lh = __lasx_xvshuf_b(y_h, y_l, mask); /* keep the low 16 bits of every word */
+        y_lh = __lasx_xvpermi_d(y_lh, 0xD8); /* doubleword order 0,2,1,3 puts the pixels back in sequence */
+        __lasx_xvst(y_lh, (dst + i), 0);
+    }
+    if (width - i >= 8) { /* at most one 8-pixel step */
+        __m256i _g, _b, _r;
+        __m256i g_l, b_l, r_l;
+        __m256i y_l, y;
+
+        _g  = __lasx_xvldrepl_d((src0 + i), 0); /* 8 bytes, replicated into every doubleword */
+        _b  = __lasx_xvldrepl_d((src1 + i), 0);
+        _r  = __lasx_xvldrepl_d((src2 + i), 0);
+        g_l = __lasx_vext2xv_wu_bu(_g);
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
+        y_l = __lasx_xvmadd_w(temp, ry, r_l);
+        y_l = __lasx_xvmadd_w(y_l, gy, g_l);
+        y_l = __lasx_xvmadd_w(y_l, by, b_l);
+        y_l = __lasx_xvsra_w(y_l, sra);
+        y = __lasx_xvshuf_b(y_l, y_l, mask);
+        __lasx_xvstelm_d(y, (dst + i), 0, 0); /* doubleword 0 -> pixels i..i+3 */
+        __lasx_xvstelm_d(y, (dst + i), 8, 2); /* doubleword 2 at byte offset 8 -> pixels i+4..i+7 */
+        i += 8;
+    }
+    for (; i < width; i++) { /* scalar tail for the remaining pixels */
+        int g = src[0][i];
+        int b = src[1][i];
+        int r = src[2][i];
+
+        dst[i] = (tem_ry * r + tem_gy * g + tem_by * b + set) >> shift;
+    }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
new file mode 100644
index 0000000000..197dc6e1e7
--- /dev/null
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/loongarch/cpu.h"
+
+av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
+{
+    /* Install the LASX routines into the context; when LASX is not
+     * available every function pointer is left untouched. */
+    int cpuflags = av_get_cpu_flags();
+
+    if (!have_lasx(cpuflags))
+        return;
+
+    /* Horizontal scalers, chosen by source and destination bit depth. */
+    if (c->srcBpc != 8) {
+        c->hyScale = c->hcScale = c->dstBpc <= 14 ? ff_hscale_16_to_15_lasx
+                                                  : ff_hscale_16_to_19_lasx;
+    } else if (c->dstBpc > 14) {
+        c->hyScale = c->hcScale = ff_hscale_8_to_19_lasx;
+    } else {
+        c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
+    }
+
+    /* GBRP / GBRAP sources get the vectorised planar readers. */
+    if (c->srcFormat == AV_PIX_FMT_GBRP || c->srcFormat == AV_PIX_FMT_GBRAP) {
+        c->readLumPlanar = planar_rgb_to_y_lasx;
+        c->readChrPlanar = planar_rgb_to_uv_lasx;
+    }
+}
diff --git a/libswscale/loongarch/swscale_lasx.c b/libswscale/loongarch/swscale_lasx.c
new file mode 100644
index 0000000000..53bfdf7e5c
--- /dev/null
+++ b/libswscale/loongarch/swscale_lasx.c
@@ -0,0 +1,959 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "libavutil/intreadwrite.h"
+
+#define SCALE_8_16(_sh) /* 8 taps x 16 outputs, stores to dst */     \
+{   /* uses caller locals: src, filter, filterPos, dst, vmax, shuf */ \
+    src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
+    src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);               \
+    src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);               \
+    src4    = __lasx_xvldrepl_d(src + filterPos[4], 0);               \
+    src5    = __lasx_xvldrepl_d(src + filterPos[5], 0);               \
+    src6    = __lasx_xvldrepl_d(src + filterPos[6], 0);               \
+    src7    = __lasx_xvldrepl_d(src + filterPos[7], 0);               \
+    src8    = __lasx_xvldrepl_d(src + filterPos[8], 0);               \
+    src9    = __lasx_xvldrepl_d(src + filterPos[9], 0);               \
+    src10   = __lasx_xvldrepl_d(src + filterPos[10], 0);              \
+    src11   = __lasx_xvldrepl_d(src + filterPos[11], 0);              \
+    src12   = __lasx_xvldrepl_d(src + filterPos[12], 0);              \
+    src13   = __lasx_xvldrepl_d(src + filterPos[13], 0);              \
+    src14   = __lasx_xvldrepl_d(src + filterPos[14], 0);              \
+    src15   = __lasx_xvldrepl_d(src + filterPos[15], 0);              \
+    DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
+              filter, 96, filter0, filter1, filter2, filter3);        \
+    DUP4_ARG2(__lasx_xvld, filter, 128, filter, 160,                  \
+              filter, 192, filter, 224, filter4,                      \
+              filter5, filter6, filter7);                             \
+    DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2,                \
+              src5, src4, src7, src6, src0, src2, src4, src6);        \
+    DUP4_ARG2(__lasx_xvilvl_d, src9, src8, src11, src10,              \
+              src13, src12, src15, src14, src8, src10, src12, src14); \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6,           \
+              src0, src2, src4, src6);                                \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, src8, src10, src12,               \
+              src14, src8, src10, src12, src14);                      \
+    DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2,         \
+              filter2, src4, filter3, src6, src0, src1, src2, src3);  \
+    DUP4_ARG2(__lasx_xvdp2_w_h, filter4, src8, filter5, src10,        \
+              filter6, src12, filter7, src14, src4, src5, src6, src7);\
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
+    src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
+    src4 = __lasx_xvhaddw_d_w(src4, src4);                            \
+    src5 = __lasx_xvhaddw_d_w(src5, src5);                            \
+    src6 = __lasx_xvhaddw_d_w(src6, src6);                            \
+    src7 = __lasx_xvhaddw_d_w(src7, src7);                            \
+    DUP4_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2,              \
+              src5, src4, src7, src6, src0, src1, src2, src3);        \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
+    src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
+    DUP2_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, src0, src1); \
+    DUP2_ARG2(__lasx_xvsrai_w, src0, _sh, src1, _sh, src0, src1);     \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    src1 = __lasx_xvmin_w(src1, vmax);                                \
+    src0 = __lasx_xvperm_w(src0, shuf);                               \
+    src1 = __lasx_xvperm_w(src1, shuf);                               \
+    src0 = __lasx_xvpickev_h(src1, src0);                             \
+    src0 = __lasx_xvpermi_d(src0, 0xd8);                              \
+    __lasx_xvst(src0, dst, 0);                                        \
+    filterPos += 16; /* one entry per output */                       \
+    filter    += 128; /* 16 outputs * 8 coefficients */               \
+    dst       += 16;                                                  \
+}
+
+#define SCALE_8_8(_sh) /* 8 taps x 8 outputs, result left in src0 */  \
+{   /* no store or pointer update here; the caller does both */       \
+    src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
+    src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);               \
+    src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);               \
+    src4    = __lasx_xvldrepl_d(src + filterPos[4], 0);               \
+    src5    = __lasx_xvldrepl_d(src + filterPos[5], 0);               \
+    src6    = __lasx_xvldrepl_d(src + filterPos[6], 0);               \
+    src7    = __lasx_xvldrepl_d(src + filterPos[7], 0);               \
+    DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
+              filter, 96, filter0, filter1, filter2, filter3);        \
+    DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2,                \
+              src5, src4, src7, src6, src0, src2, src4, src6);        \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6,           \
+              src0, src2, src4, src6);                                \
+    DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2,         \
+              filter2, src4, filter3, src6, src0, src1, src2,src3);   \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
+    src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
+    DUP2_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, src0, src1); \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src0 = __lasx_xvpickev_w(src1, src0);                             \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    src0 = __lasx_xvperm_w(src0, shuf);                               \
+}
+
+#define SCALE_8_4(_sh) /* 8 taps x 4 outputs, result left in src0 */      \
+{   /* no store or pointer update here; the caller does both */           \
+    src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);                   \
+    src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);                   \
+    src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);                   \
+    src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);                   \
+    DUP2_ARG2(__lasx_xvld, filter, 0, filter, 32, filter0, filter1);      \
+    DUP2_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, src0, src2);       \
+    DUP2_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src0, src2);              \
+    DUP2_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, src0, src1);\
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                                \
+    src0 = __lasx_xvpickev_w(src1, src0);                                 \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
+    src0 = __lasx_xvpickev_w(src0, src0);                                 \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                    \
+    src0 = __lasx_xvmin_w(src0, vmax);                                    \
+    src0 = __lasx_xvperm_w(src0, shuf);                                   \
+}
+
+#define SCALE_8_2(_sh) /* 8 taps x 2 outputs, stores to dst */        \
+{   /* uses caller locals: src, filter, filterPos, dst, vmax */       \
+    src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
+    filter0 = __lasx_xvld(filter, 0);                                 \
+    src0 = __lasx_xvilvl_d(src1, src0);                               \
+    src0 = __lasx_vext2xv_hu_bu(src0);                                \
+    src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src0 = __lasx_xvhaddw_q_d(src0, src0);                            \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    dst[0] = __lasx_xvpickve2gr_w(src0, 0); /* low 128-bit half */    \
+    dst[1] = __lasx_xvpickve2gr_w(src0, 4); /* high 128-bit half */   \
+    filterPos += 2;                                                   \
+    filter    += 16; /* 2 outputs * 8 coefficients */                 \
+    dst       += 2;                                                   \
+}
+
+#define SCALE_4_16(_sh) /* 4 taps x 16 outputs, stores to dst */     \
+{   /* uses caller locals: src, filter, filterPos, dst, vmax, shuf */ \
+    src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
+    src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);               \
+    src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);               \
+    src4    = __lasx_xvldrepl_w(src + filterPos[4], 0);               \
+    src5    = __lasx_xvldrepl_w(src + filterPos[5], 0);               \
+    src6    = __lasx_xvldrepl_w(src + filterPos[6], 0);               \
+    src7    = __lasx_xvldrepl_w(src + filterPos[7], 0);               \
+    src8    = __lasx_xvldrepl_w(src + filterPos[8], 0);               \
+    src9    = __lasx_xvldrepl_w(src + filterPos[9], 0);               \
+    src10   = __lasx_xvldrepl_w(src + filterPos[10], 0);              \
+    src11   = __lasx_xvldrepl_w(src + filterPos[11], 0);              \
+    src12   = __lasx_xvldrepl_w(src + filterPos[12], 0);              \
+    src13   = __lasx_xvldrepl_w(src + filterPos[13], 0);              \
+    src14   = __lasx_xvldrepl_w(src + filterPos[14], 0);              \
+    src15   = __lasx_xvldrepl_w(src + filterPos[15], 0);              \
+    DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
+              filter, 96, filter0, filter1, filter2, filter3);        \
+    DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5,          \
+              src4, src7, src6, src0, src2, src4, src6);              \
+    DUP4_ARG2(__lasx_xvilvl_w, src9, src8, src11, src10, src13,       \
+              src12, src15, src14, src8, src10, src12, src14);        \
+    DUP4_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src10,         \
+                   src8, src14, src12, src0, src1, src2, src3);       \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src2, src3,           \
+              src0, src1, src2, src3);                                \
+    DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1,         \
+              filter2, src2, filter3, src3, src0, src1, src2, src3);  \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
+    src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
+    DUP2_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, src0, src1); \
+    DUP2_ARG2(__lasx_xvsrai_w, src0, _sh, src1, _sh, src0, src1);     \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    src1 = __lasx_xvmin_w(src1, vmax);                                \
+    src0 = __lasx_xvpickev_h(src1, src0);                             \
+    src0 = __lasx_xvperm_w(src0, shuf);                               \
+    __lasx_xvst(src0, dst, 0);                                        \
+    filterPos += 16; /* one entry per output */                       \
+    filter    += 64; /* 16 outputs * 4 coefficients */                \
+    dst       += 16;                                                  \
+}
+
+#define SCALE_4_8(_sh) /* 4 taps x 8 outputs, result left in src0 */      \
+{   /* no store or pointer update inside this macro */                    \
+    src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);                   \
+    src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);                   \
+    src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);                   \
+    src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);                   \
+    src4    = __lasx_xvldrepl_w(src + filterPos[4], 0);                   \
+    src5    = __lasx_xvldrepl_w(src + filterPos[5], 0);                   \
+    src6    = __lasx_xvldrepl_w(src + filterPos[6], 0);                   \
+    src7    = __lasx_xvldrepl_w(src + filterPos[7], 0);                   \
+    DUP2_ARG2(__lasx_xvld, filter, 0, filter, 32, filter0, filter1);      \
+    DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5,              \
+              src4, src7, src6, src0, src2, src4, src6);                  \
+    DUP2_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src0, src1);       \
+                                                                          \
+    DUP2_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src0, src1);              \
+    DUP2_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1, src0, src1);\
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                                \
+    src0 = __lasx_xvpickev_w(src1, src0);                                 \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                    \
+    src0 = __lasx_xvmin_w(src0, vmax);                                    \
+}
+
+#define SCALE_4_4(_sh) /* 4 taps x 4 outputs, result left in src0 */  \
+{   /* no store or pointer update inside this macro */                \
+    src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
+    src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);               \
+    src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);               \
+    filter0 = __lasx_xvld(filter, 0);                                 \
+    DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);   \
+                                                                      \
+    src0 = __lasx_xvilvl_d(src1, src0);                               \
+    src0 = __lasx_vext2xv_hu_bu(src0);                                \
+    src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    src0 = __lasx_xvpickev_w(src0, src0);                             \
+    src0 = __lasx_xvpermi_d(src0, 0xd8);                              \
+}
+
+#define SCALE_4_2(_sh) /* 4 taps x 2 outputs, stores to dst */        \
+{   /* uses caller locals: src, filter, filterPos, dst, vmax */       \
+    src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
+    filter0 = __lasx_xvld(filter, 0);                                 \
+    src0 = __lasx_xvilvl_w(src1, src0);                               \
+    src0 = __lasx_vext2xv_hu_bu(src0);                                \
+    src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    dst[0] = __lasx_xvpickve2gr_w(src0, 0); /* 32-bit lane 0 */       \
+    dst[1] = __lasx_xvpickve2gr_w(src0, 2); /* 32-bit lane 2 */       \
+    filterPos += 2;                                                   \
+    filter    += 8; /* 2 outputs * 4 coefficients */                  \
+    dst       += 2;                                                   \
+}
+
+#define SCALE_16 /* adds one step of partial sums into out */         \
+{ /* uses caller locals j, srcPos1..4, filterStart1..4, zero, out */  \
+    int dex  = j << 1; /* offset for xvldx: twice the tap index j */  \
+    src0     = __lasx_xvldrepl_d((srcPos1 + j), 0);                   \
+    src1     = __lasx_xvldrepl_d((srcPos2 + j), 0);                   \
+    src2     = __lasx_xvldrepl_d((srcPos3 + j), 0);                   \
+    src3     = __lasx_xvldrepl_d((srcPos4 + j), 0);                   \
+    DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex,     \
+              filterStart3, dex, filterStart4, dex, filter0,          \
+              filter1, filter2, filter3);                             \
+    src0     = __lasx_xvpermi_q(src0, src1, 0x02);                    \
+    src1     = __lasx_xvpermi_q(src2, src3, 0x02);                    \
+    filter0  = __lasx_xvpermi_q(filter0, filter1, 0x02);              \
+    filter1  = __lasx_xvpermi_q(filter2, filter3, 0x02);              \
+    DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0, src1);   \
+    DUP2_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1,         \
+              out0, out1);                                            \
+    src0     = __lasx_xvhaddw_d_w(out0, out0);                        \
+    src1     = __lasx_xvhaddw_d_w(out1, out1);                        \
+    out0     = __lasx_xvpackev_d(src1, src0);                         \
+    out1     = __lasx_xvpackod_d(src1, src0);                         \
+    out0     = __lasx_xvadd_w(out0, out1);                            \
+    out      = __lasx_xvadd_w(out, out0); /* accumulate into out */   \
+}
+
+void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{ /* Horizontal filter: uint8_t src -> int16_t dst; each output is FFMIN(sum >> 7, max). */
+    int i;
+    int max = (1 << 15) - 1; /* upper bound applied to every output */
+
+    if (filterSize == 8) { /* 8-tap path; the SCALE_8_* macros are defined outside this view */
+        __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+        __m256i src8, src9, src10, src11, src12, src13, src14, src15;
+        __m256i filter0, filter1, filter2, filter3;
+        __m256i filter4, filter5, filter6, filter7;
+        __m256i vmax = __lasx_xvreplgr2vr_w(max);
+        __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+                        0x0000000600000002, 0x0000000700000003}; /* word order 0,4,1,5,2,6,3,7 */
+        int len = dstW >> 4; /* number of full groups of 16 outputs */
+        int res = dstW & 15; /* leftover outputs, handled as 8 + 4 + 2 + 1 below */
+        while (len--) {
+            SCALE_8_16(7); /* NOTE(review): presumably stores 16 outputs and advances the pointers itself — confirm */
+        }
+        if (res & 8) {
+            SCALE_8_8(7);
+            src0 = __lasx_xvpickev_h(src0, src0); /* keep the even halfwords of src0 */
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            __lasx_xvstelm_d(src0, dst, 8, 2);
+            filterPos += 8;
+            filter    += 64; /* 8 outputs * 8 taps */
+            dst       += 8;
+        }
+        if (res & 4) {
+            SCALE_8_4(7);
+            src0 = __lasx_xvpickev_h(src0, src0);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            filterPos += 4;
+            filter    += 32; /* 4 outputs * 8 taps */
+            dst       += 4;
+        }
+        if (res & 2) {
+            SCALE_8_2(7); /* no pointer update here; presumably done inside the macro — confirm */
+        }
+        if (res & 1) { /* last single output */
+            int val = 0;
+            src0    = __lasx_xvldrepl_d(src + filterPos[0], 0); /* 8 source bytes */
+            filter0 = __lasx_xvld(filter, 0);
+            src0 = __lasx_vext2xv_hu_bu(src0); /* widen u8 to 16 bit */
+            src0 = __lasx_xvdp2_w_h(filter0, src0);
+            src0    = __lasx_xvhaddw_d_w(src0, src0);
+            src0    = __lasx_xvhaddw_q_d(src0, src0);
+            val     = __lasx_xvpickve2gr_w(src0, 0); /* word 0 holds the reduced sum */
+            dst[0]  = FFMIN(val >> 7, max);
+        }
+    } else if (filterSize == 4) { /* 4-tap path; the SCALE_4_* macros are defined outside this view */
+        __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+        __m256i src8, src9, src10, src11, src12, src13, src14, src15;
+        __m256i filter0, filter1, filter2, filter3;
+        __m256i vmax = __lasx_xvreplgr2vr_w(max);
+        __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+                        0x0000000600000002, 0x0000000700000003}; /* word order 0,4,1,5,2,6,3,7 */
+        int len = dstW >> 4; /* number of full groups of 16 outputs */
+        int res = dstW & 15; /* leftover outputs, handled as 8 + 4 + 2 + 1 below */
+        while (len--) {
+            SCALE_4_16(7); /* NOTE(review): presumably stores 16 outputs and advances the pointers itself — confirm */
+        }
+        if (res & 8) {
+            SCALE_4_8(7);
+            src0 = __lasx_xvpickev_h(src1, src0);
+            src0 = __lasx_xvperm_w(src0, shuf); /* reorder words with the shuf table above */
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            __lasx_xvstelm_d(src0, dst, 8, 1);
+            filterPos += 8;
+            filter    += 32; /* 8 outputs * 4 taps */
+            dst       += 8;
+        }
+        if (res & 4) {
+            SCALE_4_4(7);
+            src0 = __lasx_xvpickev_h(src0, src0);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            filterPos += 4;
+            filter    += 16; /* 4 outputs * 4 taps */
+            dst       += 4;
+        }
+        if (res & 2) {
+            SCALE_4_2(7); /* no pointer update here; presumably done inside the macro — confirm */
+        }
+        if (res & 1) { /* last single output, plain C */
+            int val = 0;
+            const uint8_t *srcPos = src + filterPos[0];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[0] = FFMIN(val >> 7, max);
+        }
+    } else if (filterSize > 8) { /* long filters: vector over 8-tap groups, scalar for the rest */
+        int filterlen = filterSize - 7; /* j < filterlen means a full group of 8 taps remains */
+        int len = dstW >> 2; /* 4 outputs per iteration */
+        int res = dstW & 3;
+        __m256i zero = __lasx_xvldi(0);
+
+        while (len--) {
+            __m256i src0, src1, src2, src3;
+            __m256i filter0, filter1, filter2, filter3, out0, out1;
+            __m256i out = zero; /* accumulator updated by SCALE_16 */
+            const uint8_t *srcPos1 = src + filterPos[0];
+            const uint8_t *srcPos2 = src + filterPos[1];
+            const uint8_t *srcPos3 = src + filterPos[2];
+            const uint8_t *srcPos4 = src + filterPos[3];
+            const int16_t *filterStart1 = filter;
+            const int16_t *filterStart2 = filterStart1 + filterSize;
+            const int16_t *filterStart3 = filterStart2 + filterSize;
+            const int16_t *filterStart4 = filterStart3 + filterSize;
+            int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                SCALE_16
+            }
+            val1 = __lasx_xvpickve2gr_w(out, 0); /* sums sit in words 0,4,2,6 for outputs 1..4 */
+            val2 = __lasx_xvpickve2gr_w(out, 4);
+            val3 = __lasx_xvpickve2gr_w(out, 2);
+            val4 = __lasx_xvpickve2gr_w(out, 6);
+            for (; j < filterSize; j++) { /* taps left after the last full group of 8 */
+                val1 += ((int)srcPos1[j]) * filterStart1[j];
+                val2 += ((int)srcPos2[j]) * filterStart2[j];
+                val3 += ((int)srcPos3[j]) * filterStart3[j];
+                val4 += ((int)srcPos4[j]) * filterStart4[j];
+            }
+            dst[0] = FFMIN(val1 >> 7, max);
+            dst[1] = FFMIN(val2 >> 7, max);
+            dst[2] = FFMIN(val3 >> 7, max);
+            dst[3] = FFMIN(val4 >> 7, max);
+            dst       += 4;
+            filterPos += 4;
+            filter     = filterStart4 + filterSize; /* first coefficient of the next output */
+        }
+        for(i = 0; i < res; i++) { /* remaining dstW & 3 outputs, one at a time */
+            int j, val = 0;
+            const uint8_t *srcPos = src + filterPos[i];
+            __m256i src1, filter0, out0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                src1   = __lasx_xvldrepl_d((srcPos + j), 0);
+                filter0 = __lasx_xvld(filter + j, 0);
+                src1 = __lasx_xvilvl_b(zero, src1); /* widen u8 to 16 bit */
+                out0 = __lasx_xvdp2_w_h(filter0, src1);
+                out0 = __lasx_xvhaddw_d_w(out0, out0);
+                out0 = __lasx_xvhaddw_q_d(out0, out0);
+                val += __lasx_xvpickve2gr_w(out0, 0);
+            }
+            for (; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> 7, max);
+            filter += filterSize;
+        }
+    } else { /* filterSize below 8 and not 4: plain C loop */
+        for (i = 0; i < dstW; i++) {
+            int val = 0;
+            const uint8_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> 7, max);
+            filter += filterSize;
+        }
+    }
+}
+
+void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{ /* Horizontal filter: uint8_t src -> int32_t dst; each output is FFMIN(sum >> 3, max). */
+    int i;
+    int max = (1 << 19) - 1; /* upper bound applied to every output */
+    int32_t *dst = (int32_t *) _dst; /* outputs are written as 32-bit values */
+
+    if (filterSize == 8) { /* 8-tap path; the SCALE_8_* macros are defined outside this view */
+        __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+        __m256i filter0, filter1, filter2, filter3;
+        __m256i vmax = __lasx_xvreplgr2vr_w(max);
+        __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+                        0x0000000600000002, 0x0000000700000003}; /* word order 0,4,1,5,2,6,3,7 */
+        int len = dstW >> 3; /* number of full groups of 8 outputs */
+        int res = dstW & 7;  /* leftover outputs, handled as 4 + 2 + 1 below */
+        while (len--) {
+            SCALE_8_8(3);
+            __lasx_xvst(src0, dst, 0); /* 8 x int32_t */
+            filterPos += 8;
+            filter    += 64; /* 8 outputs * 8 taps */
+            dst       += 8;
+        }
+        if (res & 4) {
+            SCALE_8_4(3);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            __lasx_xvstelm_d(src0, dst, 8, 1);
+            filterPos += 4;
+            filter    += 32; /* 4 outputs * 8 taps */
+            dst       += 4;
+        }
+        if (res & 2) {
+            SCALE_8_2(3); /* no pointer update here; presumably done inside the macro — confirm */
+        }
+        if (res & 1) { /* last single output */
+            int val = 0;
+            __m256i src0, filter0, out0;
+
+            src0    = __lasx_xvldrepl_d(src + filterPos[0], 0); /* 8 source bytes */
+            filter0 = __lasx_xvld(filter, 0);
+            src0 = __lasx_vext2xv_hu_bu(src0); /* widen u8 to 16 bit */
+            out0 = __lasx_xvdp2_w_h(filter0, src0);
+            out0    = __lasx_xvhaddw_d_w(out0, out0);
+            out0    = __lasx_xvhaddw_q_d(out0, out0);
+            val     = __lasx_xvpickve2gr_w(out0, 0); /* word 0 holds the reduced sum */
+            dst[0]  = FFMIN(val >> 3, max);
+        }
+    } else if (filterSize == 4) { /* 4-tap path; the SCALE_4_* macros are defined outside this view */
+        __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+        __m256i filter0, filter1;
+        __m256i vmax = __lasx_xvreplgr2vr_w(max);
+        __m256i shuf = {0x0000000100000000, 0x0000000500000004,
+                        0x0000000300000002, 0x0000000700000006}; /* word order 0,1,4,5,2,3,6,7 */
+        int len = dstW >> 3; /* number of full groups of 8 outputs */
+        int res = dstW & 7;  /* leftover outputs, handled as 4 + 2 + 1 below */
+        while (len--) {
+            SCALE_4_8(3);
+            src0 = __lasx_xvperm_w(src0, shuf); /* reorder words with the shuf table above */
+            __lasx_xvst(src0, dst, 0);
+            filterPos += 8;
+            filter    += 32; /* 8 outputs * 4 taps */
+            dst       += 8;
+        }
+        if (res & 4) {
+            SCALE_4_4(3);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            __lasx_xvstelm_d(src0, dst, 8, 1);
+            filterPos += 4;
+            filter    += 16; /* 4 outputs * 4 taps */
+            dst       += 4;
+        }
+        if (res & 2) {
+            SCALE_4_2(3); /* no pointer update here; presumably done inside the macro — confirm */
+        }
+        if (res & 1) { /* last single output, plain C */
+            int val = 0;
+            const uint8_t *srcPos = src + filterPos[0];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[0] = FFMIN(val >> 3, max);
+        }
+    } else if (filterSize > 8) { /* long filters: vector over 8-tap groups, scalar for the rest */
+        int len = dstW >> 2; /* 4 outputs per iteration */
+        int res = dstW & 3;
+        int filterlen = filterSize - 7; /* j < filterlen means a full group of 8 taps remains */
+        __m256i zero = __lasx_xvldi(0);
+
+        while (len--) {
+            __m256i src0, src1, src2, src3;
+            __m256i filter0, filter1, filter2, filter3, out0, out1;
+            __m256i out = zero; /* accumulator updated by SCALE_16 */
+            const uint8_t *srcPos1 = src + filterPos[0];
+            const uint8_t *srcPos2 = src + filterPos[1];
+            const uint8_t *srcPos3 = src + filterPos[2];
+            const uint8_t *srcPos4 = src + filterPos[3];
+            const int16_t *filterStart1 = filter;
+            const int16_t *filterStart2 = filterStart1 + filterSize;
+            const int16_t *filterStart3 = filterStart2 + filterSize;
+            const int16_t *filterStart4 = filterStart3 + filterSize;
+            int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                SCALE_16
+            }
+            val1 = __lasx_xvpickve2gr_w(out, 0); /* sums sit in words 0,4,2,6 for outputs 1..4 */
+            val2 = __lasx_xvpickve2gr_w(out, 4);
+            val3 = __lasx_xvpickve2gr_w(out, 2);
+            val4 = __lasx_xvpickve2gr_w(out, 6);
+            for (; j < filterSize; j++) { /* taps left after the last full group of 8 */
+                val1 += ((int)srcPos1[j]) * filterStart1[j];
+                val2 += ((int)srcPos2[j]) * filterStart2[j];
+                val3 += ((int)srcPos3[j]) * filterStart3[j];
+                val4 += ((int)srcPos4[j]) * filterStart4[j];
+            }
+            dst[0] = FFMIN(val1 >> 3, max);
+            dst[1] = FFMIN(val2 >> 3, max);
+            dst[2] = FFMIN(val3 >> 3, max);
+            dst[3] = FFMIN(val4 >> 3, max);
+            dst       += 4;
+            filterPos += 4;
+            filter     = filterStart4 + filterSize; /* first coefficient of the next output */
+        }
+        for (i = 0; i < res; i++) { /* remaining dstW & 3 outputs, one at a time */
+            int j, val = 0;
+            const uint8_t *srcPos = src + filterPos[i];
+            __m256i src1, filter0, out0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                src1   = __lasx_xvldrepl_d((srcPos + j), 0);
+                filter0 = __lasx_xvld(filter + j, 0);
+                src1 = __lasx_xvilvl_b(zero, src1); /* widen u8 to 16 bit */
+                out0 = __lasx_xvdp2_w_h(filter0, src1);
+                out0 = __lasx_xvhaddw_d_w(out0, out0);
+                out0 = __lasx_xvhaddw_q_d(out0, out0);
+                val += __lasx_xvpickve2gr_w(out0, 0);
+            }
+            for (; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i] = FFMIN(val >> 3, max);
+            filter += filterSize;
+        }
+    } else { /* filterSize below 8 and not 4: plain C loop */
+        for (i = 0; i < dstW; i++) {
+            int val = 0;
+            const uint8_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> 3, max);
+            filter += filterSize;
+        }
+    }
+}
+
+#undef SCALE_16
+
+#define SCALE_8 /* 16-bit src, 8 taps: computes and stores 4 outputs */     \
+{                                                                            \
+    int val1, val2, val3, val4;                                              \
+    __m256i src0, src1, src2, src3, filter0, filter1, out0, out1;            \
+    DUP4_ARG2(__lasx_xvld, src + filterPos[0], 0, src + filterPos[1], 0,     \
+              src + filterPos[2], 0, src + filterPos[3], 0, src0, src1, src2,\
+              src3);                                                         \
+    DUP2_ARG2(__lasx_xvld, filter, 0, filter, 32, filter0, filter1); /* 2 x 32 bytes of coefficients */ \
+    src0    = __lasx_xvpermi_q(src0, src1, 0x02); /* pair outputs 1|2 */     \
+    src2    = __lasx_xvpermi_q(src2, src3, 0x02); /* pair outputs 3|4 */     \
+    DUP2_ARG2(__lasx_xvdp2_w_hu_h, src0, filter0, src2, filter1, out0, out1);\
+    src0    = __lasx_xvhaddw_d_w(out0, out0);                                \
+    src1    = __lasx_xvhaddw_d_w(out1, out1);                                \
+    out0    = __lasx_xvpackev_d(src1, src0);                                 \
+    out1    = __lasx_xvpackod_d(src1, src0);                                 \
+    out0    = __lasx_xvadd_w(out0, out1);                                    \
+    out0    = __lasx_xvsra_w(out0, shift); /* 'shift' comes from the caller */ \
+    val1    = __lasx_xvpickve2gr_w(out0, 0); /* read words 0,4,2,6 */        \
+    val2    = __lasx_xvpickve2gr_w(out0, 4);                                 \
+    val3    = __lasx_xvpickve2gr_w(out0, 2);                                 \
+    val4    = __lasx_xvpickve2gr_w(out0, 6);                                 \
+    dst[0]  = FFMIN(val1, max);                                              \
+    dst[1]  = FFMIN(val2, max);                                              \
+    dst[2]  = FFMIN(val3, max);                                              \
+    dst[3]  = FFMIN(val4, max);                                              \
+    filterPos += 4;                                                          \
+    filter += 32; /* 4 outputs * 8 taps */                                   \
+    dst += 4;                                                                \
+}
+
+#define SCALE_16 /* 16-bit src: add 8 taps of 4 outputs into 'out' */       \
+{                                                                            \
+    int dex = j << 1; /* element index j as a byte offset (2-byte elements) */ \
+    DUP4_ARG2(__lasx_xvldx, srcPos1, dex, srcPos2, dex, srcPos3, dex,        \
+              srcPos4, dex, src0, src1, src2, src3);                         \
+    DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex,            \
+              filterStart3, dex, filterStart4, dex, filter0,                 \
+              filter1, filter2, filter3);                                    \
+    src0     = __lasx_xvpermi_q(src0, src1, 0x02); /* pair 1|2 */            \
+    src1     = __lasx_xvpermi_q(src2, src3, 0x02); /* pair 3|4 */            \
+    filter0  = __lasx_xvpermi_q(filter0, filter1, 0x02);                     \
+    filter1  = __lasx_xvpermi_q(filter2, filter3, 0x02);                     \
+    DUP2_ARG2(__lasx_xvdp2_w_hu_h, src0, filter0, src1, filter1, out0, out1);\
+    src0     = __lasx_xvhaddw_d_w(out0, out0);                               \
+    src1     = __lasx_xvhaddw_d_w(out1, out1);                               \
+    out0     = __lasx_xvpackev_d(src1, src0);                                \
+    out1     = __lasx_xvpackod_d(src1, src0);                                \
+    out0     = __lasx_xvadd_w(out0, out1); /* callers read words 0,4,2,6 */  \
+    out      = __lasx_xvadd_w(out, out0); /* running per-output sum */       \
+}
+
+void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+                             const uint8_t *_src, const int16_t *filter,
+                             const int32_t *filterPos, int filterSize)
+{ /* Horizontal filter: uint16_t src -> int16_t dst; each output is FFMIN(sum >> sh, max). */
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int i;
+    const uint16_t *src = (const uint16_t *) _src; /* source samples are read as 16-bit */
+    int sh              = desc->comp[0].depth - 1; /* default shift, adjusted below */
+    int max = (1 << 15) - 1; /* upper bound applied to every output */
+    int len = dstW >> 2; /* 4 outputs per vector iteration */
+    int res = dstW & 3;  /* leftover outputs */
+    __m256i shift;
+    __m256i zero = __lasx_xvldi(0);
+
+    if (sh < 15) { /* component depth below 16 bits */
+        sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
+                      (desc->comp[0].depth - 1);
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* bitmask test: '&&' was true for any non-zero flags */
+        sh = 15;
+    }
+    shift = __lasx_xvreplgr2vr_w(sh); /* vector copy of sh for xvsra_w */
+
+    if (filterSize == 8) {
+        for (i = 0; i < len; i++) {
+            SCALE_8
+        }
+        for (i = 0; i < res; i++) { /* remaining outputs; filterPos/dst were advanced by SCALE_8 */
+            int val = 0;
+            __m256i src0, filter0, out0;
+
+            src0    = __lasx_xvld(src + filterPos[i], 0);
+            filter0 = __lasx_xvld(filter, 0);
+            out0 = __lasx_xvdp2_w_hu_h(src0, filter0); /* unsigned src * signed filter */
+            out0    = __lasx_xvhaddw_d_w(out0, out0);
+            out0    = __lasx_xvhaddw_q_d(out0, out0);
+            val     = __lasx_xvpickve2gr_w(out0, 0); /* word 0 holds the reduced sum */
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += 8;
+        }
+    } else if (filterSize == 4) {
+        for (i = 0; i < len; i++) {
+            __m256i src1, src2, src3, src4, src0, filter0, out0;
+
+            src1 = __lasx_xvldrepl_d(src + filterPos[0], 0); /* 4 samples (8 bytes) per output */
+            src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
+            src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
+            src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
+            filter0 = __lasx_xvld(filter, 0); /* 16 coefficients = 4 outputs * 4 taps */
+            src1 = __lasx_xvextrins_d(src1, src2, 0x10);
+            src3 = __lasx_xvextrins_d(src3, src4, 0x10);
+            src0 = __lasx_xvpermi_q(src1, src3, 0x02);
+            out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+            out0 = __lasx_xvhaddw_d_w(out0, out0);
+            out0 = __lasx_xvsra_w(out0, shift);
+            dst[0] = FFMIN((__lasx_xvpickve2gr_w(out0, 0)), max); /* results are read from words 0,2,4,6 */
+            dst[1] = FFMIN((__lasx_xvpickve2gr_w(out0, 2)), max);
+            dst[2] = FFMIN((__lasx_xvpickve2gr_w(out0, 4)), max);
+            dst[3] = FFMIN((__lasx_xvpickve2gr_w(out0, 6)), max);
+            dst       += 4;
+            filterPos += 4;
+            filter    += 16;
+        }
+        for (i = 0; i < res; i++) { /* remaining outputs, plain C */
+            int val = 0;
+            const uint16_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += 4;
+        }
+    } else if (filterSize > 8) { /* long filters: vector over 8-tap groups, scalar for the rest */
+        int filterlen = filterSize - 7; /* j < filterlen means a full group of 8 taps remains */
+
+        for (i = 0; i < len; i++) {
+            __m256i src0, src1, src2, src3;
+            __m256i filter0, filter1, filter2, filter3, out0, out1;
+            __m256i out = zero; /* accumulator updated by SCALE_16 */
+            const uint16_t *srcPos1 = src + filterPos[0];
+            const uint16_t *srcPos2 = src + filterPos[1];
+            const uint16_t *srcPos3 = src + filterPos[2];
+            const uint16_t *srcPos4 = src + filterPos[3];
+            const int16_t *filterStart1 = filter;
+            const int16_t *filterStart2 = filterStart1 + filterSize;
+            const int16_t *filterStart3 = filterStart2 + filterSize;
+            const int16_t *filterStart4 = filterStart3 + filterSize;
+            int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                SCALE_16
+            }
+            val1 = __lasx_xvpickve2gr_w(out, 0); /* sums sit in words 0,4,2,6 for outputs 1..4 */
+            val2 = __lasx_xvpickve2gr_w(out, 4);
+            val3 = __lasx_xvpickve2gr_w(out, 2);
+            val4 = __lasx_xvpickve2gr_w(out, 6);
+            for (; j < filterSize; j++) { /* taps left after the last full group of 8 */
+                val1 += ((int)srcPos1[j]) * filterStart1[j];
+                val2 += ((int)srcPos2[j]) * filterStart2[j];
+                val3 += ((int)srcPos3[j]) * filterStart3[j];
+                val4 += ((int)srcPos4[j]) * filterStart4[j];
+            }
+            dst[0] = FFMIN(val1 >> sh, max);
+            dst[1] = FFMIN(val2 >> sh, max);
+            dst[2] = FFMIN(val3 >> sh, max);
+            dst[3] = FFMIN(val4 >> sh, max);
+            dst       += 4;
+            filterPos += 4;
+            filter     = filterStart4 + filterSize; /* first coefficient of the next output */
+        }
+        for (i = 0; i < res; i++) { /* remaining dstW & 3 outputs, one at a time */
+            int j, val = 0;
+            const uint16_t *srcPos      = src + filterPos[i];
+            __m256i src0, filter0, out0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                int dex = j << 1; /* element index j as a byte offset */
+                DUP2_ARG2(__lasx_xvldx, srcPos, dex, filter, dex, src0, filter0);
+                out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+                out0    = __lasx_xvhaddw_d_w(out0, out0);
+                out0    = __lasx_xvhaddw_q_d(out0, out0);
+                val    += __lasx_xvpickve2gr_w(out0, 0);
+            }
+            for (; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += filterSize;
+        }
+    } else { /* filterSize below 8 and not 4: plain C loop */
+        for (i = 0; i < dstW; i++) {
+            int val = 0;
+            const uint16_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += filterSize;
+        }
+    }
+}
+
+void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+                             const uint8_t *_src, const int16_t *filter,
+                             const int32_t *filterPos, int filterSize)
+{ /* Horizontal filter: uint16_t src -> int32_t dst; each output is FFMIN(sum >> sh, max). */
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int i;
+    int32_t *dst        = (int32_t *) _dst; /* outputs are written as 32-bit values */
+    const uint16_t *src = (const uint16_t *) _src; /* source samples are read as 16-bit */
+    int sh              = desc->comp[0].depth - 5; /* default shift, adjusted below */
+    int max = (1 << 19) - 1; /* upper bound applied to every output */
+    int len = dstW >> 2; /* 4 outputs per vector iteration */
+    int res = dstW & 3;  /* leftover outputs */
+    __m256i shift;
+    __m256i zero = __lasx_xvldi(0);
+
+    if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8)
+         && desc->comp[0].depth<16) { /* RGB or PAL8 input with depth below 16 bits */
+        sh = 9;
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* descriptor has the FLOAT flag set */
+        sh = 11;
+    }
+    shift = __lasx_xvreplgr2vr_w(sh); /* vector copy of sh for xvsra_w */
+
+    if (filterSize == 8) {
+        for (i = 0; i < len; i++) {
+            SCALE_8
+        }
+        for (i = 0; i < res; i++) { /* remaining outputs; filterPos/dst were advanced by SCALE_8 */
+            int val = 0;
+            __m256i src0, filter0, out0;
+
+            DUP2_ARG2(__lasx_xvld, src + filterPos[i], 0, filter, 0,
+                      src0, filter0);
+            out0 = __lasx_xvdp2_w_hu_h(src0, filter0); /* unsigned src * signed filter */
+            out0 = __lasx_xvhaddw_d_w(out0, out0);
+            out0 = __lasx_xvhaddw_q_d(out0, out0);
+            val  = __lasx_xvpickve2gr_w(out0, 0); /* word 0 holds the reduced sum */
+            dst[i] = FFMIN(val >> sh, max);
+            filter += 8;
+        }
+    } else if (filterSize == 4) {
+        for (i = 0; i < len; i++) {
+            __m256i src1, src2, src3, src4, src0, filter0, out0;
+
+            src1 = __lasx_xvldrepl_d(src + filterPos[0], 0); /* 4 samples (8 bytes) per output */
+            src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
+            src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
+            src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
+            filter0 = __lasx_xvld(filter, 0); /* 16 coefficients = 4 outputs * 4 taps */
+            src1 = __lasx_xvextrins_d(src1, src2, 0x10);
+            src3 = __lasx_xvextrins_d(src3, src4, 0x10);
+            src0 = __lasx_xvpermi_q(src1, src3, 0x02);
+            out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+            out0 = __lasx_xvhaddw_d_w(out0, out0);
+            out0 = __lasx_xvsra_w(out0, shift);
+            dst[0] = FFMIN((__lasx_xvpickve2gr_w(out0, 0)), max); /* results are read from words 0,2,4,6 */
+            dst[1] = FFMIN((__lasx_xvpickve2gr_w(out0, 2)), max);
+            dst[2] = FFMIN((__lasx_xvpickve2gr_w(out0, 4)), max);
+            dst[3] = FFMIN((__lasx_xvpickve2gr_w(out0, 6)), max);
+            dst       += 4;
+            filterPos += 4;
+            filter    += 16;
+        }
+        for (i = 0; i < res; i++) { /* remaining outputs, plain C */
+            int val = 0;
+            const uint16_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += 4;
+        }
+    } else if (filterSize > 8) { /* long filters: vector over 8-tap groups, scalar for the rest */
+        int filterlen = filterSize - 7; /* j < filterlen means a full group of 8 taps remains */
+
+        for (i = 0; i < len; i ++) {
+            __m256i src0, src1, src2, src3;
+            __m256i filter0, filter1, filter2, filter3, out0, out1;
+            __m256i out = zero; /* accumulator updated by SCALE_16 */
+            const uint16_t *srcPos1 = src + filterPos[0];
+            const uint16_t *srcPos2 = src + filterPos[1];
+            const uint16_t *srcPos3 = src + filterPos[2];
+            const uint16_t *srcPos4 = src + filterPos[3];
+            const int16_t *filterStart1 = filter;
+            const int16_t *filterStart2 = filterStart1 + filterSize;
+            const int16_t *filterStart3 = filterStart2 + filterSize;
+            const int16_t *filterStart4 = filterStart3 + filterSize;
+            int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                SCALE_16
+            }
+            val1 = __lasx_xvpickve2gr_w(out, 0); /* sums sit in words 0,4,2,6 for outputs 1..4 */
+            val2 = __lasx_xvpickve2gr_w(out, 4);
+            val3 = __lasx_xvpickve2gr_w(out, 2);
+            val4 = __lasx_xvpickve2gr_w(out, 6);
+            for (; j < filterSize; j++) { /* taps left after the last full group of 8 */
+                val1 += ((int)srcPos1[j]) * filterStart1[j];
+                val2 += ((int)srcPos2[j]) * filterStart2[j];
+                val3 += ((int)srcPos3[j]) * filterStart3[j];
+                val4 += ((int)srcPos4[j]) * filterStart4[j];
+            }
+            dst[0] = FFMIN(val1 >> sh, max);
+            dst[1] = FFMIN(val2 >> sh, max);
+            dst[2] = FFMIN(val3 >> sh, max);
+            dst[3] = FFMIN(val4 >> sh, max);
+            dst       += 4;
+            filterPos += 4;
+            filter     = filterStart4 + filterSize; /* first coefficient of the next output */
+        }
+        for (i = 0; i < res; i++) { /* remaining dstW & 3 outputs, one at a time */
+            int j, val = 0;
+            const uint16_t *srcPos      = src + filterPos[i];
+            __m256i src0, filter0, out0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                int dex = j << 1; /* element index j as a byte offset */
+                DUP2_ARG2(__lasx_xvldx, srcPos, dex, filter, dex,
+                          src0, filter0);
+                out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+                out0    = __lasx_xvhaddw_d_w(out0, out0);
+                out0    = __lasx_xvhaddw_q_d(out0, out0);
+                val    += __lasx_xvpickve2gr_w(out0, 0);
+            }
+            for (; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += filterSize;
+        }
+    } else { /* filterSize below 8 and not 4: plain C loop */
+        for (i = 0; i < dstW; i++) {
+            int val = 0;
+            const uint16_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += filterSize;
+        }
+    }
+}
+
+#undef SCALE_8
+#undef SCALE_16
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
new file mode 100644
index 0000000000..67e24d544c
--- /dev/null
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H
+#define SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize); /* uint8_t src, int16_t output */
+
+void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize); /* uint8_t src; _dst is written as int32_t */
+
+void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+                             const uint8_t *_src, const int16_t *filter,
+                             const int32_t *filterPos, int filterSize); /* _src read as uint16_t; _dst written as int32_t */
+
+void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+                             const uint8_t *_src, const int16_t *filter,
+                             const int32_t *filterPos, int filterSize); /* _src read as uint16_t; int16_t output */
+
+void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+                           int width, int32_t *rgb2yuv); /* NOTE(review): definition not visible in this part of the patch */
+
+void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
+                          int32_t *rgb2yuv); /* NOTE(review): definition not visible in this part of the patch */
+
+#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 7b40f49da4..367d045a02 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -598,6 +598,8 @@ void ff_sws_init_scale(SwsContext *c)
     ff_sws_init_swscale_aarch64(c);
 #elif ARCH_ARM
     ff_sws_init_swscale_arm(c);
+#elif ARCH_LOONGARCH64
+    ff_sws_init_swscale_loongarch(c);
 #endif
 }
 
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 6c14ce8536..abeebbb002 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ av_cold void ff_sws_init_range_convert(SwsContext *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
+SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c);
 
 static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
 {
@@ -983,6 +984,7 @@ void ff_sws_init_swscale_vsx(SwsContext *c);
 void ff_sws_init_swscale_x86(SwsContext *c);
 void ff_sws_init_swscale_aarch64(SwsContext *c);
 void ff_sws_init_swscale_arm(SwsContext *c);
+void ff_sws_init_swscale_loongarch(SwsContext *c);
 
 void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
                        const uint8_t *src, int srcW, int xInc);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index a621a35862..f762fba1df 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -53,6 +53,7 @@
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
+#include "libavutil/loongarch/cpu.h"
 
 #include "rgb2rgb.h"
 #include "swscale.h"
@@ -654,6 +655,15 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
             filterAlign = 1;
     }
 
+    if (have_lasx(cpu_flags)) {
+        int reNum = minFilterSize & (0x07);
+
+        if (minFilterSize < 5)
+            filterAlign = 4;
+        if (reNum < 3)
+            filterAlign = 1;
+    }
+
     av_assert0(minFilterSize > 0);
     filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
     av_assert0(filterSize > 0);
@@ -1839,7 +1849,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
         {
             const int filterAlign = X86_MMX(cpu_flags)     ? 4 :
                                     PPC_ALTIVEC(cpu_flags) ? 8 :
-                                    have_neon(cpu_flags)   ? 4 : 1;
+                                    have_neon(cpu_flags)   ? 4 :
+                                    have_lasx(cpu_flags)   ? 8 : 1;
 
             if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
                            &c->hLumFilterSize, c->lumXInc,
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH v1 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c files
  2022-08-29 11:26 [FFmpeg-devel] Add loongarch SIMD optimization in swscale lib Hao Chen
  2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx Hao Chen
@ 2022-08-29 11:26 ` Hao Chen
  2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file Hao Chen
  2 siblings, 0 replies; 13+ messages in thread
From: Hao Chen @ 2022-08-29 11:26 UTC (permalink / raw)
  To: ffmpeg-devel

ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -pix_fmt rgb24 -y /dev/null -an
before: 178fps
after:  210fps

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 libswscale/loongarch/Makefile                 |   2 +
 libswscale/loongarch/rgb2rgb_lasx.c           |  52 +++
 libswscale/loongarch/swscale_init_loongarch.c |  42 +++
 libswscale/loongarch/swscale_loongarch.h      |  22 ++
 libswscale/loongarch/yuv2rgb_lasx.c           | 329 ++++++++++++++++++
 libswscale/rgb2rgb.c                          |   2 +
 libswscale/rgb2rgb.h                          |   1 +
 libswscale/yuv2rgb.c                          |   2 +
 8 files changed, 452 insertions(+)
 create mode 100644 libswscale/loongarch/rgb2rgb_lasx.c
 create mode 100644 libswscale/loongarch/yuv2rgb_lasx.c

diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index 586a1717b6..4345971514 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -1,3 +1,5 @@
 OBJS-$(CONFIG_SWSCALE)      += loongarch/swscale_init_loongarch.o
 LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
                                loongarch/input_lasx.o   \
+                               loongarch/yuv2rgb_lasx.o \
+                               loongarch/rgb2rgb_lasx.o
diff --git a/libswscale/loongarch/rgb2rgb_lasx.c b/libswscale/loongarch/rgb2rgb_lasx.c
new file mode 100644
index 0000000000..68684ca595
--- /dev/null
+++ b/libswscale/loongarch/rgb2rgb_lasx.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+/* Interleave two byte rows into dest: dest[2*i] = src1[i], dest[2*i + 1] = src2[i]. */
+void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
+                              uint8_t *dest, int width, int height,
+                              int src1Stride, int src2Stride, int dstStride)
+{
+    int h, len = width & (0xFFFFFFF0); /* width rounded down to a multiple of 16 */
+
+    for (h = 0; h < height; h++) {
+        int w, index = 0;
+        __m256i src_1, src_2, dst;
+        /* Vector part: each step turns 16 bytes of src1 and src2 into 32 output bytes. */
+        for (w = 0; w < len; w += 16) {
+            DUP2_ARG2(__lasx_xvld, src1 + w, 0, src2 + w, 0, src_1, src_2);
+            src_1 = __lasx_xvpermi_d(src_1, 0xD8); /* 64-bit elements 0,1,2,3 -> 0,2,1,3 */
+            src_2 = __lasx_xvpermi_d(src_2, 0xD8);
+            dst   = __lasx_xvilvl_b(src_2, src_1);
+            __lasx_xvst(dst, dest + index, 0);
+            index  += 32;
+        }
+        for (; w < width; w++) { /* scalar tail only: w resumes at len, nothing is redone */
+            dest[(w << 1) + 0] = src1[w];
+            dest[(w << 1) + 1] = src2[w];
+        }
+        dest += dstStride;
+        src1 += src1Stride;
+        src2 += src2Stride;
+    }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 197dc6e1e7..1e0bb1b116 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -21,6 +21,7 @@
 
 #include "swscale_loongarch.h"
 #include "libswscale/swscale_internal.h"
+#include "libswscale/rgb2rgb.h"
 #include "libavutil/loongarch/cpu.h"
 
 av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
@@ -48,3 +49,44 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
         }
     }
 }
+
+av_cold void rgb2rgb_init_loongarch(void)
+{
+    const int flags = av_get_cpu_flags(); /* queried once, tested below */
+    if (have_lasx(flags))                 /* LASX reported: install the LASX interleaver */
+        interleaveBytes = ff_interleave_bytes_lasx;
+}
+
+/* Select the LASX YUV -> packed RGB converter matching c->dstFormat.
+ * NULL is returned when LASX is not reported, when the destination format
+ * is not listed below, or, for the 32-bit formats, when CONFIG_SWSCALE_ALPHA
+ * is set and isALPHA() is true for the source format. */
+av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
+{
+    const int flags = av_get_cpu_flags();
+    if (!have_lasx(flags))
+        return NULL;
+    switch (c->dstFormat) {
+    case AV_PIX_FMT_RGB24:
+        return yuv420_rgb24_lasx;
+    case AV_PIX_FMT_BGR24:
+        return yuv420_bgr24_lasx;
+    case AV_PIX_FMT_RGBA:
+        if (!(CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)))
+            return yuv420_rgba32_lasx;
+        break;
+    case AV_PIX_FMT_ARGB:
+        if (!(CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)))
+            return yuv420_argb32_lasx;
+        break;
+    case AV_PIX_FMT_BGRA:
+        if (!(CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)))
+            return yuv420_bgra32_lasx;
+        break;
+    case AV_PIX_FMT_ABGR:
+        if (!(CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)))
+            return yuv420_abgr32_lasx;
+        break;
+    }
+    return NULL;
+}
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 67e24d544c..07a8145da2 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -47,4 +47,26 @@ void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4]
 void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
                           int32_t *rgb2yuv);
 
+int yuv420_rgb24_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgr24_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_rgba32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                       int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_bgra32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                       int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_argb32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                       int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+int yuv420_abgr32_lasx(SwsContext *c, const uint8_t *src[], int srcStride[],
+                       int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
+
+void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
+                              uint8_t *dest, int width, int height,
+                              int src1Stride, int src2Stride, int dstStride);
+
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/loongarch/yuv2rgb_lasx.c b/libswscale/loongarch/yuv2rgb_lasx.c
new file mode 100644
index 0000000000..6f160fe880
--- /dev/null
+++ b/libswscale/loongarch/yuv2rgb_lasx.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+#define YUV2RGB_LOAD_COE                                     \
+    /* Load x_offset */                                      \
+    __m256i y_offset = __lasx_xvreplgr2vr_d(c->yOffset);     \
+    __m256i u_offset = __lasx_xvreplgr2vr_d(c->uOffset);     \
+    __m256i v_offset = __lasx_xvreplgr2vr_d(c->vOffset);     \
+    /* Load x_coeff  */                                      \
+    __m256i ug_coeff = __lasx_xvreplgr2vr_d(c->ugCoeff);     \
+    __m256i vg_coeff = __lasx_xvreplgr2vr_d(c->vgCoeff);     \
+    __m256i y_coeff  = __lasx_xvreplgr2vr_d(c->yCoeff);      \
+    __m256i ub_coeff = __lasx_xvreplgr2vr_d(c->ubCoeff);     \
+    __m256i vr_coeff = __lasx_xvreplgr2vr_d(c->vrCoeff);     \
+
+#define LOAD_YUV_16                                          \
+    m_y1 = __lasx_xvld(py_1, 0);                             \
+    m_y2 = __lasx_xvld(py_2, 0);                             \
+    m_u  = __lasx_xvldrepl_d(pu, 0);                         \
+    m_v  = __lasx_xvldrepl_d(pv, 0);                         \
+    DUP2_ARG2(__lasx_xvilvl_b, m_u, m_u, m_v, m_v, m_u, m_v);\
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, m_y1, m_y2, m_u, m_v,    \
+              m_y1, m_y2, m_u, m_v);                         \
+
+/* YUV2RGB method
+ * The conversion method is as follows:
+ * R = Y' * y_coeff + V' * vr_coeff
+ * G = Y' * y_coeff + V' * vg_coeff + U' * ug_coeff
+ * B = Y' * y_coeff + U' * ub_coeff
+ *
+ * where X' = X * 8 - x_offset for X in {Y, U, V}. Each product keeps only
+ * its high 16 bits; the sums saturate and are then clipped to 0..255.
+ */
+
+#define YUV2RGB                                                            \
+    m_y1 = __lasx_xvslli_h(m_y1, 3);                                       \
+    m_y2 = __lasx_xvslli_h(m_y2, 3);                                       \
+    m_u  = __lasx_xvslli_h(m_u, 3);                                        \
+    m_v  = __lasx_xvslli_h(m_v, 3);                                        \
+    m_y1 = __lasx_xvsub_h(m_y1, y_offset);                                 \
+    m_y2 = __lasx_xvsub_h(m_y2, y_offset);                                 \
+    m_u  = __lasx_xvsub_h(m_u, u_offset);                                  \
+    m_v  = __lasx_xvsub_h(m_v, v_offset);                                  \
+    y_1  = __lasx_xvmuh_h(m_y1, y_coeff);                                  \
+    y_2  = __lasx_xvmuh_h(m_y2, y_coeff);                                  \
+    u2g  = __lasx_xvmuh_h(m_u, ug_coeff);                                  \
+    u2b  = __lasx_xvmuh_h(m_u, ub_coeff);                                  \
+    v2r  = __lasx_xvmuh_h(m_v, vr_coeff);                                  \
+    v2g  = __lasx_xvmuh_h(m_v, vg_coeff);                                  \
+    r1   = __lasx_xvsadd_h(y_1, v2r);                                      \
+    v2g  = __lasx_xvsadd_h(v2g, u2g);                                      \
+    g1   = __lasx_xvsadd_h(y_1, v2g);                                      \
+    b1   = __lasx_xvsadd_h(y_1, u2b);                                      \
+    r2   = __lasx_xvsadd_h(y_2, v2r);                                      \
+    g2   = __lasx_xvsadd_h(y_2, v2g);                                      \
+    b2   = __lasx_xvsadd_h(y_2, u2b);                                      \
+    DUP4_ARG1(__lasx_xvclip255_h, r1, g1, b1, r2, r1, g1, b1, r2);         \
+    DUP2_ARG1(__lasx_xvclip255_h, g2, b2, g2, b2);                         \
+
+#define RGB_PACK(r, g, b, rgb_l, rgb_h)                                    \
+{                                                                          \
+    __m256i rg;                                                            \
+    rg = __lasx_xvpackev_b(g, r);                                          \
+    DUP2_ARG3(__lasx_xvshuf_b, b, rg, shuf2, b, rg, shuf3, rgb_l, rgb_h);  \
+}
+
+#define RGB32_PACK(a, r, g, b, rgb_l, rgb_h)                               \
+{                                                                          \
+    __m256i ra, bg, tmp0, tmp1;                                            \
+    ra    = __lasx_xvpackev_b(r, a);                                       \
+    bg    = __lasx_xvpackev_b(b, g);                                       \
+    tmp0  = __lasx_xvilvl_h(bg, ra);                                       \
+    tmp1  = __lasx_xvilvh_h(bg, ra);                                       \
+    rgb_l = __lasx_xvpermi_q(tmp1, tmp0, 0x20);                            \
+    rgb_h = __lasx_xvpermi_q(tmp1, tmp0, 0x31);                            \
+}
+
+#define RGB_STORE(rgb_l, rgb_h, image)  /* write 48 bytes starting at image */ \
+{                                                                            \
+    __lasx_xvstelm_d(rgb_l, image, 0,  0);  /* rgb_l elem 0 -> bytes  0..7  */ \
+    __lasx_xvstelm_d(rgb_l, image, 8,  1);  /* rgb_l elem 1 -> bytes  8..15 */ \
+    __lasx_xvstelm_d(rgb_h, image, 16, 0);  /* rgb_h elem 0 -> bytes 16..23 */ \
+    __lasx_xvstelm_d(rgb_l, image, 24, 2);  /* rgb_l elem 2 -> bytes 24..31 */ \
+    __lasx_xvstelm_d(rgb_l, image, 32, 3);  /* rgb_l elem 3 -> bytes 32..39 */ \
+    __lasx_xvstelm_d(rgb_h, image, 40, 2);  /* rgb_h elem 2 -> bytes 40..47 */ \
+}
+
+#define RGB32_STORE(rgb_l, rgb_h, image)                                     \
+{                                                                            \
+    __lasx_xvst(rgb_l, image, 0);                                            \
+    __lasx_xvst(rgb_h, image, 32);                                           \
+}
+
+#define YUV2RGBFUNC(func_name, dst_type, alpha)                                     \
+           int func_name(SwsContext *c, const uint8_t *src[],                       \
+                         int srcStride[], int srcSliceY, int srcSliceH,             \
+                         uint8_t *dst[], int dstStride[])                           \
+{                                                                                   \
+    int x, y, h_size, vshift, res;                                                  \
+    __m256i m_y1, m_y2, m_u, m_v;                                                   \
+    __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                           \
+    __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                                 \
+    __m256i shuf2 = {0x0504120302100100, 0x0A18090816070614,                        \
+                     0x0504120302100100, 0x0A18090816070614};                       \
+    __m256i shuf3 = {0x1E0F0E1C0D0C1A0B, 0x0101010101010101,                        \
+                     0x1E0F0E1C0D0C1A0B, 0x0101010101010101};                       \
+    YUV2RGB_LOAD_COE                                                                \
+                                                                                    \
+    h_size = c->dstW >> 4;                                                          \
+    res = (c->dstW & 15) >> 1;                                                      \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                                    \
+    for (y = 0; y < srcSliceH; y += 2) {                                            \
+        dst_type av_unused *r, *g, *b;                                              \
+        dst_type *image1    = (dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]);\
+        dst_type *image2    = (dst_type *)(image1 +                   dstStride[0]);\
+        const uint8_t *py_1 = src[0] +               y * srcStride[0];              \
+        const uint8_t *py_2 = py_1   +                   srcStride[0];              \
+        const uint8_t *pu   = src[1] +   (y >> vshift) * srcStride[1];              \
+        const uint8_t *pv   = src[2] +   (y >> vshift) * srcStride[2];              \
+        for(x = 0; x < h_size; x++) {                                               \
+
+#define YUV2RGBFUNC32(func_name, dst_type, alpha)                                   \
+           int func_name(SwsContext *c, const uint8_t *src[],                       \
+                         int srcStride[], int srcSliceY, int srcSliceH,             \
+                         uint8_t *dst[], int dstStride[])                           \
+{                                                                                   \
+    int x, y, h_size, vshift, res;                                                  \
+    __m256i m_y1, m_y2, m_u, m_v;                                                   \
+    __m256i y_1, y_2, u2g, v2g, u2b, v2r, rgb1_l, rgb1_h;                           \
+    __m256i rgb2_l, rgb2_h, r1, g1, b1, r2, g2, b2;                                 \
+    __m256i a = __lasx_xvldi(0xFF);                                                 \
+                                                                                    \
+    YUV2RGB_LOAD_COE                                                                \
+                                                                                    \
+    h_size = c->dstW >> 4;                                                          \
+    res = (c->dstW & 15) >> 1;                                                      \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                                    \
+    for (y = 0; y < srcSliceH; y += 2) {                                            \
+        int yd = y + srcSliceY;                                                     \
+        dst_type av_unused *r, *g, *b;                                              \
+        dst_type *image1    = (dst_type *)(dst[0] + (yd)     * dstStride[0]);       \
+        dst_type *image2    = (dst_type *)(dst[0] + (yd + 1) * dstStride[0]);       \
+        const uint8_t *py_1 = src[0] +               y * srcStride[0];              \
+        const uint8_t *py_2 = py_1   +                   srcStride[0];              \
+        const uint8_t *pu   = src[1] +   (y >> vshift) * srcStride[1];              \
+        const uint8_t *pv   = src[2] +   (y >> vshift) * srcStride[2];              \
+        for(x = 0; x < h_size; x++) {                                               \
+
+#define DEALYUV2RGBREMAIN                                                           \
+            py_1 += 16;                                                             \
+            py_2 += 16;                                                             \
+            pu += 8;                                                                \
+            pv += 8;                                                                \
+            image1 += 48;                                                           \
+            image2 += 48;                                                           \
+        }                                                                           \
+        for (x = 0; x < res; x++) {                                                 \
+            int av_unused U, V, Y;                                                  \
+            U = pu[0];                                                              \
+            V = pv[0];                                                              \
+            r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                       \
+            g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM]                       \
+                       + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);                     \
+            b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];                       \
+
+#define DEALYUV2RGBREMAIN32                                                         \
+            py_1 += 16;                                                             \
+            py_2 += 16;                                                             \
+            pu += 8;                                                                \
+            pv += 8;                                                                \
+            image1 += 16;                                                           \
+            image2 += 16;                                                           \
+        }                                                                           \
+        for (x = 0; x < res; x++) {                                                 \
+            int av_unused U, V, Y;                                                  \
+            U = pu[0];                                                              \
+            V = pv[0];                                                              \
+            r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                       \
+            g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM]                       \
+                       + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);                     \
+            b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];                       \
+
+
+#define PUTRGB24(dst, src)                  \
+    Y      = src[0];                        \
+    dst[0] = r[Y];                          \
+    dst[1] = g[Y];                          \
+    dst[2] = b[Y];                          \
+    Y      = src[1];                        \
+    dst[3] = r[Y];                          \
+    dst[4] = g[Y];                          \
+    dst[5] = b[Y];
+
+#define PUTBGR24(dst, src)                  \
+    Y      = src[0];                        \
+    dst[0] = b[Y];                          \
+    dst[1] = g[Y];                          \
+    dst[2] = r[Y];                          \
+    Y      = src[1];                        \
+    dst[3] = b[Y];                          \
+    dst[4] = g[Y];                          \
+    dst[5] = r[Y];
+
+#define PUTRGB(dst, src)                    \
+    Y      = src[0];                        \
+    dst[0] = r[Y] + g[Y] + b[Y];            \
+    Y      = src[1];                        \
+    dst[1] = r[Y] + g[Y] + b[Y];            \
+
+#define ENDRES                              \
+    pu += 1;                                \
+    pv += 1;                                \
+    py_1 += 2;                              \
+    py_2 += 2;                              \
+    image1 += 6;                            \
+    image2 += 6;                            \
+
+#define ENDRES32                            \
+    pu += 1;                                \
+    pv += 1;                                \
+    py_1 += 2;                              \
+    py_2 += 2;                              \
+    image1 += 2;                            \
+    image2 += 2;                            \
+
+#define END_FUNC()                          \
+        }                                   \
+    }                                       \
+    return srcSliceH;                       \
+}
+
+YUV2RGBFUNC(yuv420_rgb24_lasx, uint8_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB_PACK(r1, g1, b1, rgb1_l, rgb1_h);
+    RGB_PACK(r2, g2, b2, rgb2_l, rgb2_h);
+    RGB_STORE(rgb1_l, rgb1_h, image1);
+    RGB_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN
+    PUTRGB24(image1, py_1);
+    PUTRGB24(image2, py_2);
+    ENDRES
+    END_FUNC()
+
+YUV2RGBFUNC(yuv420_bgr24_lasx, uint8_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB_PACK(b1, g1, r1, rgb1_l, rgb1_h);
+    RGB_PACK(b2, g2, r2, rgb2_l, rgb2_h);
+    RGB_STORE(rgb1_l, rgb1_h, image1);
+    RGB_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN
+    PUTBGR24(image1, py_1);
+    PUTBGR24(image2, py_2);
+    ENDRES
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_rgba32_lasx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB32_PACK(r1, g1, b1, a, rgb1_l, rgb1_h);
+    RGB32_PACK(r2, g2, b2, a, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN32
+    PUTRGB(image1, py_1);
+    PUTRGB(image2, py_2);
+    ENDRES32
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_bgra32_lasx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB32_PACK(b1, g1, r1, a, rgb1_l, rgb1_h);
+    RGB32_PACK(b2, g2, r2, a, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN32
+    PUTRGB(image1, py_1);
+    PUTRGB(image2, py_2);
+    ENDRES32
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_argb32_lasx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB32_PACK(a, r1, g1, b1, rgb1_l, rgb1_h);
+    RGB32_PACK(a, r2, g2, b2, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN32
+    PUTRGB(image1, py_1);
+    PUTRGB(image2, py_2);
+    ENDRES32
+    END_FUNC()
+
+YUV2RGBFUNC32(yuv420_abgr32_lasx, uint32_t, 0)
+    LOAD_YUV_16
+    YUV2RGB
+    RGB32_PACK(a, b1, g1, r1, rgb1_l, rgb1_h);
+    RGB32_PACK(a, b2, g2, r2, rgb2_l, rgb2_h);
+    RGB32_STORE(rgb1_l, rgb1_h, image1);
+    RGB32_STORE(rgb2_l, rgb2_h, image2);
+    DEALYUV2RGBREMAIN32
+    PUTRGB(image1, py_1);
+    PUTRGB(image2, py_2);
+    ENDRES32
+    END_FUNC()
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 4f1ac9c465..3af775b389 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -141,6 +141,8 @@ av_cold void ff_sws_rgb2rgb_init(void)
     rgb2rgb_init_aarch64();
 #elif ARCH_X86
     rgb2rgb_init_x86();
+#elif ARCH_LOONGARCH64
+    rgb2rgb_init_loongarch();
 #endif
 }
 
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 7272e98c57..db85bfc42f 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -168,5 +168,6 @@ void ff_sws_rgb2rgb_init(void);
 
 void rgb2rgb_init_aarch64(void);
 void rgb2rgb_init_x86(void);
+void rgb2rgb_init_loongarch(void);
 
 #endif /* SWSCALE_RGB2RGB_H */
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 6ee483d12a..9c3f5e23c6 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -683,6 +683,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
     t = ff_yuv2rgb_init_ppc(c);
 #elif ARCH_X86
     t = ff_yuv2rgb_init_x86(c);
+#elif ARCH_LOONGARCH64
+    t = ff_yuv2rgb_init_loongarch(c);
 #endif
 
     if (t)
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.
  2022-08-29 11:26 [FFmpeg-devel] Add loongarch SIMD optimization in swscale lib Hao Chen
  2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx Hao Chen
  2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c files Hao Chen
@ 2022-08-29 11:26 ` Hao Chen
  2022-08-29 12:30   ` Andreas Rheinhardt
  2 siblings, 1 reply; 13+ messages in thread
From: Hao Chen @ 2022-08-29 11:26 UTC (permalink / raw)
  To: ffmpeg-devel

ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
rgb24 -y /dev/null -an
before: 150fps
after:  183fps

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 libswscale/loongarch/Makefile                 |    3 +-
 libswscale/loongarch/output_lasx.c            | 1982 +++++++++++++++++
 libswscale/loongarch/swscale_init_loongarch.c |    3 +
 libswscale/loongarch/swscale_loongarch.h      |    6 +
 4 files changed, 1993 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/loongarch/output_lasx.c

diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index 4345971514..54d48b3de0 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -2,4 +2,5 @@ OBJS-$(CONFIG_SWSCALE)      += loongarch/swscale_init_loongarch.o
 LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
                                loongarch/input_lasx.o   \
                                loongarch/yuv2rgb_lasx.o \
-                               loongarch/rgb2rgb_lasx.o
+                               loongarch/rgb2rgb_lasx.o \
+                               loongarch/output_lasx.o
diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
new file mode 100644
index 0000000000..19f82692ff
--- /dev/null
+++ b/libswscale/loongarch/output_lasx.c
@@ -0,0 +1,1982 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+                          const int16_t **src, uint8_t *dest, int dstW,
+                          const uint8_t *dither, int offset)
+{ /* Vertical filter to 8 bit: dest[i] = clip_uint8(((dither[(i + offset) & 7] << 12) + sum over j of src[j][i] * filter[j]) >> 19) */
+    int i;
+    int len = dstW - 15; /* the 16-pixel loop runs only while a full group of 16 remains */
+    __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400,
+                    0x1C0C180814041000, 0x1C1814100C080400}; /* byte indices for __lasx_xvshuf_b: pick the low byte of each 32-bit result */
+    __m256i val1, val2, val3;
+    uint8_t dither0 = dither[offset & 7]; /* the eight dither values, starting at phase offset */
+    uint8_t dither1 = dither[(offset + 1) & 7];
+    uint8_t dither2 = dither[(offset + 2) & 7];
+    uint8_t dither3 = dither[(offset + 3) & 7];
+    uint8_t dither4 = dither[(offset + 4) & 7];
+    uint8_t dither5 = dither[(offset + 5) & 7];
+    uint8_t dither6 = dither[(offset + 6) & 7];
+    uint8_t dither7 = dither[(offset + 7) & 7];
+    int val_1[8] = {dither0, dither2, dither4, dither6,
+                    dither0, dither2, dither4, dither6}; /* dither for the even pixels of a 16-pixel group */
+    int val_2[8] = {dither1, dither3, dither5, dither7,
+                    dither1, dither3, dither5, dither7}; /* dither for the odd pixels of a 16-pixel group */
+    int val_3[8] = {dither0, dither1, dither2, dither3,
+                    dither4, dither5, dither6, dither7}; /* dither for 8 consecutive pixels (8-pixel path) */
+
+    DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
+    val3 = __lasx_xvld(val_3, 0);
+
+    for (i = 0; i < len; i += 16) { /* 16 output pixels per iteration */
+        int j;
+        __m256i src0, filter0, val;
+        __m256i val_ev, val_od;
+
+        val_ev = __lasx_xvslli_w(val1, 12); /* accumulators start at dither << 12, as in the scalar tail */
+        val_od = __lasx_xvslli_w(val2, 12);
+
+        for (j = 0; j < filterSize; j++) {
+            src0  = __lasx_xvld(src[j]+ i, 0);
+            filter0 = __lasx_xvldrepl_h((filter + j), 0); /* coefficient j replicated to every 16-bit lane */
+            val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0); /* even-indexed samples * coefficient, accumulated in 32 bit */
+            val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0); /* odd-indexed samples * coefficient, accumulated in 32 bit */
+        }
+        val_ev = __lasx_xvsrai_w(val_ev, 19); /* >> 19, then clamp to 0..255 */
+        val_od = __lasx_xvsrai_w(val_od, 19);
+        val_ev = __lasx_xvclip255_w(val_ev);
+        val_od = __lasx_xvclip255_w(val_od);
+        val    = __lasx_xvshuf_b(val_od, val_ev, mask); /* re-interleave even/odd results as bytes */
+        __lasx_xvstelm_d(val, (dest + i), 0, 0); /* pixels i .. i+7 */
+        __lasx_xvstelm_d(val, (dest + i), 8, 2); /* pixels i+8 .. i+15 */
+    }
+    if (dstW - i >= 8){ /* at most one further group of 8 pixels */
+        int j;
+        __m256i src0, filter0, val_h;
+        __m256i val_l;
+
+        val_l = __lasx_xvslli_w(val3, 12);
+
+        for (j = 0; j < filterSize; j++) {
+            src0  = __lasx_xvld(src[j] + i, 0); /* NOTE(review): 256-bit load although only 8 samples are used; assumes the row is readable that far - confirm */
+            src0  = __lasx_vext2xv_w_h(src0); /* widen the low 8 samples to 32 bit */
+            filter0 = __lasx_xvldrepl_h((filter + j), 0);
+            filter0 = __lasx_vext2xv_w_h(filter0);
+            val_l = __lasx_xvmadd_w(val_l, src0, filter0);
+        }
+        val_l = __lasx_xvsrai_w(val_l, 19);
+        val_l = __lasx_xvclip255_w(val_l);
+        val_h = __lasx_xvpermi_d(val_l, 0x4E); /* copy of the results with the two 128-bit halves swapped */
+        val_l = __lasx_xvshuf_b(val_h, val_l, mask);
+        __lasx_xvstelm_d(val_l, (dest + i), 0, 1); /* the 8 packed bytes end up in 64-bit element 1 */
+        i += 8;
+    }
+    for (; i < dstW; i++) { /* scalar tail for whatever is left */
+        int val = dither[(i + offset) & 7] << 12;
+        int j;
+        for (j = 0; j< filterSize; j++)
+            val += src[j][i] * filter[j];
+
+        dest[i] = av_clip_uint8(val >> 19);
+    }
+}
+
+/* Copied from libswscale/output.c. */
+static av_always_inline void
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2, /* writes the two pixels of pair i; Y1 and Y2 index the tables */
+              unsigned A1, unsigned A2, /* A1 and A2 are not read in this copy */
+              const void *_r, const void *_g, const void *_b, int y, /* _r/_g/_b: lookup tables, cast per target below; y selects the dither row */
+              enum AVPixelFormat target, int hasAlpha) /* hasAlpha is not read in this copy */
+{
+    if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
+        target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) { /* 32-bit packed: one uint32_t per pixel */
+        uint32_t *dest = (uint32_t *) _dest;
+        const uint32_t *r = (const uint32_t *) _r;
+        const uint32_t *g = (const uint32_t *) _g;
+        const uint32_t *b = (const uint32_t *) _b;
+
+#if CONFIG_SMALL
+        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#else
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
+        int sh = (target == AV_PIX_FMT_RGB32_1 ||
+                  target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
+        av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF); /* debug check: the byte at shift sh of the summed entries must be 0xFF */
+#endif
+        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
+        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
+#endif
+    } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) { /* 24-bit packed: three bytes per pixel */
+        uint8_t *dest = (uint8_t *) _dest;
+        const uint8_t *r = (const uint8_t *) _r;
+        const uint8_t *g = (const uint8_t *) _g;
+        const uint8_t *b = (const uint8_t *) _b;
+
+#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b) /* table for the first byte of each pixel */
+#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r) /* table for the third byte of each pixel */
+
+        dest[i * 6 + 0] = r_b[Y1];
+        dest[i * 6 + 1] =   g[Y1];
+        dest[i * 6 + 2] = b_r[Y1];
+        dest[i * 6 + 3] = r_b[Y2];
+        dest[i * 6 + 4] =   g[Y2];
+        dest[i * 6 + 5] = b_r[Y2];
+#undef r_b
+#undef b_r
+    } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
+               target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
+               target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) { /* 16-bit packed: dither offsets are added to the table index */
+        uint16_t *dest = (uint16_t *) _dest;
+        const uint16_t *r = (const uint16_t *) _r;
+        const uint16_t *g = (const uint16_t *) _g;
+        const uint16_t *b = (const uint16_t *) _b;
+        int dr1, dg1, db1, dr2, dg2, db2; /* per-component dither for pixel 1 and pixel 2 */
+
+        if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
+            dr1 = ff_dither_2x2_8[ y & 1     ][0];
+            dg1 = ff_dither_2x2_4[ y & 1     ][0];
+            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+            dr2 = ff_dither_2x2_8[ y & 1     ][1];
+            dg2 = ff_dither_2x2_4[ y & 1     ][1];
+            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+    } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
+            dr1 = ff_dither_2x2_8[ y & 1     ][0];
+            dg1 = ff_dither_2x2_8[ y & 1     ][1];
+            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
+            dr2 = ff_dither_2x2_8[ y & 1     ][1];
+            dg2 = ff_dither_2x2_8[ y & 1     ][0];
+            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
+        } else { /* RGB444 / BGR444 */
+            dr1 = ff_dither_4x4_16[ y & 3     ][0];
+            dg1 = ff_dither_4x4_16[ y & 3     ][1];
+            db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
+            dr2 = ff_dither_4x4_16[ y & 3     ][1];
+            dg2 = ff_dither_4x4_16[ y & 3     ][0];
+            db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
+        }
+
+        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+    } else /* 8/4 bits */ {
+        uint8_t *dest = (uint8_t *) _dest;
+        const uint8_t *r = (const uint8_t *) _r;
+        const uint8_t *g = (const uint8_t *) _g;
+        const uint8_t *b = (const uint8_t *) _b;
+        int dr1, dg1, db1, dr2, dg2, db2;
+
+        if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) { /* dither depends on the row (y & 7) and the pixel column */
+            const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
+            const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
+            dr1 = dg1 = d32[(i * 2 + 0) & 7];
+            db1 =       d64[(i * 2 + 0) & 7];
+            dr2 = dg2 = d32[(i * 2 + 1) & 7];
+            db2 =       d64[(i * 2 + 1) & 7];
+        } else {
+            const uint8_t * const d64  = ff_dither_8x8_73 [y & 7];
+            const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
+            dr1 = db1 = d128[(i * 2 + 0) & 7];
+            dg1 =        d64[(i * 2 + 0) & 7];
+            dr2 = db2 = d128[(i * 2 + 1) & 7];
+            dg2 =        d64[(i * 2 + 1) & 7];
+        }
+
+        if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) { /* both pixels share one byte; the second goes in the high nibble */
+            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
+                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
+        } else {
+            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
+            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
+        }
+    }
+}
+
+#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4)    \
+{                                                                      \
+    Y1 = __lasx_xvpickve2gr_w(vec_y1, t1);                             \
+    Y2 = __lasx_xvpickve2gr_w(vec_y2, t2);                             \
+    U  = __lasx_xvpickve2gr_w(vec_u, t3);                              \
+    V  = __lasx_xvpickve2gr_w(vec_v, t4);                              \
+    r  =  c->table_rV[V];                                              \
+    g  = (c->table_gU[U] + c->table_gV[V]);                            \
+    b  =  c->table_bU[U];                                              \
+    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                           \
+                  r, g, b, y, target, 0);                              \
+    count++;                                                           \
+} /* Extracts 32-bit lanes t1/t2 of vec_y1/vec_y2 and t3/t4 of vec_u/vec_v, looks up the r/g/b tables in c with U and V used as-is (callers add YUVRGB_TABLE_HEADROOM beforehand), writes pixel pair count and increments count. Expects Y1, Y2, U, V, r, g, b, c, dest, count, y and target in the expanding scope. */
+
+static void
+yuv2rgb_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
+                        const int16_t **lumSrc, int lumFilterSize,
+                        const int16_t *chrFilter, const int16_t **chrUSrc,
+                        const int16_t **chrVSrc, int chrFilterSize,
+                        const int16_t **alpSrc, uint8_t *dest, int dstW,
+                        int y, enum AVPixelFormat target, int hasAlpha)
+{ /* Multi-tap vertical filter of luma and chroma, then table-based RGB output with one chroma sample per pixel pair. alpSrc and hasAlpha are not read here. */
+    int i, j;
+    int count = 0; /* index of the next pixel pair, which is also the chroma index */
+    int t     = 1 << 18; /* start value of every accumulator, same as in the scalar tail */
+    int len   = dstW >> 6; /* number of full 64-pixel groups */
+    int res   = dstW & 63; /* pixels left after the 64-pixel groups */
+    int len_count = (dstW + 1) >> 1; /* total number of pixel pairs, an odd width is rounded up */
+    const void *r, *g, *b;
+    int head = YUVRGB_TABLE_HEADROOM;
+    __m256i headroom  = __lasx_xvreplgr2vr_w(head);
+
+    for (i = 0; i < len; i++) { /* 64 luma and 32 chroma samples per iteration */
+        int Y1, Y2, U, V, count_lum = count << 1; /* count_lum: luma index of the first pixel of this group */
+        __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
+        __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
+        __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;
+
+        yl1_ev = __lasx_xvldrepl_w(&t, 0);
+        yl1_od = yl1_ev;
+        yh1_ev = yl1_ev;
+        yh1_od = yl1_ev;
+        u1_ev  = yl1_ev;
+        v1_ev  = yl1_ev;
+        u1_od  = yl1_ev;
+        v1_od  = yl1_ev;
+        yl2_ev = yl1_ev;
+        yl2_od = yl1_ev;
+        yh2_ev = yl1_ev;
+        yh2_od = yl1_ev;
+        u2_ev  = yl1_ev;
+        v2_ev  = yl1_ev;
+        u2_od  = yl1_ev;
+        v2_od  = yl1_ev;
+        for (j = 0; j < lumFilterSize; j++) {
+            const int16_t *src_lum = lumSrc[j] + count_lum; /* const: lumSrc rows are const int16_t, the qualifier must not be dropped */
+            temp    = __lasx_xvldrepl_h((lumFilter + j), 0); /* coefficient j replicated to every 16-bit lane */
+            DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
+                      src_lum, 96, l_src1, l_src2, l_src3, l_src4); /* four consecutive 32-byte vectors = 64 luma samples */
+
+            yl1_ev  = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1); /* even-indexed samples */
+            yl1_od  = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1); /* odd-indexed samples */
+            yh1_ev  = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
+            yh1_od  = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
+            yl2_ev  = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
+            yl2_od  = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
+            yh2_ev  = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
+            yh2_od  = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
+                      u_src1, u_src2);
+            DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
+                      v_src1, v_src2);
+            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
+            u1_ev  = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
+            u1_od  = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
+            v1_ev  = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
+            v1_od  = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
+            u2_ev  = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
+            u2_od  = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
+            v2_ev  = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
+            v2_od  = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);
+        }
+        yl1_ev = __lasx_xvsrai_w(yl1_ev, 19); /* scale every sum down by 19 bits, as the scalar tail does */
+        yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
+        yl1_od = __lasx_xvsrai_w(yl1_od, 19);
+        yh1_od = __lasx_xvsrai_w(yh1_od, 19);
+        u1_ev  = __lasx_xvsrai_w(u1_ev, 19);
+        v1_ev  = __lasx_xvsrai_w(v1_ev, 19);
+        u1_od  = __lasx_xvsrai_w(u1_od, 19);
+        v1_od  = __lasx_xvsrai_w(v1_od, 19);
+        yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
+        yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
+        yl2_od = __lasx_xvsrai_w(yl2_od, 19);
+        yh2_od = __lasx_xvsrai_w(yh2_od, 19);
+        u2_ev  = __lasx_xvsrai_w(u2_ev, 19);
+        v2_ev  = __lasx_xvsrai_w(v2_ev, 19);
+        u2_od  = __lasx_xvsrai_w(u2_od, 19);
+        v2_od  = __lasx_xvsrai_w(v2_od, 19);
+        u1_ev  = __lasx_xvadd_w(u1_ev, headroom); /* add the table headroom once here; WRITE_YUV2RGB indexes the tables with U and V as-is */
+        v1_ev  = __lasx_xvadd_w(v1_ev, headroom);
+        u1_od  = __lasx_xvadd_w(u1_od, headroom);
+        v1_od  = __lasx_xvadd_w(v1_od, headroom);
+        u2_ev  = __lasx_xvadd_w(u2_ev, headroom);
+        v2_ev  = __lasx_xvadd_w(v2_ev, headroom);
+        u2_od  = __lasx_xvadd_w(u2_od, headroom);
+        v2_od  = __lasx_xvadd_w(v2_od, headroom);
+        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0); /* pair k: Y1/Y2 from lane k of the even/odd luma sums; chroma alternates between the even and odd sums */
+        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0);
+        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1);
+        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1);
+        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2);
+        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2);
+        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3);
+        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3);
+        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4);
+        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4);
+        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5);
+        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5);
+        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6);
+        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6);
+        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7);
+        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7);
+        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0);
+        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0);
+        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1);
+        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1);
+        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2);
+        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2);
+        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3);
+        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3);
+        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4);
+        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4);
+        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5);
+        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5);
+        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6);
+        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6);
+        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7);
+        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7);
+    }
+    if (res >= 32) { /* at most one further group of 32 pixels (16 chroma samples) */
+        int Y1, Y2, U, V, count_lum = count << 1;
+        __m256i l_src1, l_src2, u_src, v_src;
+        __m256i yl_ev, yl_od, yh_ev, yh_od;
+        __m256i u_ev, u_od, v_ev, v_od, temp;
+
+        yl_ev = __lasx_xvldrepl_w(&t, 0);
+        yl_od = yl_ev;
+        yh_ev = yl_ev;
+        yh_od = yl_ev;
+        u_ev  = yl_ev;
+        v_ev  = yl_ev;
+        u_od  = yl_ev;
+        v_od  = yl_ev;
+        for (j = 0; j < lumFilterSize; j++) {
+            temp   = __lasx_xvldrepl_h((lumFilter + j), 0);
+            DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
+                      32, l_src1, l_src2);
+            yl_ev  = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
+            yl_od  = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
+            yh_ev  = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
+            yh_od  = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
+                      u_src, v_src);
+            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
+            u_ev  = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
+            u_od  = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
+            v_ev  = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
+            v_od  = __lasx_xvmaddwod_w_h(v_od, temp, v_src);
+        }
+        yl_ev = __lasx_xvsrai_w(yl_ev, 19);
+        yh_ev = __lasx_xvsrai_w(yh_ev, 19);
+        yl_od = __lasx_xvsrai_w(yl_od, 19);
+        yh_od = __lasx_xvsrai_w(yh_od, 19);
+        u_ev  = __lasx_xvsrai_w(u_ev, 19);
+        v_ev  = __lasx_xvsrai_w(v_ev, 19);
+        u_od  = __lasx_xvsrai_w(u_od, 19);
+        v_od  = __lasx_xvsrai_w(v_od, 19);
+        u_ev  = __lasx_xvadd_w(u_ev, headroom);
+        v_ev  = __lasx_xvadd_w(v_ev, headroom);
+        u_od  = __lasx_xvadd_w(u_od, headroom);
+        v_od  = __lasx_xvadd_w(v_od, headroom);
+        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
+        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
+        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
+        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
+        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2);
+        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2);
+        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3);
+        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3);
+        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4);
+        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4);
+        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5);
+        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5);
+        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6);
+        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6);
+        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7);
+        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7);
+        res -= 32;
+    }
+    if (res >= 16) { /* at most one further group of 16 pixels (8 chroma samples) */
+        int Y1, Y2, U, V;
+        int count_lum = count << 1;
+        __m256i l_src, u_src, v_src;
+        __m256i y_ev, y_od, u, v, temp;
+
+        y_ev = __lasx_xvldrepl_w(&t, 0);
+        y_od = y_ev;
+        u    = y_ev;
+        v    = y_ev;
+        for (j = 0; j < lumFilterSize; j++) {
+            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
+            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
+            y_ev  = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
+            y_od  = __lasx_xvmaddwod_w_h(y_od, temp, l_src);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
+                      0, u_src, v_src); /* NOTE(review): 256-bit loads although only 8 chroma samples are used - confirm the rows are readable that far */
+            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
+            u_src = __lasx_vext2xv_w_h(u_src); /* widen to 32 bit so that lane k holds chroma sample k */
+            v_src = __lasx_vext2xv_w_h(v_src);
+            u     = __lasx_xvmaddwev_w_h(u, temp, u_src); /* the even halfword of each widened lane is the sample itself */
+            v     = __lasx_xvmaddwev_w_h(v, temp, v_src);
+        }
+        y_ev = __lasx_xvsrai_w(y_ev, 19);
+        y_od = __lasx_xvsrai_w(y_od, 19);
+        u    = __lasx_xvsrai_w(u, 19);
+        v    = __lasx_xvsrai_w(v, 19);
+        u    = __lasx_xvadd_w(u, headroom);
+        v    = __lasx_xvadd_w(v, headroom);
+        WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0);
+        WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1);
+        WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2);
+        WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3);
+        WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4);
+        WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5);
+        WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6);
+        WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7);
+        res -= 16;
+    }
+    if (res >= 8) { /* at most one further group of 8 pixels (4 chroma samples) */
+        int Y1, Y2, U, V;
+        int count_lum = count << 1;
+        __m256i l_src, u_src, v_src;
+        __m256i y_ev, uv, temp;
+
+        y_ev = __lasx_xvldrepl_w(&t, 0);
+        uv   = y_ev;
+        for (j = 0; j < lumFilterSize; j++) {
+            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
+            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0); /* NOTE(review): 256-bit load although only 8 luma samples are used - confirm the row is readable that far */
+            l_src = __lasx_vext2xv_w_h(l_src); /* lane k now holds luma sample k */
+            y_ev  = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0); /* four U samples */
+            v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0); /* four V samples */
+            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
+            u_src = __lasx_xvilvl_d(v_src, u_src); /* U and V share one vector; after widening U is read from lanes 0-3 and V from lanes 4-7 */
+            u_src = __lasx_vext2xv_w_h(u_src);
+            uv    = __lasx_xvmaddwev_w_h(uv, temp, u_src);
+        }
+        y_ev = __lasx_xvsrai_w(y_ev, 19);
+        uv   = __lasx_xvsrai_w(uv, 19);
+        uv   = __lasx_xvadd_w(uv, headroom);
+        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4);
+        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5);
+        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6);
+        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7);
+    }
+    for (; count < len_count; count++) { /* scalar tail for the remaining pixel pairs */
+        int Y1 = 1 << 18;
+        int Y2 = Y1;
+        int U  = Y1;
+        int V  = Y1;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y1 += lumSrc[j][count * 2]     * lumFilter[j];
+            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            U += chrUSrc[j][count] * chrFilter[j];
+            V += chrVSrc[j][count] * chrFilter[j];
+        }
+        Y1 >>= 19;
+        Y2 >>= 19;
+        U  >>= 19;
+        V  >>= 19;
+        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+             c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+                      r, g, b, y, target, 0);
+    }
+}
+
+static void
+yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
+                        const int16_t *ubuf[2], const int16_t *vbuf[2],
+                        const int16_t *abuf[2], uint8_t *dest, int dstW,
+                        int yalpha, int uvalpha, int y,
+                        enum AVPixelFormat target, int hasAlpha)
+{ /* Blends two source rows with weights (4096 - alpha) and alpha, then writes RGB via the lookup tables. abuf and hasAlpha are not read here. */
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+    int yalpha1   = 4096 - yalpha; /* weight of row 0; row 1 is weighted by yalpha */
+    int uvalpha1  = 4096 - uvalpha;
+    int i, count  = 0; /* i counts pixels, count counts pixel pairs (= chroma samples) */
+    int len       = dstW - 15; /* the 16-pixel loop runs only while a full group of 16 remains */
+    int len_count = (dstW + 1) >> 1; /* total number of pixel pairs, an odd width is rounded up */
+    const void *r, *g, *b;
+    int head  = YUVRGB_TABLE_HEADROOM;
+    __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
+    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
+    __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
+    __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
+    __m256i headroom   = __lasx_xvreplgr2vr_w(head);
+
+    for (i = 0; i < len; i += 16) { /* 16 pixels and 8 chroma samples per iteration */
+        int Y1, Y2, U, V;
+        int i_dex = i << 1; /* byte offset of luma sample i (int16_t elements) */
+        int c_dex = count << 1; /* byte offset of chroma sample count */
+        __m256i y0_h, y0_l, y0, u0, v0;
+        __m256i y1_h, y1_l, y1, u1, v1;
+        __m256i y_l, y_h, u, v;
+
+        DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+                  buf1, i_dex, y0, u0, v0, y1);
+        DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
+        DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l); /* widen the low four samples of each 128-bit half */
+        DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h); /* widen the high four samples of each 128-bit half */
+        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1); /* lane k now holds chroma sample k */
+        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
+        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
+        u0   = __lasx_xvmul_w(u0, v_uvalpha1);
+        v0   = __lasx_xvmul_w(v0, v_uvalpha1);
+        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l); /* row0 * (4096 - alpha) + row1 * alpha */
+        y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
+        u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
+        v    = __lasx_xvmadd_w(v0, v_uvalpha, v1);
+        y_l  = __lasx_xvsrai_w(y_l, 19); /* same >> 19 as the scalar tail */
+        y_h  = __lasx_xvsrai_w(y_h, 19);
+        u    = __lasx_xvsrai_w(u, 19);
+        v    = __lasx_xvsrai_w(v, 19);
+        u    = __lasx_xvadd_w(u, headroom); /* add the table headroom once; WRITE_YUV2RGB indexes the tables with U and V as-is */
+        v    = __lasx_xvadd_w(v, headroom);
+        WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0); /* pixels 0-7 come from lanes 0-3 of y_l then y_h, pixels 8-15 from lanes 4-7 */
+        WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
+        WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
+        WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
+        WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
+        WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
+        WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
+        WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
+    }
+    if (dstW - i >= 8) { /* at most one further group of 8 pixels (4 chroma samples) */
+        int Y1, Y2, U, V;
+        int i_dex = i << 1;
+        __m256i y0_l, y0, u0, v0;
+        __m256i y1_l, y1, u1, v1;
+        __m256i y_l, u, v;
+
+        y0   = __lasx_xvldx(buf0, i_dex); /* NOTE(review): 256-bit load although only 8 luma samples are used - confirm the row is readable that far */
+        u0   = __lasx_xvldrepl_d((ubuf0 + count), 0); /* four chroma samples */
+        v0   = __lasx_xvldrepl_d((vbuf0 + count), 0);
+        y1   = __lasx_xvldx(buf1, i_dex);
+        u1   = __lasx_xvldrepl_d((ubuf1 + count), 0);
+        v1   = __lasx_xvldrepl_d((vbuf1 + count), 0);
+        DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
+        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
+        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
+        u0   = __lasx_xvmul_w(u0, v_uvalpha1);
+        v0   = __lasx_xvmul_w(v0, v_uvalpha1);
+        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
+        u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
+        v    = __lasx_xvmadd_w(v0, v_uvalpha, v1);
+        y_l  = __lasx_xvsrai_w(y_l, 19);
+        u    = __lasx_xvsrai_w(u, 19);
+        v    = __lasx_xvsrai_w(v, 19);
+        u    = __lasx_xvadd_w(u, headroom);
+        v    = __lasx_xvadd_w(v, headroom);
+        WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
+        WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
+        WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 2, 2);
+        WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 3, 3);
+        i += 8; /* i is not read after this point; the tail below is driven by count */
+    }
+    for (; count < len_count; count++) { /* scalar tail for the remaining pixel pairs */
+        int Y1 = (buf0[count * 2]     * yalpha1  +
+                  buf1[count * 2]     * yalpha)  >> 19;
+        int Y2 = (buf0[count * 2 + 1] * yalpha1  +
+                  buf1[count * 2 + 1] * yalpha) >> 19;
+        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
+        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
+
+        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM], /* the three assignments are joined by comma operators into one statement */
+        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+             c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+                      r, g, b, y, target, 0);
+    }
+}
+
+static void
+yuv2rgb_1_template_lasx(SwsContext *c, const int16_t *buf0,
+                        const int16_t *ubuf[2], const int16_t *vbuf[2],
+                        const int16_t *abuf0, uint8_t *dest, int dstW,
+                        int uvalpha, int y, enum AVPixelFormat target,
+                        int hasAlpha)
+{
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+    int i;
+    int len       = (dstW - 15);
+    int len_count = (dstW + 1) >> 1;
+    const void *r, *g, *b;
+
+    if (uvalpha < 2048) {
+        int count    = 0;
+        int head = YUVRGB_TABLE_HEADROOM;
+        __m256i headroom  = __lasx_xvreplgr2vr_h(head);
+
+        for (i = 0; i < len; i += 16) {
+            int Y1, Y2, U, V;
+            int i_dex = i << 1;
+            int c_dex = count << 1;
+            __m256i src_y, src_u, src_v;
+            __m256i u, v, y_l, y_h;
+
+            DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
+            src_v = __lasx_xvldx(vbuf0, c_dex);
+            src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
+            src_y = __lasx_xvsrari_h(src_y, 7);
+            src_u = __lasx_xvsrari_h(src_u, 7);
+            y_l   = __lasx_xvsllwil_w_h(src_y, 0);
+            y_h   = __lasx_xvexth_w_h(src_y);
+            u     = __lasx_xvaddwev_w_h(src_u, headroom);
+            v     = __lasx_xvaddwod_w_h(src_u, headroom);
+            WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
+            WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
+            WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
+            WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
+            WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
+            WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
+            WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
+            WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
+        }
+        if (dstW - i >= 8){
+            int Y1, Y2, U, V;
+            int i_dex = i << 1;
+            __m256i src_y, src_u, src_v;
+            __m256i y_l, uv;
+
+            src_y  = __lasx_xvldx(buf0, i_dex);
+            src_u  = __lasx_xvldrepl_d((ubuf0 + count), 0);
+            src_v  = __lasx_xvldrepl_d((vbuf0 + count), 0);
+            src_u  = __lasx_xvilvl_d(src_v, src_u);
+            y_l    = __lasx_xvsrari_h(src_y, 7);
+            uv     = __lasx_xvsrari_h(src_u, 7);
+            y_l    = __lasx_vext2xv_w_h(y_l);
+            uv     = __lasx_vext2xv_w_h(uv);
+            uv     = __lasx_xvaddwev_w_h(uv, headroom);
+            WRITE_YUV2RGB(y_l, y_l, uv, uv, 0, 1, 0, 4);
+            WRITE_YUV2RGB(y_l, y_l, uv, uv, 2, 3, 1, 5);
+            WRITE_YUV2RGB(y_l, y_l, uv, uv, 4, 5, 2, 6);
+            WRITE_YUV2RGB(y_l, y_l, uv, uv, 6, 7, 3, 7);
+            i += 8;
+        }
+        for (; count < len_count; count++) {
+            int Y1 = (buf0[count * 2    ] + 64) >> 7;
+            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
+            int U  = (ubuf0[count]        + 64) >> 7;
+            int V  = (vbuf0[count]        + 64) >> 7;
+
+            r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+            b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+                          r, g, b, y, target, 0);
+        }
+    } else {
+        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+        int count = 0;
+        int HEADROOM = YUVRGB_TABLE_HEADROOM;
+        __m256i headroom    = __lasx_xvreplgr2vr_w(HEADROOM);
+
+        for (i = 0; i < len; i += 16) {
+            int Y1, Y2, U, V;
+            int i_dex = i << 1;
+            int c_dex = count << 1;
+            __m256i src_y, src_u0, src_v0, src_u1, src_v1;
+            __m256i y_l, y_h, u, v;
+
+            DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
+                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
+            src_v1 = __lasx_xvldx(vbuf1, c_dex);
+            src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
+            src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
+            src_y  = __lasx_xvsrari_h(src_y, 7);
+            u      = __lasx_xvaddwev_w_h(src_u0, src_u1);
+            v      = __lasx_xvaddwod_w_h(src_u0, src_u1);
+            y_l    = __lasx_xvsllwil_w_h(src_y, 0);
+            y_h    = __lasx_xvexth_w_h(src_y);
+            u      = __lasx_xvsrari_w(u, 8);
+            v      = __lasx_xvsrari_w(v, 8);
+            u      = __lasx_xvadd_w(u, headroom);
+            v      = __lasx_xvadd_w(v, headroom);
+            WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
+            WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
+            WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
+            WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
+            WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
+            WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
+            WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
+            WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
+        }
+        if (dstW - i >= 8) {
+            int Y1, Y2, U, V;
+            int i_dex = i << 1;
+            __m256i src_y, src_u0, src_v0, src_u1, src_v1;
+            __m256i uv;
+
+            src_y  = __lasx_xvldx(buf0, i_dex);
+            src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
+            src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
+            src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
+            src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
+
+            src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
+            src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
+            src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
+            src_y  = __lasx_xvsrari_h(src_y, 7);
+            uv     = __lasx_xvhaddw_w_h(src_u0, src_u0);
+            src_y  = __lasx_vext2xv_w_h(src_y);
+            uv     = __lasx_xvsrari_w(uv, 8);
+            uv     = __lasx_xvadd_w(uv, headroom);
+            WRITE_YUV2RGB(src_y, src_y, uv, uv, 0, 1, 0, 4);
+            WRITE_YUV2RGB(src_y, src_y, uv, uv, 2, 3, 1, 5);
+            WRITE_YUV2RGB(src_y, src_y, uv, uv, 4, 5, 2, 6);
+            WRITE_YUV2RGB(src_y, src_y, uv, uv, 6, 7, 3, 7);
+            i += 8;
+        }
+        for (; count < len_count; count++) {
+            int Y1 = (buf0[count * 2    ]         +  64) >> 7;
+            int Y2 = (buf0[count * 2 + 1]         +  64) >> 7;
+            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
+            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;
+
+            r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
+                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+            b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
+
+            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
+                          r, g, b, y, target, 0);
+        }
+    }
+}
+
+/*
+ * Defines the static function name##ext##_X_lasx. It takes the filter-based
+ * argument list shown below and forwards every argument unchanged to
+ * name##base##_X_template_lasx, appending fmt and hasAlpha as the last two
+ * arguments.
+ */
+#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                      \
+static void name ## ext ## _X_lasx(SwsContext *c,                            \
+                                   const int16_t *lumFilter,                 \
+                                   const int16_t **lumSrc,                   \
+                                   int lumFilterSize,                        \
+                                   const int16_t *chrFilter,                 \
+                                   const int16_t **chrUSrc,                  \
+                                   const int16_t **chrVSrc,                  \
+                                   int chrFilterSize,                        \
+                                   const int16_t **alpSrc,                   \
+                                   uint8_t *dest, int dstW, int y)           \
+{                                                                            \
+    name ## base ## _X_template_lasx(c,                                      \
+                                     lumFilter, lumSrc, lumFilterSize,       \
+                                     chrFilter, chrUSrc, chrVSrc,            \
+                                     chrFilterSize,                          \
+                                     alpSrc, dest, dstW, y,                  \
+                                     fmt, hasAlpha);                         \
+}
+
+/*
+ * Expands YUV2RGBWRAPPERX with the same arguments, then defines the static
+ * function name##ext##_2_lasx, which forwards its arguments unchanged to
+ * name##base##_2_template_lasx followed by fmt and hasAlpha.
+ */
+#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                     \
+YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                              \
+static void name ## ext ## _2_lasx(SwsContext *c,                            \
+                                   const int16_t *buf[2],                    \
+                                   const int16_t *ubuf[2],                   \
+                                   const int16_t *vbuf[2],                   \
+                                   const int16_t *abuf[2],                   \
+                                   uint8_t *dest, int dstW,                  \
+                                   int yalpha, int uvalpha, int y)           \
+{                                                                            \
+    name ## base ## _2_template_lasx(c, buf, ubuf, vbuf, abuf,               \
+                                     dest, dstW, yalpha, uvalpha, y,         \
+                                     fmt, hasAlpha);                         \
+}
+
+/*
+ * Expands YUV2RGBWRAPPERX2 with the same arguments (which in turn emits the
+ * _X_lasx and _2_lasx functions), then defines the static function
+ * name##ext##_1_lasx, which forwards its arguments unchanged to
+ * name##base##_1_template_lasx followed by fmt and hasAlpha.
+ */
+#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                       \
+YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                             \
+static void name ## ext ## _1_lasx(SwsContext *c,                            \
+                                   const int16_t *buf0,                      \
+                                   const int16_t *ubuf[2],                   \
+                                   const int16_t *vbuf[2],                   \
+                                   const int16_t *abuf0,                     \
+                                   uint8_t *dest, int dstW,                  \
+                                   int uvalpha, int y)                       \
+{                                                                            \
+    name ## base ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0,             \
+                                     dest, dstW, uvalpha, y,                 \
+                                     fmt, hasAlpha);                         \
+}
+
+
+/*
+ * Instantiate the _X_lasx / _2_lasx / _1_lasx wrappers for each packed RGB
+ * output format. Every instance is generated with hasAlpha == 0.
+ */
+#if !CONFIG_SMALL
+/* The two 32-bit variants are only generated when CONFIG_SMALL is disabled. */
+YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1,   0)
+YUV2RGBWRAPPER(yuv2rgb,, x32,   AV_PIX_FMT_RGB32,     0)
+#endif
+/* rgb24/bgr24 pass "yuv2" + "rgb" so they resolve to the same
+ * yuv2rgb_*_template_lasx functions as the entries that pass an empty base. */
+YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24,    0)
+YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24,    0)
+YUV2RGBWRAPPER(yuv2rgb,, 16,    AV_PIX_FMT_RGB565,    0)
+YUV2RGBWRAPPER(yuv2rgb,, 15,    AV_PIX_FMT_RGB555,    0)
+YUV2RGBWRAPPER(yuv2rgb,, 12,    AV_PIX_FMT_RGB444,    0)
+YUV2RGBWRAPPER(yuv2rgb,, 8,     AV_PIX_FMT_RGB8,      0)
+YUV2RGBWRAPPER(yuv2rgb,, 4,     AV_PIX_FMT_RGB4,      0)
+YUV2RGBWRAPPER(yuv2rgb,, 4b,    AV_PIX_FMT_RGB4_BYTE, 0)
+
+/*
+ * Store one output pixel for the full-chroma RGB writers.
+ * This function is copied from libswscale/output.c
+ *
+ * R, G and B are fixed-point values. If any of them has one of its two top
+ * bits set, all three are clamped to 30 bits; the 8-bit sample is then taken
+ * as (value >> 22). For targets with an alpha byte, A is stored when hasAlpha
+ * is non-zero and 255 otherwise.
+ *
+ * For the 4-bit and 8-bit packed targets the components are reduced according
+ * to c->dither. The error-diffusion mode reads err[0..2] and
+ * c->dither_error[k][i .. i+2], stores the incoming err[] into
+ * c->dither_error[k][i] and leaves the new quantisation error in err[].
+ *
+ * dest points at the pixel to write; i (column) and y (row) are only used by
+ * the dithering code. Targets not listed in the switch write nothing.
+ */
+static av_always_inline void yuv2rgb_write_full(SwsContext *c,
+    uint8_t *dest, int i, int R, int A, int G, int B,
+    int y, enum AVPixelFormat target, int hasAlpha, int err[4])
+{
+    const int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
+    const int alpha  = hasAlpha ? A : 255;
+    int r8, g8, b8;
+
+    /* A set bit in the top two positions means at least one component is
+     * negative or >= 2^30; clamp all three in that case. */
+    if ((R | G | B) & 0xC0000000) {
+        R = av_clip_uintp2(R, 30);
+        G = av_clip_uintp2(G, 30);
+        B = av_clip_uintp2(B, 30);
+    }
+
+    r8 = R >> 22;
+    g8 = G >> 22;
+    b8 = B >> 22;
+
+    switch (target) {
+    case AV_PIX_FMT_ARGB:
+        dest[0] = alpha;
+        dest[1] = r8;
+        dest[2] = g8;
+        dest[3] = b8;
+        break;
+    case AV_PIX_FMT_RGB24:
+        dest[0] = r8;
+        dest[1] = g8;
+        dest[2] = b8;
+        break;
+    case AV_PIX_FMT_RGBA:
+        dest[0] = r8;
+        dest[1] = g8;
+        dest[2] = b8;
+        dest[3] = alpha;
+        break;
+    case AV_PIX_FMT_ABGR:
+        dest[0] = alpha;
+        dest[1] = b8;
+        dest[2] = g8;
+        dest[3] = r8;
+        break;
+    case AV_PIX_FMT_BGR24:
+        dest[0] = b8;
+        dest[1] = g8;
+        dest[2] = r8;
+        break;
+    case AV_PIX_FMT_BGRA:
+        dest[0] = b8;
+        dest[1] = g8;
+        dest[2] = r8;
+        dest[3] = alpha;
+        break;
+    case AV_PIX_FMT_BGR4_BYTE:
+    case AV_PIX_FMT_RGB4_BYTE:
+    case AV_PIX_FMT_BGR8:
+    case AV_PIX_FMT_RGB8:
+    {
+        /* Quantised components: 3/3/2 bits (r/g/b) for the 8-bit targets,
+         * 1/2/1 bits for the 4-bit targets. */
+        int qr, qg, qb;
+
+        switch (c->dither) {
+        default:
+        case SWS_DITHER_AUTO:
+        case SWS_DITHER_ED:
+            /* Error diffusion: add the weighted error of the previous pixel
+             * (err[]) and of three neighbouring entries in c->dither_error. */
+            R = r8 + ((7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2]) >> 4);
+            G = g8 + ((7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2]) >> 4);
+            B = b8 + ((7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2]) >> 4);
+            c->dither_error[0][i] = err[0];
+            c->dither_error[1][i] = err[1];
+            c->dither_error[2][i] = err[2];
+            if (isrgb8) {
+                qr = av_clip(R >> 5, 0, 7);
+                qg = av_clip(G >> 5, 0, 7);
+                qb = av_clip(B >> 6, 0, 3);
+                err[0] = R - qr * 36;
+                err[1] = G - qg * 36;
+                err[2] = B - qb * 85;
+            } else {
+                qr = av_clip(R >> 7, 0, 1);
+                qg = av_clip(G >> 6, 0, 3);
+                qb = av_clip(B >> 7, 0, 1);
+                err[0] = R - qr * 255;
+                err[1] = G - qg * 85;
+                err[2] = B - qb * 255;
+            }
+            break;
+        case SWS_DITHER_A_DITHER:
+            /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))
+            if (isrgb8) {
+                qr = av_clip_uintp2(((R >> 19) + A_DITHER(i, y)          - 96) >> 8, 3);
+                qg = av_clip_uintp2(((G >> 19) + A_DITHER(i + 17, y)     - 96) >> 8, 3);
+                qb = av_clip_uintp2(((B >> 20) + A_DITHER(i + 17*2, y)   - 96) >> 8, 2);
+            } else {
+                qr = av_clip_uintp2(((R >> 21) + A_DITHER(i, y)          - 256) >> 8, 1);
+                qg = av_clip_uintp2(((G >> 19) + A_DITHER(i + 17, y)     - 256) >> 8, 2);
+                qb = av_clip_uintp2(((B >> 21) + A_DITHER(i + 17*2, y)   - 256) >> 8, 1);
+            }
+            break;
+        case SWS_DITHER_X_DITHER:
+            /* see http://pippin.gimp.org/a_dither/ for details/origin */
+#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
+            if (isrgb8) {
+                qr = av_clip_uintp2(((R >> 19) + X_DITHER(i, y)          - 96) >> 8, 3);
+                qg = av_clip_uintp2(((G >> 19) + X_DITHER(i + 17, y)     - 96) >> 8, 3);
+                qb = av_clip_uintp2(((B >> 20) + X_DITHER(i + 17*2, y)   - 96) >> 8, 2);
+            } else {
+                qr = av_clip_uintp2(((R >> 21) + X_DITHER(i, y)          - 256) >> 8, 1);
+                qg = av_clip_uintp2(((G >> 19) + X_DITHER(i + 17, y)     - 256) >> 8, 2);
+                qb = av_clip_uintp2(((B >> 21) + X_DITHER(i + 17*2, y)   - 256) >> 8, 1);
+            }
+            break;
+        }
+
+        /* Pack the quantised components into the single output byte. */
+        switch (target) {
+        case AV_PIX_FMT_BGR4_BYTE:
+            dest[0] = qr + 2*qg + 8*qb;
+            break;
+        case AV_PIX_FMT_RGB4_BYTE:
+            dest[0] = qb + 2*qg + 8*qr;
+            break;
+        case AV_PIX_FMT_BGR8:
+            dest[0] = qr + 8*qg + 64*qb;
+            break;
+        case AV_PIX_FMT_RGB8:
+            dest[0] = qb + 4*qg + 32*qr;
+            break;
+        default:
+            av_assert2(0);
+        }
+        break;
+    }
+    default:
+        break;
+    }
+}
+
+/*
+ * Declares, in the enclosing function, the YUV->RGB conversion constants read
+ * from the SwsContext `c`. Each constant is available both as a scalar int
+ * (y_offset, y_coeff, v2r_coe, v2g_coe, u2g_coe, u2b_coe) and replicated into
+ * every 32-bit element of an __m256i (offset, coeff, v2r, v2g, u2g, u2b).
+ */
+#define YUVTORGB_SETUP                                           \
+    int     y_offset = c->yuv2rgb_y_offset;                      \
+    __m256i offset   = __lasx_xvreplgr2vr_w(y_offset);           \
+    int     y_coeff  = c->yuv2rgb_y_coeff;                       \
+    __m256i coeff    = __lasx_xvreplgr2vr_w(y_coeff);            \
+    int     v2r_coe  = c->yuv2rgb_v2r_coeff;                     \
+    __m256i v2r      = __lasx_xvreplgr2vr_w(v2r_coe);            \
+    int     v2g_coe  = c->yuv2rgb_v2g_coeff;                     \
+    __m256i v2g      = __lasx_xvreplgr2vr_w(v2g_coe);            \
+    int     u2g_coe  = c->yuv2rgb_u2g_coeff;                     \
+    __m256i u2g      = __lasx_xvreplgr2vr_w(u2g_coe);            \
+    int     u2b_coe  = c->yuv2rgb_u2b_coeff;                     \
+    __m256i u2b      = __lasx_xvreplgr2vr_w(u2b_coe);
+
+
+/*
+ * Vector YUV -> RGB core, operating on 32-bit elements:
+ *   yy = (yy - y_off) * y_mul + y_add
+ *   rr = yy + vv * k_vr
+ *   gg = yy + vv * k_vg + uu * k_ug
+ *   bb = yy + uu * k_ub
+ * yy and vv are overwritten (vv ends up holding yy + vv * k_vg); uu is only
+ * read.
+ */
+#define YUVTORGB(yy, uu, vv, rr, gg, bb, y_off, y_mul,         \
+                 y_add, k_vr, k_vg, k_ug, k_ub)                \
+{                                                              \
+     yy = __lasx_xvmul_w(__lasx_xvsub_w(yy, y_off), y_mul);    \
+     yy = __lasx_xvadd_w(yy, y_add);                           \
+     rr = __lasx_xvmadd_w(yy, vv, k_vr);                       \
+     bb = __lasx_xvmadd_w(yy, uu, k_ub);                       \
+     vv = __lasx_xvmadd_w(yy, vv, k_vg);                       \
+     gg = __lasx_xvmadd_w(vv, uu, k_ug);                       \
+}
+
+/*
+ * Emit one pixel with alpha. Extracts 32-bit element `lane` of the four
+ * vectors into the enclosing function's R, G, B and A ints, clips A to 8 bits
+ * when bit 8 is set, writes the pixel for column i + px through
+ * yuv2rgb_write_full() and advances dest by step.
+ * Uses c, dest, i, y, target, hasAlpha, err and step from the calling scope.
+ */
+#define WRITE_FULL_A(vec_r, vec_g, vec_b, vec_a, lane, px)                   \
+{                                                                            \
+    A = __lasx_xvpickve2gr_w(vec_a, lane);                                   \
+    if (A & 0x100)                                                           \
+        A = av_clip_uint8(A);                                                \
+    R = __lasx_xvpickve2gr_w(vec_r, lane);                                   \
+    G = __lasx_xvpickve2gr_w(vec_g, lane);                                   \
+    B = __lasx_xvpickve2gr_w(vec_b, lane);                                   \
+    yuv2rgb_write_full(c, dest, i + px, R, A, G, B,                          \
+                       y, target, hasAlpha, err);                            \
+    dest += step;                                                            \
+}
+
+/*
+ * Emit one pixel without alpha. Extracts 32-bit element `lane` of the three
+ * vectors into the enclosing function's R, G and B ints, writes the pixel for
+ * column i + px through yuv2rgb_write_full() (passing 0 for A) and advances
+ * dest by step.
+ * Uses c, dest, i, y, target, hasAlpha, err and step from the calling scope.
+ */
+#define WRITE_FULL(vec_r, vec_g, vec_b, lane, px)                             \
+{                                                                             \
+    B = __lasx_xvpickve2gr_w(vec_b, lane);                                    \
+    G = __lasx_xvpickve2gr_w(vec_g, lane);                                    \
+    R = __lasx_xvpickve2gr_w(vec_r, lane);                                    \
+    yuv2rgb_write_full(c, dest, i + px, R, 0, G, B,                           \
+                       y, target, hasAlpha, err);                             \
+    dest += step;                                                             \
+}
+
+/*
+ * Full-chroma RGB output for the multi-tap vertical filter.
+ *
+ * For every output column the luma value is the sum over lumFilterSize rows
+ * of lumSrc[j][i] * lumFilter[j], and U/V are the sums over chrFilterSize rows
+ * of chrUSrc/chrVSrc[j][i] * chrFilter[j]. When hasAlpha is set, alpha is
+ * accumulated from alpSrc with the luma filter. The result is converted with
+ * the coefficients taken from c (YUVTORGB_SETUP) and stored through
+ * yuv2rgb_write_full(), which also handles dithering for the 4/8-bit targets.
+ *
+ * Columns are processed 16 at a time, then at most one block of 8, then one
+ * by one. Pixels are always emitted in increasing column order because dest
+ * is advanced sequentially and err[] is carried from pixel to pixel.
+ *
+ * dest   - start of the output line
+ * dstW   - number of pixels to write
+ * y      - output row, forwarded to yuv2rgb_write_full()
+ * target - output pixel format; selects the bytes-per-pixel step
+ */
+static void
+yuv2rgb_full_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
+                             const int16_t **lumSrc, int lumFilterSize,
+                             const int16_t *chrFilter, const int16_t **chrUSrc,
+                             const int16_t **chrVSrc, int chrFilterSize,
+                             const int16_t **alpSrc, uint8_t *dest,
+                             int dstW, int y, enum AVPixelFormat target,
+                             int hasAlpha)
+{
+    int i, j, B, G, R, A;
+    /* Bytes per output pixel: 3 for 24-bit, 4 otherwise (overridden to 1
+     * below for the 4/8-bit byte targets). */
+    int step       = (target == AV_PIX_FMT_RGB24 ||
+                      target == AV_PIX_FMT_BGR24) ? 3 : 4;
+    int err[4]     = {0};
+    /* Start value of the alpha accumulator (shifted right by 19 later). */
+    int a_temp     = 1 << 18;
+    /* Start value of the luma accumulator (shifted right by 10 later). */
+    int templ      = 1 << 9;
+    /* Start value of the chroma accumulators: same bias minus 128 << 19. */
+    int tempc      = templ - (128 << 19);
+    /* Added to the scaled luma before the chroma terms are added. */
+    int ytemp      = 1 << 21;
+    /* The 16-wide loop runs while at least 16 columns remain. */
+    int len        = dstW - 15;
+    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
+    YUVTORGB_SETUP
+
+    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
+        step = 1;
+
+    /* Main loop: 16 columns per iteration. */
+    for (i = 0; i < len; i += 16) {
+        __m256i l_src, u_src, v_src;
+        __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
+        __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+        /* Byte offset of column i in the int16_t source rows. */
+        int n = i << 1;
+
+        y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
+        u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
+        /* Widening multiply-accumulate: the *_ev accumulators take the
+         * even-indexed 16-bit samples (columns i, i+2, ...), the *_od
+         * accumulators the odd-indexed ones (columns i+1, i+3, ...). */
+        for (j = 0; j < lumFilterSize; j++) {
+            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
+            l_src = __lasx_xvldx(lumSrc[j], n);
+            y_ev  = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
+            y_od  = __lasx_xvmaddwod_w_h(y_od, l_src, temp);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
+            DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
+                      u_src, v_src);
+            DUP2_ARG3(__lasx_xvmaddwev_w_h, u_ev, u_src, temp, v_ev,
+                      v_src, temp, u_ev, v_ev);
+            DUP2_ARG3(__lasx_xvmaddwod_w_h, u_od, u_src, temp, v_od,
+                      v_src, temp, u_od, v_od);
+        }
+        y_ev = __lasx_xvsrai_w(y_ev, 10);
+        y_od = __lasx_xvsrai_w(y_od, 10);
+        u_ev = __lasx_xvsrai_w(u_ev, 10);
+        u_od = __lasx_xvsrai_w(u_od, 10);
+        v_ev = __lasx_xvsrai_w(v_ev, 10);
+        v_od = __lasx_xvsrai_w(v_od, 10);
+        YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+                 y_temp, v2r, v2g, u2g, u2b);
+        YUVTORGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+                 y_temp, v2r, v2g, u2g, u2b);
+
+        /* Alternate between the even and odd vectors so that columns
+         * i .. i+15 are written in order. */
+        if (hasAlpha) {
+            __m256i a_src, a_ev, a_od;
+
+            a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
+            for (j = 0; j < lumFilterSize; j++) {
+                temp  = __lasx_xvldrepl_h(lumFilter + j, 0);
+                a_src = __lasx_xvldx(alpSrc[j], n);
+                a_ev  = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
+                a_od  = __lasx_xvmaddwod_w_h(a_od, a_src, temp);
+            }
+            a_ev = __lasx_xvsrai_w(a_ev, 19);
+            a_od = __lasx_xvsrai_w(a_od, 19);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
+            WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
+            WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
+            WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
+            WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
+            WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
+            WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
+            WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
+            WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
+        } else {
+            WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
+            WRITE_FULL(R_od, G_od, B_od, 0, 1);
+            WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
+            WRITE_FULL(R_od, G_od, B_od, 1, 3);
+            WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
+            WRITE_FULL(R_od, G_od, B_od, 2, 5);
+            WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
+            WRITE_FULL(R_od, G_od, B_od, 3, 7);
+            WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
+            WRITE_FULL(R_od, G_od, B_od, 4, 9);
+            WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
+            WRITE_FULL(R_od, G_od, B_od, 5, 11);
+            WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
+            WRITE_FULL(R_od, G_od, B_od, 6, 13);
+            WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
+            WRITE_FULL(R_od, G_od, B_od, 7, 15);
+        }
+    }
+    /* One block of 8 columns if at least 8 remain.
+     * NOTE(review): each __lasx_xvldx below still loads a full 256-bit vector
+     * (16 samples) although only 8 are used; presumably the source rows are
+     * padded enough for this - confirm. */
+    if (dstW - i >= 8) {
+        __m256i l_src, u_src, v_src;
+        __m256i y_ev, u_ev, v_ev, uv, temp;
+        __m256i R_ev, G_ev, B_ev;
+        int n = i << 1;
+
+        y_ev = __lasx_xvreplgr2vr_w(templ);
+        u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
+        for (j = 0; j < lumFilterSize; j++) {
+            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
+            l_src = __lasx_xvldx(lumSrc[j], n);
+            /* Put samples 0-3 in the low 128-bit half and samples 4-7 in the
+             * high half, then duplicate each sample so that the even 16-bit
+             * slots hold samples 0-7 in order. */
+            l_src = __lasx_xvpermi_d(l_src, 0xD8);
+            l_src = __lasx_xvilvl_h(l_src, l_src);
+            y_ev  = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
+            DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
+            u_src = __lasx_xvpermi_d(u_src, 0xD8);
+            v_src = __lasx_xvpermi_d(v_src, 0xD8);
+            /* Interleave U (even slots) with V (odd slots) so one vector
+             * feeds both accumulators. */
+            uv    = __lasx_xvilvl_h(v_src, u_src);
+            u_ev  = __lasx_xvmaddwev_w_h(u_ev, uv, temp);
+            v_ev  = __lasx_xvmaddwod_w_h(v_ev, uv, temp);
+        }
+        y_ev = __lasx_xvsrai_w(y_ev, 10);
+        u_ev = __lasx_xvsrai_w(u_ev, 10);
+        v_ev = __lasx_xvsrai_w(v_ev, 10);
+        YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+                 y_temp, v2r, v2g, u2g, u2b);
+
+        /* Here element k of each vector is column i + k. */
+        if (hasAlpha) {
+            __m256i a_src, a_ev;
+
+            a_ev = __lasx_xvreplgr2vr_w(a_temp);
+            for (j = 0; j < lumFilterSize; j++) {
+                temp  = __lasx_xvldrepl_h(lumFilter + j, 0);
+                a_src = __lasx_xvldx(alpSrc[j], n);
+                a_src = __lasx_xvpermi_d(a_src, 0xD8);
+                a_src = __lasx_xvilvl_h(a_src, a_src);
+                a_ev  =  __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
+            }
+            a_ev = __lasx_xvsrai_w(a_ev, 19);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 1);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 2);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 3);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 4);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 5);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 6);
+            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 7);
+        } else {
+            WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
+            WRITE_FULL(R_ev, G_ev, B_ev, 1, 1);
+            WRITE_FULL(R_ev, G_ev, B_ev, 2, 2);
+            WRITE_FULL(R_ev, G_ev, B_ev, 3, 3);
+            WRITE_FULL(R_ev, G_ev, B_ev, 4, 4);
+            WRITE_FULL(R_ev, G_ev, B_ev, 5, 5);
+            WRITE_FULL(R_ev, G_ev, B_ev, 6, 6);
+            WRITE_FULL(R_ev, G_ev, B_ev, 7, 7);
+        }
+        i += 8;
+    }
+    /* Scalar tail: same arithmetic, one column at a time. */
+    for (; i < dstW; i++) {
+        int Y = templ;
+        int V, U = V = tempc;
+
+        A = 0;
+        for (j = 0; j < lumFilterSize; j++) {
+            Y += lumSrc[j][i] * lumFilter[j];
+        }
+        for (j = 0; j < chrFilterSize; j++) {
+            U += chrUSrc[j][i] * chrFilter[j];
+            V += chrVSrc[j][i] * chrFilter[j];
+
+        }
+        Y >>= 10;
+        U >>= 10;
+        V >>= 10;
+        if (hasAlpha) {
+            A = 1 << 18;
+            for (j = 0; j < lumFilterSize; j++) {
+                A += alpSrc[j][i] * lumFilter[j];
+            }
+            A >>= 19;
+            if (A & 0x100)
+                A = av_clip_uint8(A);
+        }
+        Y -= y_offset;
+        Y *= y_coeff;
+        Y += ytemp;
+        /* The additions are carried out in unsigned arithmetic. */
+        R  = (unsigned)Y + V * v2r_coe;
+        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+        B  = (unsigned)Y + U * u2b_coe;
+        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+        dest += step;
+    }
+    /* Store the error left over from the last pixel at index i, the column
+     * just past the last one written. */
+    c->dither_error[0][i] = err[0];
+    c->dither_error[1][i] = err[1];
+    c->dither_error[2][i] = err[2];
+}
+
+/*
+ * Full-chroma RGB output blending two source lines.
+ *
+ * For every output column, luma is buf[0] * (4096 - yalpha) + buf[1] * yalpha
+ * and U/V are ubuf/vbuf[0] * (4096 - uvalpha) + ubuf/vbuf[1] * uvalpha minus
+ * 128 << 19; both are then shifted right by 10. When hasAlpha is set, alpha
+ * is blended from abuf[0]/abuf[1] with the luma weights, biased by 1 << 18
+ * and shifted right by 19. The result is converted with the coefficients
+ * taken from c (YUVTORGB_SETUP) and stored through yuv2rgb_write_full().
+ *
+ * Columns are processed 16 at a time, then at most one block of 8, then one
+ * by one, always in increasing column order.
+ *
+ * dest   - start of the output line
+ * dstW   - number of pixels to write
+ * y      - output row, forwarded to yuv2rgb_write_full()
+ * target - output pixel format; selects the bytes-per-pixel step
+ */
+static void
+yuv2rgb_full_2_template_lasx(SwsContext *c, const int16_t *buf[2],
+                             const int16_t *ubuf[2], const int16_t *vbuf[2],
+                             const int16_t *abuf[2], uint8_t *dest, int dstW,
+                             int yalpha, int uvalpha, int y,
+                             enum AVPixelFormat target, int hasAlpha)
+{
+    /* abuf is only dereferenced when hasAlpha is set. */
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+                  *abuf0 = hasAlpha ? abuf[0] : NULL,
+                  *abuf1 = hasAlpha ? abuf[1] : NULL;
+    /* Weights of the first line; yalpha/uvalpha weight the second line. */
+    int yalpha1  = 4096 - yalpha;
+    int uvalpha1 = 4096 - uvalpha;
+    /* Subtracted from the blended chroma before the shift. */
+    int uvtemp   = 128 << 19;
+    /* Added to the blended alpha before the shift by 19. */
+    int atemp    = 1 << 18;
+    int err[4]   = {0};
+    /* Added to the scaled luma before the chroma terms are added. */
+    int ytemp    = 1 << 21;
+    /* The 16-wide loop runs while at least 16 columns remain. */
+    int len      = dstW - 15;
+    int i, R, G, B, A;
+    /* Bytes per output pixel: 3 for 24-bit, 4 otherwise (overridden to 1
+     * below for the 4/8-bit byte targets). */
+    int step = (target == AV_PIX_FMT_RGB24 ||
+                target == AV_PIX_FMT_BGR24) ? 3 : 4;
+    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
+    __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
+    __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
+    __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
+    __m256i uv         = __lasx_xvreplgr2vr_w(uvtemp);
+    __m256i a_bias     = __lasx_xvreplgr2vr_w(atemp);
+    __m256i y_temp     = __lasx_xvreplgr2vr_w(ytemp);
+    YUVTORGB_SETUP
+
+    av_assert2(yalpha  <= 4096U);
+    av_assert2(uvalpha <= 4096U);
+
+    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
+        step = 1;
+
+    /* Main loop: 16 columns per iteration. */
+    for (i = 0; i < len; i += 16) {
+        __m256i b0, b1, ub0, ub1, vb0, vb1;
+        __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
+        __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
+        __m256i y_l, y_h, v_l, v_h, u_l, u_h;
+        __m256i R_l, R_h, G_l, G_h, B_l, B_h;
+        /* Byte offset of column i in the int16_t source rows. */
+        int n = i << 1;
+
+        DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
+                  n, ubuf1, n, b0, b1, ub0, ub1);
+        DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
+        /* Widen to 32 bits. The *_l vectors take the low four samples of each
+         * 128-bit half (columns 0-3 and 8-11), the *_h vectors the high four
+         * (columns 4-7 and 12-15). */
+        DUP2_ARG2(__lasx_xvsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
+        DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
+                  u0_l, u1_l, v0_l, v1_l);
+        DUP2_ARG1(__lasx_xvexth_w_h, b0, b1, y0_h, y1_h);
+        DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
+                  u0_h, u1_h, v0_h, v1_h);
+        /* line0 * (4096 - alpha) ... */
+        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
+        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
+        u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
+        u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
+        v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
+        v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
+        /* ... + line1 * alpha */
+        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
+        y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
+        u_l  = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
+        u_h  = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
+        v_l  = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
+        v_h  = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
+        u_l  = __lasx_xvsub_w(u_l, uv);
+        u_h  = __lasx_xvsub_w(u_h, uv);
+        v_l  = __lasx_xvsub_w(v_l, uv);
+        v_h  = __lasx_xvsub_w(v_h, uv);
+        y_l  = __lasx_xvsrai_w(y_l, 10);
+        y_h  = __lasx_xvsrai_w(y_h, 10);
+        u_l  = __lasx_xvsrai_w(u_l, 10);
+        u_h  = __lasx_xvsrai_w(u_h, 10);
+        v_l  = __lasx_xvsrai_w(v_l, 10);
+        v_h  = __lasx_xvsrai_w(v_h, 10);
+        YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                 y_temp, v2r, v2g, u2g, u2b);
+        YUVTORGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+                 y_temp, v2r, v2g, u2g, u2b);
+
+        /* Emit in column order: _l elements 0-3, _h elements 0-3,
+         * _l elements 4-7, _h elements 4-7. */
+        if (hasAlpha) {
+            __m256i a0, a1, a0_l, a0_h;
+            __m256i a_l, a_h, a1_l, a1_h;
+
+            DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
+            DUP2_ARG2(__lasx_xvsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
+            DUP2_ARG1(__lasx_xvexth_w_h, a0, a1, a0_h, a1_h);
+            a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
+            a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
+            a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
+            a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
+            a_l = __lasx_xvsrai_w(a_l, 19);
+            a_h = __lasx_xvsrai_w(a_h, 19);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+            WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
+            WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
+            WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
+            WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
+            WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
+            WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
+            WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
+            WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
+        } else {
+            WRITE_FULL(R_l, G_l, B_l, 0, 0);
+            WRITE_FULL(R_l, G_l, B_l, 1, 1);
+            WRITE_FULL(R_l, G_l, B_l, 2, 2);
+            WRITE_FULL(R_l, G_l, B_l, 3, 3);
+            WRITE_FULL(R_h, G_h, B_h, 0, 4);
+            WRITE_FULL(R_h, G_h, B_h, 1, 5);
+            WRITE_FULL(R_h, G_h, B_h, 2, 6);
+            WRITE_FULL(R_h, G_h, B_h, 3, 7);
+            WRITE_FULL(R_l, G_l, B_l, 4, 8);
+            WRITE_FULL(R_l, G_l, B_l, 5, 9);
+            WRITE_FULL(R_l, G_l, B_l, 6, 10);
+            WRITE_FULL(R_l, G_l, B_l, 7, 11);
+            WRITE_FULL(R_h, G_h, B_h, 4, 12);
+            WRITE_FULL(R_h, G_h, B_h, 5, 13);
+            WRITE_FULL(R_h, G_h, B_h, 6, 14);
+            WRITE_FULL(R_h, G_h, B_h, 7, 15);
+        }
+    }
+    /* One block of 8 columns if at least 8 remain.
+     * NOTE(review): each __lasx_xvldx below still loads a full 256-bit vector
+     * (16 samples) although only 8 are used; presumably the source rows are
+     * padded enough for this - confirm. */
+    if (dstW - i >= 8) {
+        __m256i b0, b1, ub0, ub1, vb0, vb1;
+        __m256i y0_l, y1_l, u0_l;
+        __m256i v0_l, u1_l, v1_l;
+        __m256i y_l, u_l, v_l;
+        __m256i R_l, G_l, B_l;
+        int n = i << 1;
+
+        DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
+                  ubuf1, n, b0, b1, ub0, ub1);
+        DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
+        /* Widen the first eight samples to eight 32-bit elements, in order. */
+        DUP2_ARG1(__lasx_vext2xv_w_h, b0, b1, y0_l, y1_l);
+        DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
+                  u0_l, u1_l, v0_l, v1_l);
+        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
+        u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
+        v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
+        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
+        u_l  = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
+        v_l  = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
+        u_l  = __lasx_xvsub_w(u_l, uv);
+        v_l  = __lasx_xvsub_w(v_l, uv);
+        y_l  = __lasx_xvsrai_w(y_l, 10);
+        u_l  = __lasx_xvsrai_w(u_l, 10);
+        v_l  = __lasx_xvsrai_w(v_l, 10);
+        YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                 y_temp, v2r, v2g, u2g, u2b);
+
+        /* Here element k of each vector is column i + k. */
+        if (hasAlpha) {
+            __m256i a0, a1, a0_l;
+            __m256i a_l, a1_l;
+
+            DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
+            DUP2_ARG1(__lasx_vext2xv_w_h, a0, a1, a0_l, a1_l);
+            a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
+            a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
+            a_l = __lasx_xvsrai_w(a_l, 19);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
+            WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
+        } else {
+            WRITE_FULL(R_l, G_l, B_l, 0, 0);
+            WRITE_FULL(R_l, G_l, B_l, 1, 1);
+            WRITE_FULL(R_l, G_l, B_l, 2, 2);
+            WRITE_FULL(R_l, G_l, B_l, 3, 3);
+            WRITE_FULL(R_l, G_l, B_l, 4, 4);
+            WRITE_FULL(R_l, G_l, B_l, 5, 5);
+            WRITE_FULL(R_l, G_l, B_l, 6, 6);
+            WRITE_FULL(R_l, G_l, B_l, 7, 7);
+        }
+        i += 8;
+    }
+    /* Scalar tail: same arithmetic, one column at a time. */
+    for (; i < dstW; i++){
+        int Y = ( buf0[i] * yalpha1  +  buf1[i] * yalpha         ) >> 10;
+        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha- uvtemp) >> 10;
+        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha- uvtemp) >> 10;
+
+        A = 0;
+        if (hasAlpha){
+            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
+            if (A & 0x100)
+                A = av_clip_uint8(A);
+        }
+
+        Y -= y_offset;
+        Y *= y_coeff;
+        Y += ytemp;
+        /* The additions are carried out in unsigned arithmetic. */
+        R  = (unsigned)Y + V * v2r_coe;
+        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+        B  = (unsigned)Y + U * u2b_coe;
+        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+        dest += step;
+    }
+    /* Store the error left over from the last pixel at index i, the column
+     * just past the last one written. */
+    c->dither_error[0][i] = err[0];
+    c->dither_error[1][i] = err[1];
+    c->dither_error[2][i] = err[2];
+}
+
+static void
+yuv2rgb_full_1_template_lasx(SwsContext *c, const int16_t *buf0,
+                             const int16_t *ubuf[2], const int16_t *vbuf[2],
+                             const int16_t *abuf0, uint8_t *dest, int dstW,
+                             int uvalpha, int y, enum AVPixelFormat target,
+                             int hasAlpha)
+{
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+    int i, B, G, R, A;
+    int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
+    int err[4]     = {0};
+    int ytemp      = 1 << 21;
+    int bias_int   = 64;           /* rounding term for the alpha ">> 7" */
+    int len        = dstW - 15;    /* 16-pixel loops run while i + 16 <= dstW */
+    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
+    YUVTORGB_SETUP /* presumably declares the offset/coefficient scalars and vectors used below */
+    /* One luma row -> packed RGB with per-pixel chroma: 16-px loop, one 8-px pass, scalar tail. */
+    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
+       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
+        step = 1;                  /* these targets advance dest by one byte per pixel */
+    if (uvalpha < 2048) {
+        int uvtemp   = 128 << 7;
+        __m256i uv   = __lasx_xvreplgr2vr_w(uvtemp);
+        __m256i bias = __lasx_xvreplgr2vr_w(bias_int);
+        /* Chroma comes from ubuf0/vbuf0 only: (c - (128 << 7)) << 2, as in the scalar tail. */
+        for (i = 0; i < len; i += 16) {
+            __m256i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
+            __m256i y_l, y_h, u_l, u_h, v_l, v_h;
+            __m256i R_l, R_h, G_l, G_h, B_l, B_h;
+            int n = i << 1;
+
+            DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
+            vb  = __lasx_xvldx(vbuf0, n);
+            y_l = __lasx_xvsllwil_w_h(b, 2);
+            y_h = __lasx_xvexth_w_h(b);
+            DUP2_ARG2(__lasx_xvsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
+            DUP2_ARG1(__lasx_xvexth_w_h, ub, vb, ub_h, vb_h);
+            y_h = __lasx_xvslli_w(y_h, 2);
+            u_l = __lasx_xvsub_w(ub_l, uv);
+            u_h = __lasx_xvsub_w(ub_h, uv);
+            v_l = __lasx_xvsub_w(vb_l, uv);
+            v_h = __lasx_xvsub_w(vb_h, uv);
+            u_l = __lasx_xvslli_w(u_l, 2);
+            u_h = __lasx_xvslli_w(u_h, 2);
+            v_l = __lasx_xvslli_w(v_l, 2);
+            v_h = __lasx_xvslli_w(v_h, 2);
+            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+            YUVTORGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+            /* _l/_h are written as l[0-3], h[0-3], l[4-7], h[4-7] to produce pixels 0..15 in order. */
+            if(hasAlpha) {
+                __m256i a_src;
+                __m256i a_l, a_h;
+
+                a_src = __lasx_xvld(abuf0 + i, 0);
+                a_l   = __lasx_xvsllwil_w_h(a_src, 0);
+                a_h   = __lasx_xvexth_w_h(a_src);
+                a_l   = __lasx_xvadd_w(a_l, bias);
+                a_h   = __lasx_xvadd_w(a_h, bias);
+                a_l   = __lasx_xvsrai_w(a_l, 7);
+                a_h   = __lasx_xvsrai_w(a_h, 7);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+                WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
+                WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
+                WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
+                WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
+                WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
+                WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
+                WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
+                WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
+            } else {
+                WRITE_FULL(R_l, G_l, B_l, 0, 0);
+                WRITE_FULL(R_l, G_l, B_l, 1, 1);
+                WRITE_FULL(R_l, G_l, B_l, 2, 2);
+                WRITE_FULL(R_l, G_l, B_l, 3, 3);
+                WRITE_FULL(R_h, G_h, B_h, 0, 4);
+                WRITE_FULL(R_h, G_h, B_h, 1, 5);
+                WRITE_FULL(R_h, G_h, B_h, 2, 6);
+                WRITE_FULL(R_h, G_h, B_h, 3, 7);
+                WRITE_FULL(R_l, G_l, B_l, 4, 8);
+                WRITE_FULL(R_l, G_l, B_l, 5, 9);
+                WRITE_FULL(R_l, G_l, B_l, 6, 10);
+                WRITE_FULL(R_l, G_l, B_l, 7, 11);
+                WRITE_FULL(R_h, G_h, B_h, 4, 12);
+                WRITE_FULL(R_h, G_h, B_h, 5, 13);
+                WRITE_FULL(R_h, G_h, B_h, 6, 14);
+                WRITE_FULL(R_h, G_h, B_h, 7, 15);
+            }
+        }
+        if (dstW - i >= 8) {       /* at most one 8-pixel pass for the remainder */
+            __m256i b, ub, vb, ub_l, vb_l;
+            __m256i y_l, u_l, v_l;
+            __m256i R_l, G_l, B_l;
+            int n = i << 1;
+
+            DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
+            vb  = __lasx_xvldx(vbuf0, n);
+            y_l = __lasx_vext2xv_w_h(b);
+            DUP2_ARG1(__lasx_vext2xv_w_h, ub, vb, ub_l, vb_l);
+            y_l = __lasx_xvslli_w(y_l, 2);
+            u_l = __lasx_xvsub_w(ub_l, uv);
+            v_l = __lasx_xvsub_w(vb_l, uv);
+            u_l = __lasx_xvslli_w(u_l, 2);
+            v_l = __lasx_xvslli_w(v_l, 2);
+            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+
+            if(hasAlpha) {
+                __m256i a_src, a_l;
+
+                a_src = __lasx_xvldx(abuf0, n);
+                a_src = __lasx_vext2xv_w_h(a_src);
+                a_l   = __lasx_xvadd_w(bias, a_src);
+                a_l   = __lasx_xvsrai_w(a_l, 7);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
+            } else {
+                WRITE_FULL(R_l, G_l, B_l, 0, 0);
+                WRITE_FULL(R_l, G_l, B_l, 1, 1);
+                WRITE_FULL(R_l, G_l, B_l, 2, 2);
+                WRITE_FULL(R_l, G_l, B_l, 3, 3);
+                WRITE_FULL(R_l, G_l, B_l, 4, 4);
+                WRITE_FULL(R_l, G_l, B_l, 5, 5);
+                WRITE_FULL(R_l, G_l, B_l, 6, 6);
+                WRITE_FULL(R_l, G_l, B_l, 7, 7);
+            }
+            i += 8;
+        }
+        for (; i < dstW; i++) {    /* scalar tail: fewer than 8 pixels remain */
+            int Y = buf0[i] << 2;
+            int U = (ubuf0[i] - uvtemp) << 2;
+            int V = (vbuf0[i] - uvtemp) << 2;
+
+            A = 0;
+            if(hasAlpha) {
+                A = (abuf0[i] + 64) >> 7;
+                if (A & 0x100)
+                    A = av_clip_uint8(A);
+            }
+            Y -= y_offset;
+            Y *= y_coeff;
+            Y += ytemp;
+            R  = (unsigned)Y + V * v2r_coe;
+            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+            B  = (unsigned)Y + U * u2b_coe;
+            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+            dest += step;
+        }
+    } else {
+        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+        int uvtemp   = 128 << 8;
+        __m256i uv   = __lasx_xvreplgr2vr_w(uvtemp);
+        __m256i zero = __lasx_xvldi(0);
+        __m256i bias = __lasx_xvreplgr2vr_h(bias_int);
+        /* Chroma is the sum of both rows: (c0 + c1 - (128 << 8)) << 1, as in the scalar tail. */
+        for (i = 0; i < len; i += 16) {
+            __m256i b, ub0, ub1, vb0, vb1;
+            __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
+            __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
+            int n = i << 1;
+            /* Pair U with U and V with V; the second V row is vbuf1, not the vbuf[] pointer table. */
+            DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
+                      ubuf1, n, b, ub0, vb0, ub1);
+            vb1 = __lasx_xvldx(vbuf1, n);
+            y_ev = __lasx_xvaddwev_w_h(b, zero);
+            y_od = __lasx_xvaddwod_w_h(b, zero);
+            DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, ub1, vb0, vb1, u_ev, v_ev);
+            DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, ub1, vb0, vb1, u_od, v_od);
+            DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
+            DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
+                      u_ev, u_od, v_ev, v_od);
+            DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
+                      u_ev, u_od, v_ev, v_od);
+            YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+            YUVTORGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+            /* _ev/_od hold the even/odd input lanes; the writes below re-interleave them. */
+            if(hasAlpha) {
+                __m256i a_src;
+                __m256i a_ev, a_od;
+
+                a_src = __lasx_xvld(abuf0 + i, 0);
+                a_ev  = __lasx_xvaddwev_w_h(bias, a_src);
+                a_od  = __lasx_xvaddwod_w_h(bias, a_src);
+                a_ev  = __lasx_xvsrai_w(a_ev, 7);
+                a_od  = __lasx_xvsrai_w(a_od, 7);
+                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
+                WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
+                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
+                WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
+                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
+                WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
+                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
+                WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
+                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
+                WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
+                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
+                WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
+                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
+                WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
+                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
+                WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
+            } else {
+                WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
+                WRITE_FULL(R_od, G_od, B_od, 0, 1);
+                WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
+                WRITE_FULL(R_od, G_od, B_od, 1, 3);
+                WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
+                WRITE_FULL(R_od, G_od, B_od, 2, 5);
+                WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
+                WRITE_FULL(R_od, G_od, B_od, 3, 7);
+                WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
+                WRITE_FULL(R_od, G_od, B_od, 4, 9);
+                WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
+                WRITE_FULL(R_od, G_od, B_od, 5, 11);
+                WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
+                WRITE_FULL(R_od, G_od, B_od, 6, 13);
+                WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
+                WRITE_FULL(R_od, G_od, B_od, 7, 15);
+            }
+        }
+        if (dstW - i >= 8) {       /* at most one 8-pixel pass for the remainder */
+            __m256i b, ub0, ub1, vb0, vb1;
+            __m256i y_l, u_l, v_l;
+            __m256i R_l, G_l, B_l;
+            int n = i << 1;
+
+            DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
+                      ubuf1, n, b, ub0, vb0, ub1);
+            vb1 = __lasx_xvldx(vbuf1, n);
+            y_l = __lasx_vext2xv_w_h(b);
+            y_l = __lasx_xvslli_w(y_l, 2);
+            DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
+                      ub0, vb0, ub1, vb1);
+            DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
+            u_l = __lasx_xvsub_w(u_l, uv);
+            v_l = __lasx_xvsub_w(v_l, uv);
+            u_l = __lasx_xvslli_w(u_l, 1);
+            v_l = __lasx_xvslli_w(v_l, 1);
+            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
+                     y_temp, v2r, v2g, u2g, u2b);
+
+            if(hasAlpha) {
+                __m256i a_src;
+                __m256i a_l;
+
+                a_src  = __lasx_xvld(abuf0 + i, 0);
+                a_src  = __lasx_xvpermi_d(a_src, 0xD8);
+                a_src  = __lasx_xvilvl_h(a_src, a_src);
+                a_l    = __lasx_xvaddwev_w_h(bias, a_src);
+                a_l   = __lasx_xvsrai_w(a_l, 7);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
+                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
+            } else {
+                WRITE_FULL(R_l, G_l, B_l, 0, 0);
+                WRITE_FULL(R_l, G_l, B_l, 1, 1);
+                WRITE_FULL(R_l, G_l, B_l, 2, 2);
+                WRITE_FULL(R_l, G_l, B_l, 3, 3);
+                WRITE_FULL(R_l, G_l, B_l, 4, 4);
+                WRITE_FULL(R_l, G_l, B_l, 5, 5);
+                WRITE_FULL(R_l, G_l, B_l, 6, 6);
+                WRITE_FULL(R_l, G_l, B_l, 7, 7);
+            }
+            i += 8;
+        }
+        for (; i < dstW; i++) {    /* scalar tail: fewer than 8 pixels remain */
+            int Y = buf0[i] << 2;
+            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
+            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
+
+            A = 0;
+            if(hasAlpha) {
+                A = (abuf0[i] + 64) >> 7;
+                if (A & 0x100)
+                    A = av_clip_uint8(A);
+            }
+            Y -= y_offset;
+            Y *= y_coeff;
+            Y += ytemp;
+            R  = (unsigned)Y + V * v2r_coe;
+            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
+            B  = (unsigned)Y + U * u2b_coe;
+            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
+            dest += step;
+        }
+    }
+    c->dither_error[0][i] = err[0];    /* i has reached dstW once the tail loop ends */
+    c->dither_error[1][i] = err[1];
+    c->dither_error[2][i] = err[2];
+}
+#if CONFIG_SMALL /* one wrapper per format; last argument (presumably hasAlpha) follows c->needAlpha */
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
+               CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
+               CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
+               CONFIG_SWSCALE_ALPHA && c->needAlpha)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
+               CONFIG_SWSCALE_ALPHA && c->needAlpha)
+#else /* separate wrappers with the last argument fixed to 1 ("a" names) or 0 ("x" names) */
+#if CONFIG_SWSCALE_ALPHA
+YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,  1)
+YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,  1)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,  1)
+YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,  1)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA,  0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR,  0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA,  0)
+YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB,  0)
+#endif
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full,  AV_PIX_FMT_BGR24, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full,  AV_PIX_FMT_RGB24, 0)
+
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full,  AV_PIX_FMT_BGR4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full,  AV_PIX_FMT_RGB4_BYTE, 0)
+YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
+YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)
+#undef YUVTORGB       /* names must match the upper-case macros used by the templates */
+#undef YUVTORGB_SETUP
+
+
+av_cold void ff_sws_init_output_loongarch(SwsContext *c)
+{
+    /* Install the LASX yuv2packed1/2/X writers matching c->dstFormat; unlisted formats are left untouched. */
+    if(c->flags & SWS_FULL_CHR_H_INT) { /* "_full" writer variants */
+        switch (c->dstFormat) {
+        case AV_PIX_FMT_RGBA:
+#if CONFIG_SMALL
+            c->yuv2packedX = yuv2rgba32_full_X_lasx;
+            c->yuv2packed2 = yuv2rgba32_full_2_lasx;
+            c->yuv2packed1 = yuv2rgba32_full_1_lasx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+                c->yuv2packedX = yuv2rgba32_full_X_lasx;
+                c->yuv2packed2 = yuv2rgba32_full_2_lasx;
+                c->yuv2packed1 = yuv2rgba32_full_1_lasx;
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packedX = yuv2rgbx32_full_X_lasx;
+                c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
+                c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_ARGB:
+#if CONFIG_SMALL
+            c->yuv2packedX = yuv2argb32_full_X_lasx;
+            c->yuv2packed2 = yuv2argb32_full_2_lasx;
+            c->yuv2packed1 = yuv2argb32_full_1_lasx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+                c->yuv2packedX = yuv2argb32_full_X_lasx;
+                c->yuv2packed2 = yuv2argb32_full_2_lasx;
+                c->yuv2packed1 = yuv2argb32_full_1_lasx;
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packedX = yuv2xrgb32_full_X_lasx;
+                c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
+                c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_BGRA:
+#if CONFIG_SMALL
+            c->yuv2packedX = yuv2bgra32_full_X_lasx;
+            c->yuv2packed2 = yuv2bgra32_full_2_lasx;
+            c->yuv2packed1 = yuv2bgra32_full_1_lasx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+                c->yuv2packedX = yuv2bgra32_full_X_lasx;
+                c->yuv2packed2 = yuv2bgra32_full_2_lasx;
+                c->yuv2packed1 = yuv2bgra32_full_1_lasx;
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packedX = yuv2bgrx32_full_X_lasx;
+                c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
+                c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_ABGR:
+#if CONFIG_SMALL
+            c->yuv2packedX = yuv2abgr32_full_X_lasx;
+            c->yuv2packed2 = yuv2abgr32_full_2_lasx;
+            c->yuv2packed1 = yuv2abgr32_full_1_lasx;
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) {
+                c->yuv2packedX = yuv2abgr32_full_X_lasx;
+                c->yuv2packed2 = yuv2abgr32_full_2_lasx;
+                c->yuv2packed1 = yuv2abgr32_full_1_lasx;
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packedX = yuv2xbgr32_full_X_lasx;
+                c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
+                c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_RGB24:
+            c->yuv2packedX = yuv2rgb24_full_X_lasx;
+            c->yuv2packed2 = yuv2rgb24_full_2_lasx;
+            c->yuv2packed1 = yuv2rgb24_full_1_lasx;
+            break;
+        case AV_PIX_FMT_BGR24:
+            c->yuv2packedX = yuv2bgr24_full_X_lasx;
+            c->yuv2packed2 = yuv2bgr24_full_2_lasx;
+            c->yuv2packed1 = yuv2bgr24_full_1_lasx;
+            break;
+        case AV_PIX_FMT_BGR4_BYTE:
+            c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
+            c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
+            c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
+            break;
+        case AV_PIX_FMT_RGB4_BYTE:
+            c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
+            c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
+            c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
+            break;
+        case AV_PIX_FMT_BGR8:
+            c->yuv2packedX = yuv2bgr8_full_X_lasx;
+            c->yuv2packed2 = yuv2bgr8_full_2_lasx;
+            c->yuv2packed1 = yuv2bgr8_full_1_lasx;
+            break;
+        case AV_PIX_FMT_RGB8:
+            c->yuv2packedX = yuv2rgb8_full_X_lasx;
+            c->yuv2packed2 = yuv2rgb8_full_2_lasx;
+            c->yuv2packed1 = yuv2rgb8_full_1_lasx;
+            break;
+    }
+    } else { /* writers without the "_full" suffix */
+        switch (c->dstFormat) {
+        case AV_PIX_FMT_RGB32:
+        case AV_PIX_FMT_BGR32:
+#if CONFIG_SMALL /* nothing is assigned here in CONFIG_SMALL builds */
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) { /* intentionally empty: no LASX writer is assigned when alpha is needed */
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packed1 = yuv2rgbx32_1_lasx;
+                c->yuv2packed2 = yuv2rgbx32_2_lasx;
+                c->yuv2packedX = yuv2rgbx32_X_lasx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_RGB32_1:
+        case AV_PIX_FMT_BGR32_1:
+#if CONFIG_SMALL /* nothing is assigned here in CONFIG_SMALL builds */
+#else
+#if CONFIG_SWSCALE_ALPHA
+            if (c->needAlpha) { /* intentionally empty: no LASX writer is assigned when alpha is needed */
+            } else
+#endif /* CONFIG_SWSCALE_ALPHA */
+            {
+                c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
+                c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
+                c->yuv2packedX = yuv2rgbx32_1_X_lasx;
+            }
+#endif /* !CONFIG_SMALL */
+            break;
+        case AV_PIX_FMT_RGB24:
+            c->yuv2packed1 = yuv2rgb24_1_lasx;
+            c->yuv2packed2 = yuv2rgb24_2_lasx;
+            c->yuv2packedX = yuv2rgb24_X_lasx;
+            break;
+        case AV_PIX_FMT_BGR24:
+            c->yuv2packed1 = yuv2bgr24_1_lasx;
+            c->yuv2packed2 = yuv2bgr24_2_lasx;
+            c->yuv2packedX = yuv2bgr24_X_lasx;
+            break;
+        case AV_PIX_FMT_RGB565LE:
+        case AV_PIX_FMT_RGB565BE:
+        case AV_PIX_FMT_BGR565LE:
+        case AV_PIX_FMT_BGR565BE:
+            c->yuv2packed1 = yuv2rgb16_1_lasx;
+            c->yuv2packed2 = yuv2rgb16_2_lasx;
+            c->yuv2packedX = yuv2rgb16_X_lasx;
+            break;
+        case AV_PIX_FMT_RGB555LE:
+        case AV_PIX_FMT_RGB555BE:
+        case AV_PIX_FMT_BGR555LE:
+        case AV_PIX_FMT_BGR555BE:
+            c->yuv2packed1 = yuv2rgb15_1_lasx;
+            c->yuv2packed2 = yuv2rgb15_2_lasx;
+            c->yuv2packedX = yuv2rgb15_X_lasx;
+            break;
+        case AV_PIX_FMT_RGB444LE:
+        case AV_PIX_FMT_RGB444BE:
+        case AV_PIX_FMT_BGR444LE:
+        case AV_PIX_FMT_BGR444BE:
+            c->yuv2packed1 = yuv2rgb12_1_lasx;
+            c->yuv2packed2 = yuv2rgb12_2_lasx;
+            c->yuv2packedX = yuv2rgb12_X_lasx;
+            break;
+        case AV_PIX_FMT_RGB8:
+        case AV_PIX_FMT_BGR8:
+            c->yuv2packed1 = yuv2rgb8_1_lasx;
+            c->yuv2packed2 = yuv2rgb8_2_lasx;
+            c->yuv2packedX = yuv2rgb8_X_lasx;
+            break;
+        case AV_PIX_FMT_RGB4:
+        case AV_PIX_FMT_BGR4:
+            c->yuv2packed1 = yuv2rgb4_1_lasx;
+            c->yuv2packed2 = yuv2rgb4_2_lasx;
+            c->yuv2packedX = yuv2rgb4_X_lasx;
+            break;
+        case AV_PIX_FMT_RGB4_BYTE:
+        case AV_PIX_FMT_BGR4_BYTE:
+            c->yuv2packed1 = yuv2rgb4b_1_lasx;
+            c->yuv2packed2 = yuv2rgb4b_2_lasx;
+            c->yuv2packedX = yuv2rgb4b_X_lasx;
+            break;
+        }
+    }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 1e0bb1b116..97fe947e2e 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -28,6 +28,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
     if (have_lasx(cpu_flags)) {
+        ff_sws_init_output_loongarch(c);
         if (c->srcBpc == 8) {
             if (c->dstBpc <= 14) {
                 c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -47,6 +48,8 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
             }
             break;
         }
+        if (c->dstBpc == 8)
+            c->yuv2planeX = ff_yuv2planeX_8_lasx;
     }
 }
 
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 07a8145da2..d6c0399737 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -69,4 +69,10 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
                               uint8_t *dest, int width, int height,
                               int src1Stride, int src2Stride, int dstStride);
 
+av_cold void ff_sws_init_output_loongarch(SwsContext *c);
+
+void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+                          const int16_t **src, uint8_t *dest, int dstW,
+                          const uint8_t *dither, int offset);
+
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.
  2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file Hao Chen
@ 2022-08-29 12:30   ` Andreas Rheinhardt
  2022-09-06  8:12     ` Shiyou Yin
  0 siblings, 1 reply; 13+ messages in thread
From: Andreas Rheinhardt @ 2022-08-29 12:30 UTC (permalink / raw)
  To: ffmpeg-devel

Hao Chen:
> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
> rgb24 -y /dev/null -an
> before: 150fps
> after:  183fps
> 
> Signed-off-by: Hao Chen <chenhao@loongson.cn>
> ---
>  libswscale/loongarch/Makefile                 |    3 +-
>  libswscale/loongarch/output_lasx.c            | 1982 +++++++++++++++++
>  libswscale/loongarch/swscale_init_loongarch.c |    3 +
>  libswscale/loongarch/swscale_loongarch.h      |    6 +
>  4 files changed, 1993 insertions(+), 1 deletion(-)
>  create mode 100644 libswscale/loongarch/output_lasx.c
> 
> diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
> index 4345971514..54d48b3de0 100644
> --- a/libswscale/loongarch/Makefile
> +++ b/libswscale/loongarch/Makefile
> @@ -2,4 +2,5 @@ OBJS-$(CONFIG_SWSCALE)      += loongarch/swscale_init_loongarch.o
>  LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
>                                 loongarch/input_lasx.o   \
>                                 loongarch/yuv2rgb_lasx.o \
> -                               loongarch/rgb2rgb_lasx.o
> +                               loongarch/rgb2rgb_lasx.o \
> +							   loongarch/output_lasx.o
> diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
> new file mode 100644
> index 0000000000..19f82692ff
> --- /dev/null
> +++ b/libswscale/loongarch/output_lasx.c
> @@ -0,0 +1,1982 @@
> +/*
> + * Copyright (C) 2022 Loongson Technology Corporation Limited
> + * Contributed by Hao Chen(chenhao@loongson.cn)
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "swscale_loongarch.h"
> +#include "libavutil/loongarch/loongson_intrinsics.h"
> +
> +void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
> +                          const int16_t **src, uint8_t *dest, int dstW,
> +                          const uint8_t *dither, int offset)
> +{
> +    int i;
> +    int len = dstW - 15;
> +    __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400,
> +                    0x1C0C180814041000, 0x1C1814100C080400};
> +    __m256i val1, val2, val3;
> +    uint8_t dither0 = dither[offset & 7];
> +    uint8_t dither1 = dither[(offset + 1) & 7];
> +    uint8_t dither2 = dither[(offset + 2) & 7];
> +    uint8_t dither3 = dither[(offset + 3) & 7];
> +    uint8_t dither4 = dither[(offset + 4) & 7];
> +    uint8_t dither5 = dither[(offset + 5) & 7];
> +    uint8_t dither6 = dither[(offset + 6) & 7];
> +    uint8_t dither7 = dither[(offset + 7) & 7];
> +    int val_1[8] = {dither0, dither2, dither4, dither6,
> +                    dither0, dither2, dither4, dither6};
> +    int val_2[8] = {dither1, dither3, dither5, dither7,
> +                    dither1, dither3, dither5, dither7};
> +    int val_3[8] = {dither0, dither1, dither2, dither3,
> +                    dither4, dither5, dither6, dither7};
> +
> +    DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
> +    val3 = __lasx_xvld(val_3, 0);
> +
> +    for (i = 0; i < len; i += 16) {
> +        int j;
> +        __m256i src0, filter0, val;
> +        __m256i val_ev, val_od;
> +
> +        val_ev = __lasx_xvslli_w(val1, 12);
> +        val_od = __lasx_xvslli_w(val2, 12);
> +
> +        for (j = 0; j < filterSize; j++) {
> +            src0  = __lasx_xvld(src[j]+ i, 0);
> +            filter0 = __lasx_xvldrepl_h((filter + j), 0);
> +            val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0);
> +            val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0);
> +        }
> +        val_ev = __lasx_xvsrai_w(val_ev, 19);
> +        val_od = __lasx_xvsrai_w(val_od, 19);
> +        val_ev = __lasx_xvclip255_w(val_ev);
> +        val_od = __lasx_xvclip255_w(val_od);
> +        val    = __lasx_xvshuf_b(val_od, val_ev, mask);
> +        __lasx_xvstelm_d(val, (dest + i), 0, 0);
> +        __lasx_xvstelm_d(val, (dest + i), 8, 2);
> +    }
> +    if (dstW - i >= 8){
> +        int j;
> +        __m256i src0, filter0, val_h;
> +        __m256i val_l;
> +
> +        val_l = __lasx_xvslli_w(val3, 12);
> +
> +        for (j = 0; j < filterSize; j++) {
> +            src0  = __lasx_xvld(src[j] + i, 0);
> +            src0  = __lasx_vext2xv_w_h(src0);
> +            filter0 = __lasx_xvldrepl_h((filter + j), 0);
> +            filter0 = __lasx_vext2xv_w_h(filter0);
> +            val_l = __lasx_xvmadd_w(val_l, src0, filter0);
> +        }
> +        val_l = __lasx_xvsrai_w(val_l, 19);
> +        val_l = __lasx_xvclip255_w(val_l);
> +        val_h = __lasx_xvpermi_d(val_l, 0x4E);
> +        val_l = __lasx_xvshuf_b(val_h, val_l, mask);
> +        __lasx_xvstelm_d(val_l, (dest + i), 0, 1);
> +        i += 8;
> +    }
> +    for (; i < dstW; i++) {
> +        int val = dither[(i + offset) & 7] << 12;
> +        int j;
> +        for (j = 0; j< filterSize; j++)
> +            val += src[j][i] * filter[j];
> +
> +        dest[i] = av_clip_uint8(val >> 19);
> +    }
> +}
> +
> +/*Copy from libswscale/output.c*/
> +static av_always_inline void
> +yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
> +              unsigned A1, unsigned A2,
> +              const void *_r, const void *_g, const void *_b, int y,
> +              enum AVPixelFormat target, int hasAlpha)
> +{
> +    if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
> +        target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
> +        uint32_t *dest = (uint32_t *) _dest;
> +        const uint32_t *r = (const uint32_t *) _r;
> +        const uint32_t *g = (const uint32_t *) _g;
> +        const uint32_t *b = (const uint32_t *) _b;
> +
> +#if CONFIG_SMALL
> +        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
> +        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
> +#else
> +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
> +        int sh = (target == AV_PIX_FMT_RGB32_1 ||
> +                  target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
> +        av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
> +#endif
> +        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
> +        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
> +#endif
> +    } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
> +        uint8_t *dest = (uint8_t *) _dest;
> +        const uint8_t *r = (const uint8_t *) _r;
> +        const uint8_t *g = (const uint8_t *) _g;
> +        const uint8_t *b = (const uint8_t *) _b;
> +
> +#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
> +#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
> +
> +        dest[i * 6 + 0] = r_b[Y1];
> +        dest[i * 6 + 1] =   g[Y1];
> +        dest[i * 6 + 2] = b_r[Y1];
> +        dest[i * 6 + 3] = r_b[Y2];
> +        dest[i * 6 + 4] =   g[Y2];
> +        dest[i * 6 + 5] = b_r[Y2];
> +#undef r_b
> +#undef b_r
> +    } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
> +               target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
> +               target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
> +        uint16_t *dest = (uint16_t *) _dest;
> +        const uint16_t *r = (const uint16_t *) _r;
> +        const uint16_t *g = (const uint16_t *) _g;
> +        const uint16_t *b = (const uint16_t *) _b;
> +        int dr1, dg1, db1, dr2, dg2, db2;
> +
> +        if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
> +            dr1 = ff_dither_2x2_8[ y & 1     ][0];
> +            dg1 = ff_dither_2x2_4[ y & 1     ][0];
> +            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
> +            dr2 = ff_dither_2x2_8[ y & 1     ][1];
> +            dg2 = ff_dither_2x2_4[ y & 1     ][1];
> +            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
> +    } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
> +            dr1 = ff_dither_2x2_8[ y & 1     ][0];
> +            dg1 = ff_dither_2x2_8[ y & 1     ][1];
> +            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
> +            dr2 = ff_dither_2x2_8[ y & 1     ][1];
> +            dg2 = ff_dither_2x2_8[ y & 1     ][0];
> +            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
> +        } else {
> +            dr1 = ff_dither_4x4_16[ y & 3     ][0];
> +            dg1 = ff_dither_4x4_16[ y & 3     ][1];
> +            db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
> +            dr2 = ff_dither_4x4_16[ y & 3     ][1];
> +            dg2 = ff_dither_4x4_16[ y & 3     ][0];
> +            db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
> +        }
> +
> +        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
> +        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
> +    } else /* 8/4 bits */ {
> +        uint8_t *dest = (uint8_t *) _dest;
> +        const uint8_t *r = (const uint8_t *) _r;
> +        const uint8_t *g = (const uint8_t *) _g;
> +        const uint8_t *b = (const uint8_t *) _b;
> +        int dr1, dg1, db1, dr2, dg2, db2;
> +
> +        if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
> +            const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
> +            const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
> +            dr1 = dg1 = d32[(i * 2 + 0) & 7];
> +            db1 =       d64[(i * 2 + 0) & 7];
> +            dr2 = dg2 = d32[(i * 2 + 1) & 7];
> +            db2 =       d64[(i * 2 + 1) & 7];
> +        } else {
> +            const uint8_t * const d64  = ff_dither_8x8_73 [y & 7];
> +            const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
> +            dr1 = db1 = d128[(i * 2 + 0) & 7];
> +            dg1 =        d64[(i * 2 + 0) & 7];
> +            dr2 = db2 = d128[(i * 2 + 1) & 7];
> +            dg2 =        d64[(i * 2 + 1) & 7];
> +        }
> +
> +        if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
> +            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
> +                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
> +        } else {
> +            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
> +            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
> +        }
> +    }
> +}
> +
> +#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4)    \
> +{                                                                      \
> +    Y1 = __lasx_xvpickve2gr_w(vec_y1, t1);                             \
> +    Y2 = __lasx_xvpickve2gr_w(vec_y2, t2);                             \
> +    U  = __lasx_xvpickve2gr_w(vec_u, t3);                              \
> +    V  = __lasx_xvpickve2gr_w(vec_v, t4);                              \
> +    r  =  c->table_rV[V];                                              \
> +    g  = (c->table_gU[U] + c->table_gV[V]);                            \
> +    b  =  c->table_bU[U];                                              \
> +    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                           \
> +                  r, g, b, y, target, 0);                              \
> +    count++;                                                           \
> +}
> +
> +static void
> +yuv2rgb_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
> +                        const int16_t **lumSrc, int lumFilterSize,
> +                        const int16_t *chrFilter, const int16_t **chrUSrc,
> +                        const int16_t **chrVSrc, int chrFilterSize,
> +                        const int16_t **alpSrc, uint8_t *dest, int dstW,
> +                        int y, enum AVPixelFormat target, int hasAlpha)
> +{
> +    int i, j;
> +    int count = 0;
> +    int t     = 1 << 18;
> +    int len   = dstW >> 6;
> +    int res   = dstW & 63;
> +    int len_count = (dstW + 1) >> 1;
> +    const void *r, *g, *b;
> +    int head = YUVRGB_TABLE_HEADROOM;
> +    __m256i headroom  = __lasx_xvreplgr2vr_w(head);
> +
> +    for (i = 0; i < len; i++) {
> +        int Y1, Y2, U, V, count_lum = count << 1;
> +        __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
> +        __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
> +        __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;
> +
> +        yl1_ev = __lasx_xvldrepl_w(&t, 0);
> +        yl1_od = yl1_ev;
> +        yh1_ev = yl1_ev;
> +        yh1_od = yl1_ev;
> +        u1_ev  = yl1_ev;
> +        v1_ev  = yl1_ev;
> +        u1_od  = yl1_ev;
> +        v1_od  = yl1_ev;
> +        yl2_ev = yl1_ev;
> +        yl2_od = yl1_ev;
> +        yh2_ev = yl1_ev;
> +        yh2_od = yl1_ev;
> +        u2_ev  = yl1_ev;
> +        v2_ev  = yl1_ev;
> +        u2_od  = yl1_ev;
> +        v2_od  = yl1_ev;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            int16_t *src_lum = lumSrc[j] + count_lum;
> +            temp    = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
> +                      src_lum, 96, l_src1, l_src2, l_src3, l_src4);
> +
> +            yl1_ev  = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1);
> +            yl1_od  = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1);
> +            yh1_ev  = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
> +            yh1_od  = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
> +            yl2_ev  = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
> +            yl2_od  = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
> +            yh2_ev  = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
> +            yh2_od  = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
> +                      u_src1, u_src2);
> +            DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
> +                      v_src1, v_src2);
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            u1_ev  = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
> +            u1_od  = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
> +            v1_ev  = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
> +            v1_od  = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
> +            u2_ev  = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
> +            u2_od  = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
> +            v2_ev  = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
> +            v2_od  = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);
> +        }
> +        yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
> +        yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
> +        yl1_od = __lasx_xvsrai_w(yl1_od, 19);
> +        yh1_od = __lasx_xvsrai_w(yh1_od, 19);
> +        u1_ev  = __lasx_xvsrai_w(u1_ev, 19);
> +        v1_ev  = __lasx_xvsrai_w(v1_ev, 19);
> +        u1_od  = __lasx_xvsrai_w(u1_od, 19);
> +        v1_od  = __lasx_xvsrai_w(v1_od, 19);
> +        yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
> +        yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
> +        yl2_od = __lasx_xvsrai_w(yl2_od, 19);
> +        yh2_od = __lasx_xvsrai_w(yh2_od, 19);
> +        u2_ev  = __lasx_xvsrai_w(u2_ev, 19);
> +        v2_ev  = __lasx_xvsrai_w(v2_ev, 19);
> +        u2_od  = __lasx_xvsrai_w(u2_od, 19);
> +        v2_od  = __lasx_xvsrai_w(v2_od, 19);
> +        u1_ev  = __lasx_xvadd_w(u1_ev, headroom);
> +        v1_ev  = __lasx_xvadd_w(v1_ev, headroom);
> +        u1_od  = __lasx_xvadd_w(u1_od, headroom);
> +        v1_od  = __lasx_xvadd_w(v1_od, headroom);
> +        u2_ev  = __lasx_xvadd_w(u2_ev, headroom);
> +        v2_ev  = __lasx_xvadd_w(v2_ev, headroom);
> +        u2_od  = __lasx_xvadd_w(u2_od, headroom);
> +        v2_od  = __lasx_xvadd_w(v2_od, headroom);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3);
> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7);
> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3);
> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7);
> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7);
> +    }
> +    if (res >= 32) {
> +        int Y1, Y2, U, V, count_lum = count << 1;
> +        __m256i l_src1, l_src2, u_src, v_src;
> +        __m256i yl_ev, yl_od, yh_ev, yh_od;
> +        __m256i u_ev, u_od, v_ev, v_od, temp;
> +
> +        yl_ev = __lasx_xvldrepl_w(&t, 0);
> +        yl_od = yl_ev;
> +        yh_ev = yl_ev;
> +        yh_od = yl_ev;
> +        u_ev  = yl_ev;
> +        v_ev  = yl_ev;
> +        u_od  = yl_ev;
> +        v_od  = yl_ev;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp   = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
> +                      32, l_src1, l_src2);
> +            yl_ev  = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
> +            yl_od  = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
> +            yh_ev  = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
> +            yh_od  = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
> +                      u_src, v_src);
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            u_ev  = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
> +            u_od  = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
> +            v_ev  = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
> +            v_od  = __lasx_xvmaddwod_w_h(v_od, temp, v_src);
> +        }
> +        yl_ev = __lasx_xvsrai_w(yl_ev, 19);
> +        yh_ev = __lasx_xvsrai_w(yh_ev, 19);
> +        yl_od = __lasx_xvsrai_w(yl_od, 19);
> +        yh_od = __lasx_xvsrai_w(yh_od, 19);
> +        u_ev  = __lasx_xvsrai_w(u_ev, 19);
> +        v_ev  = __lasx_xvsrai_w(v_ev, 19);
> +        u_od  = __lasx_xvsrai_w(u_od, 19);
> +        v_od  = __lasx_xvsrai_w(v_od, 19);
> +        u_ev  = __lasx_xvadd_w(u_ev, headroom);
> +        v_ev  = __lasx_xvadd_w(v_ev, headroom);
> +        u_od  = __lasx_xvadd_w(u_od, headroom);
> +        v_od  = __lasx_xvadd_w(v_od, headroom);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3);
> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7);
> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7);
> +        res -= 32;
> +    }
> +    if (res >= 16) {
> +        int Y1, Y2, U, V;
> +        int count_lum = count << 1;
> +        __m256i l_src, u_src, v_src;
> +        __m256i y_ev, y_od, u, v, temp;
> +
> +        y_ev = __lasx_xvldrepl_w(&t, 0);
> +        y_od = y_ev;
> +        u    = y_ev;
> +        v    = y_ev;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
> +            y_od  = __lasx_xvmaddwod_w_h(y_od, temp, l_src);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
> +                      0, u_src, v_src);
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            u_src = __lasx_vext2xv_w_h(u_src);
> +            v_src = __lasx_vext2xv_w_h(v_src);
> +            u     = __lasx_xvmaddwev_w_h(u, temp, u_src);
> +            v     = __lasx_xvmaddwev_w_h(v, temp, v_src);
> +        }
> +        y_ev = __lasx_xvsrai_w(y_ev, 19);
> +        y_od = __lasx_xvsrai_w(y_od, 19);
> +        u    = __lasx_xvsrai_w(u, 19);
> +        v    = __lasx_xvsrai_w(v, 19);
> +        u    = __lasx_xvadd_w(u, headroom);
> +        v    = __lasx_xvadd_w(v, headroom);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6);
> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7);
> +        res -= 16;
> +    }
> +    if (res >= 8) {
> +        int Y1, Y2, U, V;
> +        int count_lum = count << 1;
> +        __m256i l_src, u_src, v_src;
> +        __m256i y_ev, uv, temp;
> +
> +        y_ev = __lasx_xvldrepl_w(&t, 0);
> +        uv   = y_ev;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
> +            l_src = __lasx_vext2xv_w_h(l_src);
> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
> +            v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            u_src = __lasx_xvilvl_d(v_src, u_src);
> +            u_src = __lasx_vext2xv_w_h(u_src);
> +            uv    = __lasx_xvmaddwev_w_h(uv, temp, u_src);
> +        }
> +        y_ev = __lasx_xvsrai_w(y_ev, 19);
> +        uv   = __lasx_xvsrai_w(uv, 19);
> +        uv   = __lasx_xvadd_w(uv, headroom);
> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4);
> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5);
> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6);
> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7);
> +    }
> +    for (; count < len_count; count++) {
> +        int Y1 = 1 << 18;
> +        int Y2 = Y1;
> +        int U  = Y1;
> +        int V  = Y1;
> +
> +        for (j = 0; j < lumFilterSize; j++) {
> +            Y1 += lumSrc[j][count * 2]     * lumFilter[j];
> +            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            U += chrUSrc[j][count] * chrFilter[j];
> +            V += chrVSrc[j][count] * chrFilter[j];
> +        }
> +        Y1 >>= 19;
> +        Y2 >>= 19;
> +        U  >>= 19;
> +        V  >>= 19;
> +        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM];
> +        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
> +             c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
> +        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
> +
> +        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
> +                      r, g, b, y, target, 0);
> +    }
> +}
> +
> +static void
> +yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
> +                        const int16_t *ubuf[2], const int16_t *vbuf[2],
> +                        const int16_t *abuf[2], uint8_t *dest, int dstW,
> +                        int yalpha, int uvalpha, int y,
> +                        enum AVPixelFormat target, int hasAlpha)
> +{
> +    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
> +                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
> +                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
> +    int yalpha1   = 4096 - yalpha;
> +    int uvalpha1  = 4096 - uvalpha;
> +    int i, count  = 0;
> +    int len       = dstW - 15;
> +    int len_count = (dstW + 1) >> 1;
> +    const void *r, *g, *b;
> +    int head  = YUVRGB_TABLE_HEADROOM;
> +    __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
> +    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
> +    __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
> +    __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
> +    __m256i headroom   = __lasx_xvreplgr2vr_w(head);
> +
> +    for (i = 0; i < len; i += 16) {
> +        int Y1, Y2, U, V;
> +        int i_dex = i << 1;
> +        int c_dex = count << 1;
> +        __m256i y0_h, y0_l, y0, u0, v0;
> +        __m256i y1_h, y1_l, y1, u1, v1;
> +        __m256i y_l, y_h, u, v;
> +
> +        DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
> +                  buf1, i_dex, y0, u0, v0, y1);
> +        DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
> +        DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
> +        DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
> +        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
> +        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
> +        u0   = __lasx_xvmul_w(u0, v_uvalpha1);
> +        v0   = __lasx_xvmul_w(v0, v_uvalpha1);
> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
> +        y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
> +        u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
> +        v    = __lasx_xvmadd_w(v0, v_uvalpha, v1);
> +        y_l  = __lasx_xvsrai_w(y_l, 19);
> +        y_h  = __lasx_xvsrai_w(y_h, 19);
> +        u    = __lasx_xvsrai_w(u, 19);
> +        v    = __lasx_xvsrai_w(v, 19);
> +        u    = __lasx_xvadd_w(u, headroom);
> +        v    = __lasx_xvadd_w(v, headroom);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
> +        WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
> +        WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
> +        WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
> +        WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
> +    }
> +    if (dstW - i >= 8) {
> +        int Y1, Y2, U, V;
> +        int i_dex = i << 1;
> +        __m256i y0_l, y0, u0, v0;
> +        __m256i y1_l, y1, u1, v1;
> +        __m256i y_l, u, v;
> +
> +        y0   = __lasx_xvldx(buf0, i_dex);

1. Not long ago, I tried to constify the src pointer of several asm
functions and noticed that they produced new warnings for loongarch
(according to patchwork:
https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/),
even though I was sure that the code is const-correct. After finding
(via https://github.com/opencv/opencv/pull/21833) a toolchain
(https://gitee.com/wenux/cross-compiler-la-on-x86) that can build the
lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support
at the moment; at least, my self-compiled loongarch-GCC did not support
lsx and lasx), the issue was clear: lsxintrin.h and lasxintrin.h do not
use const at all, even for functions that only read data (I presume the
vl in __lsx_vldx stands for "vector load"?).
So I sent another iteration of that patchset
(https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html)
that added wrappers for __lsx_vldx() and __lasx_xvldx(), and cc'ed you
and some other developers from loongson to alert you of the issue, in
the hope that you would fix the headers so that my wrappers wouldn't
need to be applied. That didn't work, as my mails could not be delivered
to you. So I applied the patchset.
2. You use __lasx_xvldx() to read through a const int16_t pointer. This
will give new warnings unless the above issue has been fixed. Has it?
3. I don't know whether it has, as patchwork's FATE tests have not been
working for a few days. Given that the mails I receive from patchwork
when it doesn't like a commit message arrive from "Patchwork
<yinshiyou-hf@loongson.cn>", I presume that loongson is now somehow
running patchwork, so you should be able to inform the right people to
fix it.
4. If you fixed the const-issue, can you please make an updated
toolchain with lsx and lasx support enabled available to us?

- Andreas

> +        u0   = __lasx_xvldrepl_d((ubuf0 + count), 0);
> +        v0   = __lasx_xvldrepl_d((vbuf0 + count), 0);
> +        y1   = __lasx_xvldx(buf1, i_dex);
> +        u1   = __lasx_xvldrepl_d((ubuf1 + count), 0);
> +        v1   = __lasx_xvldrepl_d((vbuf1 + count), 0);
> +        DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
> +        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
> +        u0   = __lasx_xvmul_w(u0, v_uvalpha1);
> +        v0   = __lasx_xvmul_w(v0, v_uvalpha1);
> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
> +        u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
> +        v    = __lasx_xvmadd_w(v0, v_uvalpha, v1);
> +        y_l  = __lasx_xvsrai_w(y_l, 19);
> +        u    = __lasx_xvsrai_w(u, 19);
> +        v    = __lasx_xvsrai_w(v, 19);
> +        u    = __lasx_xvadd_w(u, headroom);
> +        v    = __lasx_xvadd_w(v, headroom);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 2, 2);
> +        WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 3, 3);
> +        i += 8;
> +    }
> +    for (; count < len_count; count++) {
> +        int Y1 = (buf0[count * 2]     * yalpha1  +
> +                  buf1[count * 2]     * yalpha)  >> 19;
> +        int Y2 = (buf0[count * 2 + 1] * yalpha1  +
> +                  buf1[count * 2 + 1] * yalpha) >> 19;
> +        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
> +        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
> +
> +        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
> +        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
> +             c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
> +        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
> +
> +        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
> +                      r, g, b, y, target, 0);
> +    }
> +}
> +
> +static void
> +yuv2rgb_1_template_lasx(SwsContext *c, const int16_t *buf0,
> +                        const int16_t *ubuf[2], const int16_t *vbuf[2],
> +                        const int16_t *abuf0, uint8_t *dest, int dstW,
> +                        int uvalpha, int y, enum AVPixelFormat target,
> +                        int hasAlpha)
> +{
> +    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
> +    int i;
> +    int len       = (dstW - 15);
> +    int len_count = (dstW + 1) >> 1;
> +    const void *r, *g, *b;
> +
> +    if (uvalpha < 2048) {
> +        int count    = 0;
> +        int head = YUVRGB_TABLE_HEADROOM;
> +        __m256i headroom  = __lasx_xvreplgr2vr_h(head);
> +
> +        for (i = 0; i < len; i += 16) {
> +            int Y1, Y2, U, V;
> +            int i_dex = i << 1;
> +            int c_dex = count << 1;
> +            __m256i src_y, src_u, src_v;
> +            __m256i u, v, y_l, y_h;
> +
> +            DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
> +            src_v = __lasx_xvldx(vbuf0, c_dex);
> +            src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
> +            src_y = __lasx_xvsrari_h(src_y, 7);
> +            src_u = __lasx_xvsrari_h(src_u, 7);
> +            y_l   = __lasx_xvsllwil_w_h(src_y, 0);
> +            y_h   = __lasx_xvexth_w_h(src_y);
> +            u     = __lasx_xvaddwev_w_h(src_u, headroom);
> +            v     = __lasx_xvaddwod_w_h(src_u, headroom);
> +            WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
> +            WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
> +            WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
> +            WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
> +            WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
> +            WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
> +            WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
> +            WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
> +        }
> +        if (dstW - i >= 8){
> +            int Y1, Y2, U, V;
> +            int i_dex = i << 1;
> +            __m256i src_y, src_u, src_v;
> +            __m256i y_l, uv;
> +
> +            src_y  = __lasx_xvldx(buf0, i_dex);
> +            src_u  = __lasx_xvldrepl_d((ubuf0 + count), 0);
> +            src_v  = __lasx_xvldrepl_d((vbuf0 + count), 0);
> +            src_u  = __lasx_xvilvl_d(src_v, src_u);
> +            y_l    = __lasx_xvsrari_h(src_y, 7);
> +            uv     = __lasx_xvsrari_h(src_u, 7);
> +            y_l    = __lasx_vext2xv_w_h(y_l);
> +            uv     = __lasx_vext2xv_w_h(uv);
> +            uv     = __lasx_xvaddwev_w_h(uv, headroom);
> +            WRITE_YUV2RGB(y_l, y_l, uv, uv, 0, 1, 0, 4);
> +            WRITE_YUV2RGB(y_l, y_l, uv, uv, 2, 3, 1, 5);
> +            WRITE_YUV2RGB(y_l, y_l, uv, uv, 4, 5, 2, 6);
> +            WRITE_YUV2RGB(y_l, y_l, uv, uv, 6, 7, 3, 7);
> +            i += 8;
> +        }
> +        for (; count < len_count; count++) {
> +            int Y1 = (buf0[count * 2    ] + 64) >> 7;
> +            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
> +            int U  = (ubuf0[count]        + 64) >> 7;
> +            int V  = (vbuf0[count]        + 64) >> 7;
> +
> +            r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
> +            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
> +                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
> +            b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
> +
> +            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
> +                          r, g, b, y, target, 0);
> +        }
> +    } else {
> +        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
> +        int count = 0;
> +        int HEADROOM = YUVRGB_TABLE_HEADROOM;
> +        __m256i headroom    = __lasx_xvreplgr2vr_w(HEADROOM);
> +
> +        for (i = 0; i < len; i += 16) {
> +            int Y1, Y2, U, V;
> +            int i_dex = i << 1;
> +            int c_dex = count << 1;
> +            __m256i src_y, src_u0, src_v0, src_u1, src_v1;
> +            __m256i y_l, y_h, u, v;
> +
> +            DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
> +                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
> +            src_v1 = __lasx_xvldx(vbuf1, c_dex);
> +            src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
> +            src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
> +            src_y  = __lasx_xvsrari_h(src_y, 7);
> +            u      = __lasx_xvaddwev_w_h(src_u0, src_u1);
> +            v      = __lasx_xvaddwod_w_h(src_u0, src_u1);
> +            y_l    = __lasx_xvsllwil_w_h(src_y, 0);
> +            y_h    = __lasx_xvexth_w_h(src_y);
> +            u      = __lasx_xvsrari_w(u, 8);
> +            v      = __lasx_xvsrari_w(v, 8);
> +            u      = __lasx_xvadd_w(u, headroom);
> +            v      = __lasx_xvadd_w(v, headroom);
> +            WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
> +            WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
> +            WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
> +            WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
> +            WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
> +            WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
> +            WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
> +            WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
> +        }
> +        if (dstW - i >= 8) {
> +            int Y1, Y2, U, V;
> +            int i_dex = i << 1;
> +            __m256i src_y, src_u0, src_v0, src_u1, src_v1;
> +            __m256i uv;
> +
> +            src_y  = __lasx_xvldx(buf0, i_dex);
> +            src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
> +            src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
> +            src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
> +            src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
> +
> +            src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
> +            src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
> +            src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
> +            src_y  = __lasx_xvsrari_h(src_y, 7);
> +            uv     = __lasx_xvhaddw_w_h(src_u0, src_u0);
> +            src_y  = __lasx_vext2xv_w_h(src_y);
> +            uv     = __lasx_xvsrari_w(uv, 8);
> +            uv     = __lasx_xvadd_w(uv, headroom);
> +            WRITE_YUV2RGB(src_y, src_y, uv, uv, 0, 1, 0, 4);
> +            WRITE_YUV2RGB(src_y, src_y, uv, uv, 2, 3, 1, 5);
> +            WRITE_YUV2RGB(src_y, src_y, uv, uv, 4, 5, 2, 6);
> +            WRITE_YUV2RGB(src_y, src_y, uv, uv, 6, 7, 3, 7);
> +            i += 8;
> +        }
> +        for (; count < len_count; count++) {
> +            int Y1 = (buf0[count * 2    ]         +  64) >> 7;
> +            int Y2 = (buf0[count * 2 + 1]         +  64) >> 7;
> +            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
> +            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;
> +
> +            r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
> +            g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
> +                 c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
> +            b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
> +
> +            yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
> +                          r, g, b, y, target, 0);
> +        }
> +    }
> +}
> +
> +/*
> + * Emit <prefix><suffix>_X_lasx(): a wrapper with the fixed N-tap output
> + * signature that forwards all of its arguments to
> + * <prefix><tmpl>_X_template_lasx() and appends the pixel format and the
> + * alpha flag chosen at instantiation time.
> + */
> +#define YUV2RGBWRAPPERX(prefix, tmpl, suffix, pixfmt, alpha)                  \
> +static void prefix ## suffix ## _X_lasx(SwsContext *c,                        \
> +                                        const int16_t *lumFilter,             \
> +                                        const int16_t **lumSrc,               \
> +                                        int lumFilterSize,                    \
> +                                        const int16_t *chrFilter,             \
> +                                        const int16_t **chrUSrc,              \
> +                                        const int16_t **chrVSrc,              \
> +                                        int chrFilterSize,                    \
> +                                        const int16_t **alpSrc,               \
> +                                        uint8_t *dest, int dstW, int y)       \
> +{                                                                             \
> +    prefix ## tmpl ## _X_template_lasx(c, lumFilter, lumSrc, lumFilterSize,   \
> +                                       chrFilter, chrUSrc, chrVSrc,           \
> +                                       chrFilterSize, alpSrc, dest, dstW, y,  \
> +                                       pixfmt, alpha);                        \
> +}
> +
> +/*
> + * Emit the _X wrapper through YUV2RGBWRAPPERX() and, in addition,
> + * <prefix><suffix>_2_lasx(), which forwards to
> + * <prefix><tmpl>_2_template_lasx() with the pixel format and alpha flag
> + * appended.
> + */
> +#define YUV2RGBWRAPPERX2(prefix, tmpl, suffix, pixfmt, alpha)                 \
> +YUV2RGBWRAPPERX(prefix, tmpl, suffix, pixfmt, alpha)                          \
> +static void prefix ## suffix ## _2_lasx(SwsContext *c,                        \
> +                                        const int16_t *buf[2],                \
> +                                        const int16_t *ubuf[2],               \
> +                                        const int16_t *vbuf[2],               \
> +                                        const int16_t *abuf[2],               \
> +                                        uint8_t *dest, int dstW,              \
> +                                        int yalpha, int uvalpha, int y)       \
> +{                                                                             \
> +    prefix ## tmpl ## _2_template_lasx(c, buf, ubuf, vbuf, abuf, dest, dstW,  \
> +                                       yalpha, uvalpha, y, pixfmt, alpha);    \
> +}
> +
> +/*
> + * Emit the full wrapper set for one destination format: the _X and _2
> + * wrappers through YUV2RGBWRAPPERX2() plus <prefix><suffix>_1_lasx(), which
> + * forwards to <prefix><tmpl>_1_template_lasx() with the pixel format and
> + * alpha flag appended.
> + */
> +#define YUV2RGBWRAPPER(prefix, tmpl, suffix, pixfmt, alpha)                   \
> +YUV2RGBWRAPPERX2(prefix, tmpl, suffix, pixfmt, alpha)                         \
> +static void prefix ## suffix ## _1_lasx(SwsContext *c,                        \
> +                                        const int16_t *buf0,                  \
> +                                        const int16_t *ubuf[2],               \
> +                                        const int16_t *vbuf[2],               \
> +                                        const int16_t *abuf0,                 \
> +                                        uint8_t *dest, int dstW,              \
> +                                        int uvalpha, int y)                   \
> +{                                                                             \
> +    prefix ## tmpl ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0, dest,      \
> +                                       dstW, uvalpha, y, pixfmt, alpha);      \
> +}
> +
> +
> +/*
> + * Instantiate the LASX output wrappers. Each YUV2RGBWRAPPER() line defines
> + * the _X, _2 and _1 entry points for one destination format, all with the
> + * alpha flag set to 0. The two 32-bit variants are only compiled when
> + * CONFIG_SMALL is off.
> + */
> +#if !CONFIG_SMALL
> +YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1, 0)
> +YUV2RGBWRAPPER(yuv2rgb,, x32, AV_PIX_FMT_RGB32, 0)
> +#endif
> +YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
> +YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
> +YUV2RGBWRAPPER(yuv2rgb,, 16, AV_PIX_FMT_RGB565, 0)
> +YUV2RGBWRAPPER(yuv2rgb,, 15, AV_PIX_FMT_RGB555, 0)
> +YUV2RGBWRAPPER(yuv2rgb,, 12, AV_PIX_FMT_RGB444, 0)
> +YUV2RGBWRAPPER(yuv2rgb,, 8, AV_PIX_FMT_RGB8, 0)
> +YUV2RGBWRAPPER(yuv2rgb,, 4, AV_PIX_FMT_RGB4, 0)
> +YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0)
> +/**
> + * Store one RGB pixel produced by the full-chroma converters.
> + *
> + * If any of R, G, B has one of its two top bits set, all three are passed
> + * through av_clip_uintp2(x, 30). For the packed 24/32-bit targets each
> + * channel byte is (value >> 22) and the alpha byte is A when hasAlpha is
> + * set, 255 otherwise. For the 4- and 8-bit targets the channels are
> + * quantised according to c->dither (error diffusion for the default, AUTO
> + * and ED modes, or the position-dependent A_DITHER / X_DITHER patterns) and
> + * packed into dest[0].
> + *
> + * @param c        context; c->dither selects the mode, c->dither_error[0..2]
> + *                 is read at i, i+1, i+2 and written at i (error diffusion)
> + * @param dest     where this pixel is stored; the pointer is not advanced
> + * @param i        pixel index within the row
> + * @param A        alpha value, only stored when hasAlpha is non-zero
> + * @param y        row index, only used by the A_DITHER / X_DITHER patterns
> + * @param target   destination pixel format; formats without a case below
> + *                 are silently ignored
> + * @param err      running per-channel error of the current row (error
> + *                 diffusion only); updated in place, only err[0..2] are used
> + */
> +// This function is copied from libswscale/output.c
> +static av_always_inline void yuv2rgb_write_full(SwsContext *c,
> +    uint8_t *dest, int i, int R, int A, int G, int B,
> +    int y, enum AVPixelFormat target, int hasAlpha, int err[4])
> +{
> +    int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
> +
> +    if ((R | G | B) & 0xC0000000) { /* some channel does not fit in 30 bits */
> +        R = av_clip_uintp2(R, 30);
> +        G = av_clip_uintp2(G, 30);
> +        B = av_clip_uintp2(B, 30);
> +    }
> +
> +    switch(target) {
> +    case AV_PIX_FMT_ARGB:
> +        dest[0] = hasAlpha ? A : 255;
> +        dest[1] = R >> 22;
> +        dest[2] = G >> 22;
> +        dest[3] = B >> 22;
> +        break;
> +    case AV_PIX_FMT_RGB24:
> +        dest[0] = R >> 22;
> +        dest[1] = G >> 22;
> +        dest[2] = B >> 22;
> +        break;
> +    case AV_PIX_FMT_RGBA:
> +        dest[0] = R >> 22;
> +        dest[1] = G >> 22;
> +        dest[2] = B >> 22;
> +        dest[3] = hasAlpha ? A : 255;
> +        break;
> +    case AV_PIX_FMT_ABGR:
> +        dest[0] = hasAlpha ? A : 255;
> +        dest[1] = B >> 22;
> +        dest[2] = G >> 22;
> +        dest[3] = R >> 22;
> +        break;
> +    case AV_PIX_FMT_BGR24:
> +        dest[0] = B >> 22;
> +        dest[1] = G >> 22;
> +        dest[2] = R >> 22;
> +        break;
> +    case AV_PIX_FMT_BGRA:
> +        dest[0] = B >> 22;
> +        dest[1] = G >> 22;
> +        dest[2] = R >> 22;
> +        dest[3] = hasAlpha ? A : 255;
> +        break;
> +    case AV_PIX_FMT_BGR4_BYTE:
> +    case AV_PIX_FMT_RGB4_BYTE:
> +    case AV_PIX_FMT_BGR8:
> +    case AV_PIX_FMT_RGB8:
> +    {
> +        int r,g,b; /* quantised channel levels that get packed into dest[0] */
> +
> +        switch (c->dither) {
> +        default:
> +        case SWS_DITHER_AUTO:
> +        case SWS_DITHER_ED:
> +            R >>= 22;
> +            G >>= 22;
> +            B >>= 22;
> +            R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
> +            G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
> +            B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
> +            c->dither_error[0][i] = err[0]; /* slot i was consumed above; store the error carried in from the caller */
> +            c->dither_error[1][i] = err[1];
> +            c->dither_error[2][i] = err[2];
> +            r = R >> (isrgb8 ? 5 : 7); /* 3 bits for the 8-bit formats, 1 bit for the 4-bit ones */
> +            g = G >> (isrgb8 ? 5 : 6); /* 3 bits / 2 bits */
> +            b = B >> (isrgb8 ? 6 : 7); /* 2 bits / 1 bit */
> +            r = av_clip(r, 0, isrgb8 ? 7 : 1);
> +            g = av_clip(g, 0, isrgb8 ? 7 : 3);
> +            b = av_clip(b, 0, isrgb8 ? 3 : 1);
> +            err[0] = R - r*(isrgb8 ? 36 : 255); /* new error: wanted value minus the scaled quantised level */
> +            err[1] = G - g*(isrgb8 ? 36 : 85);
> +            err[2] = B - b*(isrgb8 ? 85 : 255);
> +            break;
> +        case SWS_DITHER_A_DITHER:
> +            if (isrgb8) {
> +  /* see http://pippin.gimp.org/a_dither/ for details/origin */
> +#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))
> +                r = (((R >> 19) + A_DITHER(i,y)  -96)>>8);
> +                g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
> +                b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
> +                r = av_clip_uintp2(r, 3);
> +                g = av_clip_uintp2(g, 3);
> +                b = av_clip_uintp2(b, 2);
> +            } else {
> +                r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
> +                g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
> +                b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
> +                r = av_clip_uintp2(r, 1);
> +                g = av_clip_uintp2(g, 2);
> +                b = av_clip_uintp2(b, 1);
> +            }
> +            break;
> +        case SWS_DITHER_X_DITHER:
> +            if (isrgb8) {
> +  /* see http://pippin.gimp.org/a_dither/ for details/origin */
> +#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
> +                r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
> +                g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
> +                b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
> +                r = av_clip_uintp2(r, 3);
> +                g = av_clip_uintp2(g, 3);
> +                b = av_clip_uintp2(b, 2);
> +            } else {
> +                r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
> +                g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
> +                b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
> +                r = av_clip_uintp2(r, 1);
> +                g = av_clip_uintp2(g, 2);
> +                b = av_clip_uintp2(b, 1);
> +            }
> +
> +            break;
> +        }
> +
> +        if(target == AV_PIX_FMT_BGR4_BYTE) {
> +            dest[0] = r + 2*g + 8*b;
> +        } else if(target == AV_PIX_FMT_RGB4_BYTE) {
> +            dest[0] = b + 2*g + 8*r;
> +        } else if(target == AV_PIX_FMT_BGR8) {
> +            dest[0] = r + 8*g + 64*b;
> +        } else if(target == AV_PIX_FMT_RGB8) {
> +            dest[0] = b + 4*g + 32*r;
> +        } else
> +            av_assert2(0);
> +        break; }
> +    }
> +}
> +
> +/*
> + * Declare, in the enclosing function, scalar copies of the context's
> + * YUV->RGB coefficients plus one 256-bit vector per coefficient built from
> + * that scalar with __lasx_xvreplgr2vr_w(). Requires a SwsContext pointer
> + * named `c` in scope. The scalars (y_offset, y_coeff, v2r_coe, v2g_coe,
> + * u2g_coe, u2b_coe) are used by the scalar tail loops, the vectors (offset,
> + * coeff, v2r, v2g, u2g, u2b) are the operands expected by YUVTORGB().
> + *
> + * The final line deliberately carries no line-continuation, so the macro
> + * ends here regardless of what follows it in the file.
> + */
> +#define YUVTORGB_SETUP                                           \
> +    int y_offset   = c->yuv2rgb_y_offset;                        \
> +    int y_coeff    = c->yuv2rgb_y_coeff;                         \
> +    int v2r_coe    = c->yuv2rgb_v2r_coeff;                       \
> +    int v2g_coe    = c->yuv2rgb_v2g_coeff;                       \
> +    int u2g_coe    = c->yuv2rgb_u2g_coeff;                       \
> +    int u2b_coe    = c->yuv2rgb_u2b_coeff;                       \
> +    __m256i offset = __lasx_xvreplgr2vr_w(y_offset);             \
> +    __m256i coeff  = __lasx_xvreplgr2vr_w(y_coeff);              \
> +    __m256i v2r    = __lasx_xvreplgr2vr_w(v2r_coe);              \
> +    __m256i v2g    = __lasx_xvreplgr2vr_w(v2g_coe);              \
> +    __m256i u2g    = __lasx_xvreplgr2vr_w(u2g_coe);              \
> +    __m256i u2b    = __lasx_xvreplgr2vr_w(u2b_coe);
> +
> +
> +#define YUVTORGB(y, u, v, R, G, B, offset, coeff,              \
> +                 y_temp, v2r, v2g, u2g, u2b)                   \
> +{                                                              \
> +     y = __lasx_xvsub_w(y, offset);                            \
> +     y = __lasx_xvmul_w(y, coeff);                             \
> +     y = __lasx_xvadd_w(y, y_temp);                            \
> +     R = __lasx_xvmadd_w(y, v, v2r);                           \
> +     v = __lasx_xvmadd_w(y, v, v2g);                           \
> +     G = __lasx_xvmadd_w(v, u, u2g);                           \
> +     B = __lasx_xvmadd_w(y, u, u2b);                           \
> +}
> +
> +#define WRITE_FULL_A(r, g, b, a, t1, s)                                      \
> +{                                                                            \
> +    R = __lasx_xvpickve2gr_w(r, t1);                                         \
> +    G = __lasx_xvpickve2gr_w(g, t1);                                         \
> +    B = __lasx_xvpickve2gr_w(b, t1);                                         \
> +    A = __lasx_xvpickve2gr_w(a, t1);                                         \
> +    if (A & 0x100)                                                           \
> +        A = av_clip_uint8(A);                                                \
> +    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\
> +    dest += step;                                                            \
> +}
> +
> +#define WRITE_FULL(r, g, b, t1, s)                                            \
> +{                                                                             \
> +    R = __lasx_xvpickve2gr_w(r, t1);                                          \
> +    G = __lasx_xvpickve2gr_w(g, t1);                                          \
> +    B = __lasx_xvpickve2gr_w(b, t1);                                          \
> +    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
> +    dest += step;                                                             \
> +}
> +/**
> + * Full-chroma RGB output for the N-tap vertical filter case (LASX).
> + *
> + * For every output pixel i the luma value is the sum over lumFilterSize taps
> + * of lumSrc[j][i] * lumFilter[j]; U and V are the sums over chrFilterSize
> + * taps of chrUSrc[j][i] / chrVSrc[j][i] times chrFilter[j]; with hasAlpha
> + * set, alpha is filtered from alpSrc with the luma taps. The sums are
> + * converted to R/G/B with the context coefficients loaded by YUVTORGB_SETUP
> + * and stored one pixel at a time through yuv2rgb_write_full().
> + *
> + * The row is processed in three stages: blocks of 16 pixels, at most one
> + * block of 8 pixels, then a scalar loop for whatever remains.
> + * NOTE(review): the 8-pixel stage uses the same __lasx_xvldx loads as the
> + * 16-pixel stage; presumably the source rows are padded far enough for
> + * that - confirm against the buffer allocation.
> + *
> + * @param dest    output row; advanced by `step` bytes per pixel
> + * @param dstW    number of pixels to produce
> + * @param y       row index, passed on to yuv2rgb_write_full()
> + * @param target  destination pixel format
> + */
> +static void
> +yuv2rgb_full_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
> +                             const int16_t **lumSrc, int lumFilterSize,
> +                             const int16_t *chrFilter, const int16_t **chrUSrc,
> +                             const int16_t **chrVSrc, int chrFilterSize,
> +                             const int16_t **alpSrc, uint8_t *dest,
> +                             int dstW, int y, enum AVPixelFormat target,
> +                             int hasAlpha)
> +{
> +    int i, j, B, G, R, A;
> +    int step       = (target == AV_PIX_FMT_RGB24 ||
> +                      target == AV_PIX_FMT_BGR24) ? 3 : 4; /* bytes written per pixel */
> +    int err[4]     = {0}; /* per-row error state handed to yuv2rgb_write_full() */
> +    int a_temp     = 1 << 18; /* start value of the alpha sum (shifted right by 19 below) */
> +    int templ      = 1 << 9; /* start value of the luma sum (shifted right by 10 below) */
> +    int tempc      = templ - (128 << 19); /* start value of both chroma sums */
> +    int ytemp      = 1 << 21; /* constant added to the scaled luma before the colour matrix */
> +    int len        = dstW - 15; /* the 16-wide loop runs while at least 16 pixels remain */
> +    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
> +    YUVTORGB_SETUP
> +
> +    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
> +       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
> +        step = 1;
> +
> +    for (i = 0; i < len; i += 16) { /* stage 1: 16 pixels per iteration */
> +        __m256i l_src, u_src, v_src;
> +        __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
> +        __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
> +        int n = i << 1; /* offset handed to __lasx_xvldx: twice the pixel index (rows are int16_t) */
> +
> +        y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
> +        u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
> +        for (j = 0; j < lumFilterSize; j++) { /* the _ev / _od sums are written out as even / odd pixels below */
> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            l_src = __lasx_xvldx(lumSrc[j], n);
> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
> +            y_od  = __lasx_xvmaddwod_w_h(y_od, l_src, temp);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
> +                      u_src, v_src);
> +            DUP2_ARG3(__lasx_xvmaddwev_w_h, u_ev, u_src, temp, v_ev,
> +                      v_src, temp, u_ev, v_ev);
> +            DUP2_ARG3(__lasx_xvmaddwod_w_h, u_od, u_src, temp, v_od,
> +                      v_src, temp, u_od, v_od);
> +        }
> +        y_ev = __lasx_xvsrai_w(y_ev, 10);
> +        y_od = __lasx_xvsrai_w(y_od, 10);
> +        u_ev = __lasx_xvsrai_w(u_ev, 10);
> +        u_od = __lasx_xvsrai_w(u_od, 10);
> +        v_ev = __lasx_xvsrai_w(v_ev, 10);
> +        v_od = __lasx_xvsrai_w(v_od, 10);
> +        YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +        YUVTORGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +
> +        if (hasAlpha) {
> +            __m256i a_src, a_ev, a_od;
> +
> +            a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
> +            for (j = 0; j < lumFilterSize; j++) { /* alpha is filtered with the luma taps */
> +                temp  = __lasx_xvldrepl_h(lumFilter + j, 0);
> +                a_src = __lasx_xvldx(alpSrc[j], n);
> +                a_ev  = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
> +                a_od  = __lasx_xvmaddwod_w_h(a_od, a_src, temp);
> +            }
> +            a_ev = __lasx_xvsrai_w(a_ev, 19);
> +            a_od = __lasx_xvsrai_w(a_od, 19);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0); /* lane k of _ev is pixel 2k, lane k of _od is pixel 2k+1 */
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
> +            WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
> +        } else {
> +            WRITE_FULL(R_ev, G_ev, B_ev, 0, 0); /* same even/odd interleave as the alpha branch */
> +            WRITE_FULL(R_od, G_od, B_od, 0, 1);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
> +            WRITE_FULL(R_od, G_od, B_od, 1, 3);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
> +            WRITE_FULL(R_od, G_od, B_od, 2, 5);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
> +            WRITE_FULL(R_od, G_od, B_od, 3, 7);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
> +            WRITE_FULL(R_od, G_od, B_od, 4, 9);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
> +            WRITE_FULL(R_od, G_od, B_od, 5, 11);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
> +            WRITE_FULL(R_od, G_od, B_od, 6, 13);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
> +            WRITE_FULL(R_od, G_od, B_od, 7, 15);
> +        }
> +    }
> +    if (dstW - i >= 8) { /* stage 2: one block of 8 pixels, lanes written in order 0..7 */
> +        __m256i l_src, u_src, v_src;
> +        __m256i y_ev, u_ev, v_ev, uv, temp;
> +        __m256i R_ev, G_ev, B_ev;
> +        int n = i << 1;
> +
> +        y_ev = __lasx_xvreplgr2vr_w(templ);
> +        u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
> +        for (j = 0; j < lumFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
> +            l_src = __lasx_xvldx(lumSrc[j], n);
> +            l_src = __lasx_xvpermi_d(l_src, 0xD8); /* NOTE(review): permute + self-interleave presumably put samples 0..7 into the even 16-bit slots - confirm */
> +            l_src = __lasx_xvilvl_h(l_src, l_src);
> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
> +            DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
> +            u_src = __lasx_xvpermi_d(u_src, 0xD8);
> +            v_src = __lasx_xvpermi_d(v_src, 0xD8);
> +            uv    = __lasx_xvilvl_h(v_src, u_src); /* U and V interleaved: u_ev takes the even products, v_ev the odd ones */
> +            u_ev  = __lasx_xvmaddwev_w_h(u_ev, uv, temp);
> +            v_ev  = __lasx_xvmaddwod_w_h(v_ev, uv, temp);
> +        }
> +        y_ev = __lasx_xvsrai_w(y_ev, 10);
> +        u_ev = __lasx_xvsrai_w(u_ev, 10);
> +        v_ev = __lasx_xvsrai_w(v_ev, 10);
> +        YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +
> +        if (hasAlpha) {
> +            __m256i a_src, a_ev;
> +
> +            a_ev = __lasx_xvreplgr2vr_w(a_temp);
> +            for (j = 0; j < lumFilterSize; j++) {
> +                temp  = __lasx_xvldrepl_h(lumFilter + j, 0);
> +                a_src = __lasx_xvldx(alpSrc[j], n);
> +                a_src = __lasx_xvpermi_d(a_src, 0xD8);
> +                a_src = __lasx_xvilvl_h(a_src, a_src);
> +                a_ev  =  __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
> +            }
> +            a_ev = __lasx_xvsrai_w(a_ev, 19);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 1);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 2);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 3);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 4);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 5);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 6);
> +            WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 7);
> +        } else {
> +            WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 1, 1);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 2, 2);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 3, 3);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 4, 4);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 5, 5);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 6, 6);
> +            WRITE_FULL(R_ev, G_ev, B_ev, 7, 7);
> +        }
> +        i += 8;
> +    }
> +    for (; i < dstW; i++) { /* stage 3: scalar loop for the remaining pixels */
> +        int Y = templ;
> +        int V, U = V = tempc;
> +
> +        A = 0;
> +        for (j = 0; j < lumFilterSize; j++) {
> +            Y += lumSrc[j][i] * lumFilter[j];
> +        }
> +        for (j = 0; j < chrFilterSize; j++) {
> +            U += chrUSrc[j][i] * chrFilter[j];
> +            V += chrVSrc[j][i] * chrFilter[j];
> +
> +        }
> +        Y >>= 10;
> +        U >>= 10;
> +        V >>= 10;
> +        if (hasAlpha) {
> +            A = 1 << 18;
> +            for (j = 0; j < lumFilterSize; j++) {
> +                A += alpSrc[j][i] * lumFilter[j];
> +            }
> +            A >>= 19;
> +            if (A & 0x100)
> +                A = av_clip_uint8(A);
> +        }
> +        Y -= y_offset;
> +        Y *= y_coeff;
> +        Y += ytemp;
> +        R  = (unsigned)Y + V * v2r_coe; /* unsigned arithmetic: wrap-around instead of signed overflow */
> +        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
> +        B  = (unsigned)Y + U * u2b_coe;
> +        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
> +        dest += step;
> +    }
> +    c->dither_error[0][i] = err[0]; /* i == dstW here: store the error left after the last pixel */
> +    c->dither_error[1][i] = err[1];
> +    c->dither_error[2][i] = err[2];
> +}
> +/**
> + * Full-chroma RGB output for the two-row blend case (LASX).
> + *
> + * Every sample is the weighted sum of two source rows: luma (and alpha, when
> + * hasAlpha is set) uses buf[0] * (4096 - yalpha) + buf[1] * yalpha, chroma
> + * uses ubuf / vbuf with (4096 - uvalpha) and uvalpha. The blended values are
> + * converted to R/G/B with the context coefficients loaded by YUVTORGB_SETUP
> + * and stored one pixel at a time through yuv2rgb_write_full().
> + *
> + * The row is processed in three stages: blocks of 16 pixels, at most one
> + * block of 8 pixels, then a scalar loop for whatever remains.
> + * NOTE(review): the 8-pixel stage uses the same __lasx_xvldx loads as the
> + * 16-pixel stage; presumably the source rows are padded far enough for
> + * that - confirm against the buffer allocation.
> + *
> + * @param abuf    only dereferenced when hasAlpha is non-zero
> + * @param dest    output row; advanced by `step` bytes per pixel
> + * @param dstW    number of pixels to produce
> + * @param yalpha  weight of buf[1], asserted (av_assert2) to be <= 4096
> + * @param uvalpha weight of ubuf[1] / vbuf[1], asserted to be <= 4096
> + * @param y       row index, passed on to yuv2rgb_write_full()
> + */
> +static void
> +yuv2rgb_full_2_template_lasx(SwsContext *c, const int16_t *buf[2],
> +                             const int16_t *ubuf[2], const int16_t *vbuf[2],
> +                             const int16_t *abuf[2], uint8_t *dest, int dstW,
> +                             int yalpha, int uvalpha, int y,
> +                             enum AVPixelFormat target, int hasAlpha)
> +{
> +    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
> +                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
> +                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
> +                  *abuf0 = hasAlpha ? abuf[0] : NULL,
> +                  *abuf1 = hasAlpha ? abuf[1] : NULL;
> +    int yalpha1  = 4096 - yalpha; /* weight of row 0; yalpha is the weight of row 1 */
> +    int uvalpha1 = 4096 - uvalpha;
> +    int uvtemp   = 128 << 19; /* subtracted from the blended chroma before the >> 10 */
> +    int atemp    = 1 << 18; /* added to the blended alpha before the >> 19 */
> +    int err[4]   = {0}; /* per-row error state handed to yuv2rgb_write_full() */
> +    int ytemp    = 1 << 21; /* constant added to the scaled luma before the colour matrix */
> +    int len      = dstW - 15; /* the 16-wide loop runs while at least 16 pixels remain */
> +    int i, R, G, B, A;
> +    int step = (target == AV_PIX_FMT_RGB24 ||
> +                target == AV_PIX_FMT_BGR24) ? 3 : 4; /* bytes written per pixel */
> +    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
> +    __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
> +    __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
> +    __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
> +    __m256i uv         = __lasx_xvreplgr2vr_w(uvtemp);
> +    __m256i a_bias     = __lasx_xvreplgr2vr_w(atemp);
> +    __m256i y_temp     = __lasx_xvreplgr2vr_w(ytemp);
> +    YUVTORGB_SETUP
> +
> +    av_assert2(yalpha  <= 4096U);
> +    av_assert2(uvalpha <= 4096U);
> +
> +    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
> +       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
> +        step = 1;
> +
> +    for (i = 0; i < len; i += 16) { /* stage 1: 16 pixels per iteration */
> +        __m256i b0, b1, ub0, ub1, vb0, vb1;
> +        __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
> +        __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
> +        __m256i y_l, y_h, v_l, v_h, u_l, u_h;
> +        __m256i R_l, R_h, G_l, G_h, B_l, B_h;
> +        int n = i << 1; /* offset handed to __lasx_xvldx: twice the pixel index (rows are int16_t) */
> +
> +        DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
> +                  n, ubuf1, n, b0, b1, ub0, ub1);
> +        DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
> +        DUP2_ARG2(__lasx_xvsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l); /* widen to 32-bit lanes: *_l and *_h halves, see write order below */
> +        DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
> +                  u0_l, u1_l, v0_l, v1_l);
> +        DUP2_ARG1(__lasx_xvexth_w_h, b0, b1, y0_h, y1_h);
> +        DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
> +                  u0_h, u1_h, v0_h, v1_h);
> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1); /* row 0 times its weight ... */
> +        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
> +        u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
> +        u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
> +        v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
> +        v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l); /* ... plus row 1 times its weight */
> +        y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
> +        u_l  = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
> +        u_h  = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
> +        v_l  = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
> +        v_h  = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
> +        u_l  = __lasx_xvsub_w(u_l, uv);
> +        u_h  = __lasx_xvsub_w(u_h, uv);
> +        v_l  = __lasx_xvsub_w(v_l, uv);
> +        v_h  = __lasx_xvsub_w(v_h, uv);
> +        y_l  = __lasx_xvsrai_w(y_l, 10);
> +        y_h  = __lasx_xvsrai_w(y_h, 10);
> +        u_l  = __lasx_xvsrai_w(u_l, 10);
> +        u_h  = __lasx_xvsrai_w(u_h, 10);
> +        v_l  = __lasx_xvsrai_w(v_l, 10);
> +        v_h  = __lasx_xvsrai_w(v_h, 10);
> +        YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +        YUVTORGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +
> +        if (hasAlpha) {
> +            __m256i a0, a1, a0_l, a0_h;
> +            __m256i a_l, a_h, a1_l, a1_h;
> +
> +            DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
> +            DUP2_ARG2(__lasx_xvsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
> +            DUP2_ARG1(__lasx_xvexth_w_h, a0, a1, a0_h, a1_h);
> +            a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1); /* bias + row 0 * weight, then + row 1 * weight */
> +            a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
> +            a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
> +            a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
> +            a_l = __lasx_xvsrai_w(a_l, 19);
> +            a_h = __lasx_xvsrai_w(a_h, 19);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0); /* _l lanes 0-3 / 4-7 are pixels 0-3 / 8-11, _h lanes 0-3 / 4-7 are pixels 4-7 / 12-15 */
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
> +            WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
> +        } else {
> +            WRITE_FULL(R_l, G_l, B_l, 0, 0); /* same lane-to-pixel mapping as the alpha branch */
> +            WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +            WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +            WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +            WRITE_FULL(R_h, G_h, B_h, 0, 4);
> +            WRITE_FULL(R_h, G_h, B_h, 1, 5);
> +            WRITE_FULL(R_h, G_h, B_h, 2, 6);
> +            WRITE_FULL(R_h, G_h, B_h, 3, 7);
> +            WRITE_FULL(R_l, G_l, B_l, 4, 8);
> +            WRITE_FULL(R_l, G_l, B_l, 5, 9);
> +            WRITE_FULL(R_l, G_l, B_l, 6, 10);
> +            WRITE_FULL(R_l, G_l, B_l, 7, 11);
> +            WRITE_FULL(R_h, G_h, B_h, 4, 12);
> +            WRITE_FULL(R_h, G_h, B_h, 5, 13);
> +            WRITE_FULL(R_h, G_h, B_h, 6, 14);
> +            WRITE_FULL(R_h, G_h, B_h, 7, 15);
> +        }
> +    }
> +    if (dstW - i >= 8) { /* stage 2: one block of 8 pixels, lanes written in order 0..7 */
> +        __m256i b0, b1, ub0, ub1, vb0, vb1;
> +        __m256i y0_l, y1_l, u0_l;
> +        __m256i v0_l, u1_l, v1_l;
> +        __m256i y_l, u_l, v_l;
> +        __m256i R_l, G_l, B_l;
> +        int n = i << 1;
> +
> +        DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
> +                  ubuf1, n, b0, b1, ub0, ub1);
> +        DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
> +        DUP2_ARG1(__lasx_vext2xv_w_h, b0, b1, y0_l, y1_l); /* widen the samples to 32-bit lanes */
> +        DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
> +                  u0_l, u1_l, v0_l, v1_l);
> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
> +        u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
> +        v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
> +        u_l  = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
> +        v_l  = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
> +        u_l  = __lasx_xvsub_w(u_l, uv);
> +        v_l  = __lasx_xvsub_w(v_l, uv);
> +        y_l  = __lasx_xvsrai_w(y_l, 10);
> +        u_l  = __lasx_xvsrai_w(u_l, 10);
> +        v_l  = __lasx_xvsrai_w(v_l, 10);
> +        YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                 y_temp, v2r, v2g, u2g, u2b);
> +
> +        if (hasAlpha) {
> +            __m256i a0, a1, a0_l;
> +            __m256i a_l, a1_l;
> +
> +            DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
> +            DUP2_ARG1(__lasx_vext2xv_w_h, a0, a1, a0_l, a1_l);
> +            a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
> +            a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
> +            a_l = __lasx_xvsrai_w(a_l, 19);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
> +            WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
> +        } else {
> +            WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +            WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +            WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +            WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +            WRITE_FULL(R_l, G_l, B_l, 4, 4);
> +            WRITE_FULL(R_l, G_l, B_l, 5, 5);
> +            WRITE_FULL(R_l, G_l, B_l, 6, 6);
> +            WRITE_FULL(R_l, G_l, B_l, 7, 7);
> +        }
> +        i += 8;
> +    }
> +    for (; i < dstW; i++){ /* stage 3: scalar loop for the remaining pixels */
> +        int Y = ( buf0[i] * yalpha1  +  buf1[i] * yalpha         ) >> 10;
> +        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha- uvtemp) >> 10;
> +        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha- uvtemp) >> 10;
> +
> +        A = 0;
> +        if (hasAlpha){
> +            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
> +            if (A & 0x100)
> +                A = av_clip_uint8(A);
> +        }
> +
> +        Y -= y_offset;
> +        Y *= y_coeff;
> +        Y += ytemp;
> +        R  = (unsigned)Y + V * v2r_coe; /* unsigned arithmetic: wrap-around instead of signed overflow */
> +        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
> +        B  = (unsigned)Y + U * u2b_coe;
> +        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
> +        dest += step;
> +    }
> +    c->dither_error[0][i] = err[0]; /* i == dstW here: store the error left after the last pixel */
> +    c->dither_error[1][i] = err[1];
> +    c->dither_error[2][i] = err[2];
> +}
> +
> +/**
> + * Full-chroma packed RGB output from a single luma line, using LASX for the
> + * bulk of the row and scalar code for the remaining pixels.
> + *
> + * The row is processed in three stages: groups of 16 pixels, at most one
> + * group of 8 pixels, then one pixel at a time.  All stages are meant to
> + * produce the same values as the scalar tail loops.
> + *
> + * @param c        scaler context; receives the final dither error terms.
> + *                 The conversion constants used below (y_offset, y_coeff,
> + *                 v2r_coe, ... and their vector forms) are presumably
> + *                 declared by YUVTORGB_SETUP, which is defined outside this
> + *                 function.
> + * @param buf0     luma samples of the line
> + * @param ubuf     two chroma U lines; only ubuf[0] is read when
> + *                 uvalpha < 2048, otherwise ubuf[0] and ubuf[1] are summed
> + * @param vbuf     two chroma V lines, used the same way as ubuf
> + * @param abuf0    alpha samples, read only when hasAlpha is non-zero
> + * @param dest     output line
> + * @param dstW     number of pixels to produce
> + * @param uvalpha  selects the one-line (< 2048) or two-line chroma path
> + * @param y        output line number, forwarded to the pixel writer
> + * @param target   destination pixel format
> + * @param hasAlpha non-zero to derive alpha from abuf0
> + */
> +static void
> +yuv2rgb_full_1_template_lasx(SwsContext *c, const int16_t *buf0,
> +                             const int16_t *ubuf[2], const int16_t *vbuf[2],
> +                             const int16_t *abuf0, uint8_t *dest, int dstW,
> +                             int uvalpha, int y, enum AVPixelFormat target,
> +                             int hasAlpha)
> +{
> +    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
> +    int i, B, G, R, A;
> +    /* Bytes per output pixel: 3 for 24-bit, 4 for 32-bit formats. */
> +    int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
> +    int err[4]     = {0};
> +    int ytemp      = 1 << 21;
> +    int bias_int   = 64;
> +    /* Upper bound for the 16-pixel loops: i < len  <=>  i + 16 <= dstW. */
> +    int len        = dstW - 15;
> +    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
> +    YUVTORGB_SETUP
> +
> +    /* The 4/8-bit "byte" formats occupy a single byte per pixel. */
> +    if(   target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
> +       || target == AV_PIX_FMT_BGR8      || target == AV_PIX_FMT_RGB8)
> +        step = 1;
> +    if (uvalpha < 2048) {
> +        /* Single chroma line: U = (ubuf0 - (128 << 7)) << 2, same for V. */
> +        int uvtemp   = 128 << 7;
> +        __m256i uv   = __lasx_xvreplgr2vr_w(uvtemp);
> +        __m256i bias = __lasx_xvreplgr2vr_w(bias_int);
> +
> +        for (i = 0; i < len; i += 16) {
> +            __m256i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
> +            __m256i y_l, y_h, u_l, u_h, v_l, v_h;
> +            __m256i R_l, R_h, G_l, G_h, B_l, B_h;
> +            int n = i << 1; /* byte offset of sample i in the int16_t lines */
> +
> +            DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
> +            vb  = __lasx_xvldx(vbuf0, n);
> +            y_l = __lasx_xvsllwil_w_h(b, 2);
> +            y_h = __lasx_xvexth_w_h(b);
> +            DUP2_ARG2(__lasx_xvsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
> +            DUP2_ARG1(__lasx_xvexth_w_h, ub, vb, ub_h, vb_h);
> +            y_h = __lasx_xvslli_w(y_h, 2);
> +            u_l = __lasx_xvsub_w(ub_l, uv);
> +            u_h = __lasx_xvsub_w(ub_h, uv);
> +            v_l = __lasx_xvsub_w(vb_l, uv);
> +            v_h = __lasx_xvsub_w(vb_h, uv);
> +            u_l = __lasx_xvslli_w(u_l, 2);
> +            u_h = __lasx_xvslli_w(u_h, 2);
> +            v_l = __lasx_xvslli_w(v_l, 2);
> +            v_h = __lasx_xvslli_w(v_h, 2);
> +            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +            YUVTORGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +
> +            /*
> +             * The *_l vectors hold pixels 0-3 and 8-11, the *_h vectors
> +             * pixels 4-7 and 12-15 (per-128-bit-lane widening), hence the
> +             * l/h/l/h order of the writes below.
> +             */
> +            if(hasAlpha) {
> +                __m256i a_src;
> +                __m256i a_l, a_h;
> +
> +                /* A = (abuf0 + 64) >> 7, as in the scalar tail. */
> +                a_src = __lasx_xvld(abuf0 + i, 0);
> +                a_l   = __lasx_xvsllwil_w_h(a_src, 0);
> +                a_h   = __lasx_xvexth_w_h(a_src);
> +                a_l   = __lasx_xvadd_w(a_l, bias);
> +                a_h   = __lasx_xvadd_w(a_h, bias);
> +                a_l   = __lasx_xvsrai_w(a_l, 7);
> +                a_h   = __lasx_xvsrai_w(a_h, 7);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
> +                WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
> +            } else {
> +                WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +                WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +                WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +                WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +                WRITE_FULL(R_h, G_h, B_h, 0, 4);
> +                WRITE_FULL(R_h, G_h, B_h, 1, 5);
> +                WRITE_FULL(R_h, G_h, B_h, 2, 6);
> +                WRITE_FULL(R_h, G_h, B_h, 3, 7);
> +                WRITE_FULL(R_l, G_l, B_l, 4, 8);
> +                WRITE_FULL(R_l, G_l, B_l, 5, 9);
> +                WRITE_FULL(R_l, G_l, B_l, 6, 10);
> +                WRITE_FULL(R_l, G_l, B_l, 7, 11);
> +                WRITE_FULL(R_h, G_h, B_h, 4, 12);
> +                WRITE_FULL(R_h, G_h, B_h, 5, 13);
> +                WRITE_FULL(R_h, G_h, B_h, 6, 14);
> +                WRITE_FULL(R_h, G_h, B_h, 7, 15);
> +            }
> +        }
> +        /* At most one group of 8 pixels is left for the vector path. */
> +        if (dstW - i >= 8) {
> +            __m256i b, ub, vb, ub_l, vb_l;
> +            __m256i y_l, u_l, v_l;
> +            __m256i R_l, G_l, B_l;
> +            int n = i << 1;
> +
> +            DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
> +            vb  = __lasx_xvldx(vbuf0, n);
> +            y_l = __lasx_vext2xv_w_h(b);
> +            DUP2_ARG1(__lasx_vext2xv_w_h, ub, vb, ub_l, vb_l);
> +            y_l = __lasx_xvslli_w(y_l, 2);
> +            u_l = __lasx_xvsub_w(ub_l, uv);
> +            v_l = __lasx_xvsub_w(vb_l, uv);
> +            u_l = __lasx_xvslli_w(u_l, 2);
> +            v_l = __lasx_xvslli_w(v_l, 2);
> +            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +
> +            if(hasAlpha) {
> +                __m256i a_src, a_l;
> +
> +                a_src = __lasx_xvldx(abuf0, n);
> +                a_src = __lasx_vext2xv_w_h(a_src);
> +                a_l   = __lasx_xvadd_w(bias, a_src);
> +                a_l   = __lasx_xvsrai_w(a_l, 7);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
> +            } else {
> +                WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +                WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +                WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +                WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +                WRITE_FULL(R_l, G_l, B_l, 4, 4);
> +                WRITE_FULL(R_l, G_l, B_l, 5, 5);
> +                WRITE_FULL(R_l, G_l, B_l, 6, 6);
> +                WRITE_FULL(R_l, G_l, B_l, 7, 7);
> +            }
> +            i += 8;
> +        }
> +        /* Scalar tail for the last dstW % 8 pixels. */
> +        for (; i < dstW; i++) {
> +            int Y = buf0[i] << 2;
> +            int U = (ubuf0[i] - uvtemp) << 2;
> +            int V = (vbuf0[i] - uvtemp) << 2;
> +
> +            A = 0;
> +            if(hasAlpha) {
> +                A = (abuf0[i] + 64) >> 7;
> +                if (A & 0x100)
> +                    A = av_clip_uint8(A);
> +            }
> +            Y -= y_offset;
> +            Y *= y_coeff;
> +            Y += ytemp;
> +            R  = (unsigned)Y + V * v2r_coe;
> +            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
> +            B  = (unsigned)Y + U * u2b_coe;
> +            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
> +            dest += step;
> +        }
> +    } else {
> +        /*
> +         * Two chroma lines: U = (ubuf0 + ubuf1 - (128 << 8)) << 1, and the
> +         * same with vbuf0/vbuf1 for V.
> +         */
> +        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
> +        int uvtemp   = 128 << 8;
> +        __m256i uv   = __lasx_xvreplgr2vr_w(uvtemp);
> +        __m256i zero = __lasx_xvldi(0);
> +        __m256i bias = __lasx_xvreplgr2vr_h(bias_int);
> +
> +        for (i = 0; i < len; i += 16) {
> +            __m256i b, ub0, ub1, vb0, vb1;
> +            __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
> +            __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
> +            int n = i << 1;
> +
> +            DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
> +                      ubuf1, n, b, ub0, vb0, ub1);
> +            /* Second V line: load the samples (vbuf1), not the pointer
> +             * array (vbuf). */
> +            vb1 = __lasx_xvldx(vbuf1, n);
> +            y_ev = __lasx_xvaddwev_w_h(b, zero);
> +            y_od = __lasx_xvaddwod_w_h(b, zero);
> +            /* DUP2_ARG2 applies the intrinsic to (arg0, arg1) and
> +             * (arg2, arg3): U sums both U lines, V sums both V lines. */
> +            DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, ub1, vb0, vb1, u_ev, v_ev);
> +            DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, ub1, vb0, vb1, u_od, v_od);
> +            DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
> +            DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
> +                      u_ev, u_od, v_ev, v_od);
> +            DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
> +                      u_ev, u_od, v_ev, v_od);
> +            YUVTORGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +            YUVTORGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +
> +            /*
> +             * Element k of the *_ev vectors is pixel 2k, element k of the
> +             * *_od vectors is pixel 2k + 1, so the writes alternate.
> +             */
> +            if(hasAlpha) {
> +                __m256i a_src;
> +                __m256i a_ev, a_od;
> +
> +                a_src = __lasx_xvld(abuf0 + i, 0);
> +                a_ev  = __lasx_xvaddwev_w_h(bias, a_src);
> +                a_od  = __lasx_xvaddwod_w_h(bias, a_src);
> +                a_ev  = __lasx_xvsrai_w(a_ev, 7);
> +                a_od  = __lasx_xvsrai_w(a_od, 7);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
> +                WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
> +                WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
> +            } else {
> +                WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
> +                WRITE_FULL(R_od, G_od, B_od, 0, 1);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
> +                WRITE_FULL(R_od, G_od, B_od, 1, 3);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
> +                WRITE_FULL(R_od, G_od, B_od, 2, 5);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
> +                WRITE_FULL(R_od, G_od, B_od, 3, 7);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
> +                WRITE_FULL(R_od, G_od, B_od, 4, 9);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
> +                WRITE_FULL(R_od, G_od, B_od, 5, 11);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
> +                WRITE_FULL(R_od, G_od, B_od, 6, 13);
> +                WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
> +                WRITE_FULL(R_od, G_od, B_od, 7, 15);
> +            }
> +        }
> +        /* At most one group of 8 pixels is left for the vector path. */
> +        if (dstW - i >= 8) {
> +            __m256i b, ub0, ub1, vb0, vb1;
> +            __m256i y_l, u_l, v_l;
> +            __m256i R_l, G_l, B_l;
> +            int n = i << 1;
> +
> +            DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
> +                      ubuf1, n, b, ub0, vb0, ub1);
> +            vb1 = __lasx_xvldx(vbuf1, n);
> +            y_l = __lasx_vext2xv_w_h(b);
> +            y_l = __lasx_xvslli_w(y_l, 2);
> +            DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
> +                      ub0, vb0, ub1, vb1);
> +            DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
> +            u_l = __lasx_xvsub_w(u_l, uv);
> +            v_l = __lasx_xvsub_w(v_l, uv);
> +            u_l = __lasx_xvslli_w(u_l, 1);
> +            v_l = __lasx_xvslli_w(v_l, 1);
> +            YUVTORGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
> +                     y_temp, v2r, v2g, u2g, u2b);
> +
> +            if(hasAlpha) {
> +                __m256i a_src;
> +                __m256i a_l;
> +
> +                a_src  = __lasx_xvld(abuf0 + i, 0);
> +                a_src  = __lasx_xvpermi_d(a_src, 0xD8);
> +                a_src  = __lasx_xvilvl_h(a_src, a_src);
> +                a_l    = __lasx_xvaddwev_w_h(bias, a_src);
> +                a_l   = __lasx_xvsrai_w(a_l, 7);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
> +                WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
> +            } else {
> +                WRITE_FULL(R_l, G_l, B_l, 0, 0);
> +                WRITE_FULL(R_l, G_l, B_l, 1, 1);
> +                WRITE_FULL(R_l, G_l, B_l, 2, 2);
> +                WRITE_FULL(R_l, G_l, B_l, 3, 3);
> +                WRITE_FULL(R_l, G_l, B_l, 4, 4);
> +                WRITE_FULL(R_l, G_l, B_l, 5, 5);
> +                WRITE_FULL(R_l, G_l, B_l, 6, 6);
> +                WRITE_FULL(R_l, G_l, B_l, 7, 7);
> +            }
> +            i += 8;
> +        }
> +        /* Scalar tail for the last dstW % 8 pixels. */
> +        for (; i < dstW; i++) {
> +            int Y = buf0[i] << 2;
> +            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
> +            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
> +
> +            A = 0;
> +            if(hasAlpha) {
> +                A = (abuf0[i] + 64) >> 7;
> +                if (A & 0x100)
> +                    A = av_clip_uint8(A);
> +            }
> +            Y -= y_offset;
> +            Y *= y_coeff;
> +            Y += ytemp;
> +            R  = (unsigned)Y + V * v2r_coe;
> +            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
> +            B  = (unsigned)Y + U * u2b_coe;
> +            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
> +            dest += step;
> +        }
> +    }
> +    /* Here i == dstW: store the error terms left in err[] after the row. */
> +    c->dither_error[0][i] = err[0];
> +    c->dither_error[1][i] = err[1];
> +    c->dither_error[2][i] = err[2];
> +}
> +/*
> + * Instantiate the full-chroma packed writers (the yuv2*_full_{X,2,1}_lasx
> + * functions referenced by ff_sws_init_output_loongarch) for every supported
> + * destination format.
> + *
> + * With CONFIG_SMALL a single variant per 32-bit format decides at run time
> + * whether alpha is written.  Otherwise separate alpha (only when
> + * CONFIG_SWSCALE_ALPHA is set) and alpha-less variants are generated with a
> + * constant hasAlpha argument.
> + */
> +#if CONFIG_SMALL
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
> +               CONFIG_SWSCALE_ALPHA && c->needAlpha)
> +YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
> +               CONFIG_SWSCALE_ALPHA && c->needAlpha)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
> +               CONFIG_SWSCALE_ALPHA && c->needAlpha)
> +YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
> +               CONFIG_SWSCALE_ALPHA && c->needAlpha)
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,  1)
> +YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,  1)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,  1)
> +YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,  1)
> +#endif
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA,  0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR,  0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA,  0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB,  0)
> +#endif
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full,  AV_PIX_FMT_BGR24, 0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full,  AV_PIX_FMT_RGB24, 0)
> +
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full,  AV_PIX_FMT_BGR4_BYTE, 0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full,  AV_PIX_FMT_RGB4_BYTE, 0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
> +YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)
> +/* The conversion helpers are spelled in upper case where they are used
> + * (YUVTORGB, YUVTORGB_SETUP); undefine those names so the #undef takes
> + * effect. */
> +#undef YUVTORGB
> +#undef YUVTORGB_SETUP
> +
> +
> +av_cold void ff_sws_init_output_loongarch(SwsContext *c)
> +{
> +
> +    if(c->flags & SWS_FULL_CHR_H_INT) {
> +        switch (c->dstFormat) {
> +        case AV_PIX_FMT_RGBA:
> +#if CONFIG_SMALL
> +            c->yuv2packedX = yuv2rgba32_full_X_lasx;
> +            c->yuv2packed2 = yuv2rgba32_full_2_lasx;
> +            c->yuv2packed1 = yuv2rgba32_full_1_lasx;
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +                c->yuv2packedX = yuv2rgba32_full_X_lasx;
> +                c->yuv2packed2 = yuv2rgba32_full_2_lasx;
> +                c->yuv2packed1 = yuv2rgba32_full_1_lasx;
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packedX = yuv2rgbx32_full_X_lasx;
> +                c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
> +                c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_ARGB:
> +#if CONFIG_SMALL
> +            c->yuv2packedX = yuv2argb32_full_X_lasx;
> +            c->yuv2packed2 = yuv2argb32_full_2_lasx;
> +            c->yuv2packed1 = yuv2argb32_full_1_lasx;
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +                c->yuv2packedX = yuv2argb32_full_X_lasx;
> +                c->yuv2packed2 = yuv2argb32_full_2_lasx;
> +                c->yuv2packed1 = yuv2argb32_full_1_lasx;
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packedX = yuv2xrgb32_full_X_lasx;
> +                c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
> +                c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_BGRA:
> +#if CONFIG_SMALL
> +            c->yuv2packedX = yuv2bgra32_full_X_lasx;
> +            c->yuv2packed2 = yuv2bgra32_full_2_lasx;
> +            c->yuv2packed1 = yuv2bgra32_full_1_lasx;
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +                c->yuv2packedX = yuv2bgra32_full_X_lasx;
> +                c->yuv2packed2 = yuv2bgra32_full_2_lasx;
> +                c->yuv2packed1 = yuv2bgra32_full_1_lasx;
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packedX = yuv2bgrx32_full_X_lasx;
> +                c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
> +                c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_ABGR:
> +#if CONFIG_SMALL
> +            c->yuv2packedX = yuv2abgr32_full_X_lasx;
> +            c->yuv2packed2 = yuv2abgr32_full_2_lasx;
> +            c->yuv2packed1 = yuv2abgr32_full_1_lasx;
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +                c->yuv2packedX = yuv2abgr32_full_X_lasx;
> +                c->yuv2packed2 = yuv2abgr32_full_2_lasx;
> +                c->yuv2packed1 = yuv2abgr32_full_1_lasx;
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packedX = yuv2xbgr32_full_X_lasx;
> +                c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
> +                c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_RGB24:
> +            c->yuv2packedX = yuv2rgb24_full_X_lasx;
> +            c->yuv2packed2 = yuv2rgb24_full_2_lasx;
> +            c->yuv2packed1 = yuv2rgb24_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_BGR24:
> +            c->yuv2packedX = yuv2bgr24_full_X_lasx;
> +            c->yuv2packed2 = yuv2bgr24_full_2_lasx;
> +            c->yuv2packed1 = yuv2bgr24_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_BGR4_BYTE:
> +            c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
> +            c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
> +            c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB4_BYTE:
> +            c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
> +            c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
> +            c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_BGR8:
> +            c->yuv2packedX = yuv2bgr8_full_X_lasx;
> +            c->yuv2packed2 = yuv2bgr8_full_2_lasx;
> +            c->yuv2packed1 = yuv2bgr8_full_1_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB8:
> +            c->yuv2packedX = yuv2rgb8_full_X_lasx;
> +            c->yuv2packed2 = yuv2rgb8_full_2_lasx;
> +            c->yuv2packed1 = yuv2rgb8_full_1_lasx;
> +            break;
> +    }
> +    } else {
> +        switch (c->dstFormat) {
> +        case AV_PIX_FMT_RGB32:
> +        case AV_PIX_FMT_BGR32:
> +#if CONFIG_SMALL
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packed1 = yuv2rgbx32_1_lasx;
> +                c->yuv2packed2 = yuv2rgbx32_2_lasx;
> +                c->yuv2packedX = yuv2rgbx32_X_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_RGB32_1:
> +        case AV_PIX_FMT_BGR32_1:
> +#if CONFIG_SMALL
> +#else
> +#if CONFIG_SWSCALE_ALPHA
> +            if (c->needAlpha) {
> +            } else
> +#endif /* CONFIG_SWSCALE_ALPHA */
> +            {
> +                c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
> +                c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
> +                c->yuv2packedX = yuv2rgbx32_1_X_lasx;
> +            }
> +#endif /* !CONFIG_SMALL */
> +            break;
> +        case AV_PIX_FMT_RGB24:
> +            c->yuv2packed1 = yuv2rgb24_1_lasx;
> +            c->yuv2packed2 = yuv2rgb24_2_lasx;
> +            c->yuv2packedX = yuv2rgb24_X_lasx;
> +            break;
> +        case AV_PIX_FMT_BGR24:
> +            c->yuv2packed1 = yuv2bgr24_1_lasx;
> +            c->yuv2packed2 = yuv2bgr24_2_lasx;
> +            c->yuv2packedX = yuv2bgr24_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB565LE:
> +        case AV_PIX_FMT_RGB565BE:
> +        case AV_PIX_FMT_BGR565LE:
> +        case AV_PIX_FMT_BGR565BE:
> +            c->yuv2packed1 = yuv2rgb16_1_lasx;
> +            c->yuv2packed2 = yuv2rgb16_2_lasx;
> +            c->yuv2packedX = yuv2rgb16_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB555LE:
> +        case AV_PIX_FMT_RGB555BE:
> +        case AV_PIX_FMT_BGR555LE:
> +        case AV_PIX_FMT_BGR555BE:
> +            c->yuv2packed1 = yuv2rgb15_1_lasx;
> +            c->yuv2packed2 = yuv2rgb15_2_lasx;
> +            c->yuv2packedX = yuv2rgb15_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB444LE:
> +        case AV_PIX_FMT_RGB444BE:
> +        case AV_PIX_FMT_BGR444LE:
> +        case AV_PIX_FMT_BGR444BE:
> +            c->yuv2packed1 = yuv2rgb12_1_lasx;
> +            c->yuv2packed2 = yuv2rgb12_2_lasx;
> +            c->yuv2packedX = yuv2rgb12_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB8:
> +        case AV_PIX_FMT_BGR8:
> +            c->yuv2packed1 = yuv2rgb8_1_lasx;
> +            c->yuv2packed2 = yuv2rgb8_2_lasx;
> +            c->yuv2packedX = yuv2rgb8_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB4:
> +        case AV_PIX_FMT_BGR4:
> +            c->yuv2packed1 = yuv2rgb4_1_lasx;
> +            c->yuv2packed2 = yuv2rgb4_2_lasx;
> +            c->yuv2packedX = yuv2rgb4_X_lasx;
> +            break;
> +        case AV_PIX_FMT_RGB4_BYTE:
> +        case AV_PIX_FMT_BGR4_BYTE:
> +            c->yuv2packed1 = yuv2rgb4b_1_lasx;
> +            c->yuv2packed2 = yuv2rgb4b_2_lasx;
> +            c->yuv2packedX = yuv2rgb4b_X_lasx;
> +            break;
> +        }
> +    }
> +}
> diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
> index 1e0bb1b116..97fe947e2e 100644
> --- a/libswscale/loongarch/swscale_init_loongarch.c
> +++ b/libswscale/loongarch/swscale_init_loongarch.c
> @@ -28,6 +28,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
>  {
>      int cpu_flags = av_get_cpu_flags();
>      if (have_lasx(cpu_flags)) {
> +        ff_sws_init_output_loongarch(c);
>          if (c->srcBpc == 8) {
>              if (c->dstBpc <= 14) {
>                  c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
> @@ -47,6 +48,8 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
>              }
>              break;
>          }
> +        if (c->dstBpc == 8)
> +            c->yuv2planeX = ff_yuv2planeX_8_lasx;
>      }
>  }
>  
> diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
> index 07a8145da2..d6c0399737 100644
> --- a/libswscale/loongarch/swscale_loongarch.h
> +++ b/libswscale/loongarch/swscale_loongarch.h
> @@ -69,4 +69,10 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
>                                uint8_t *dest, int width, int height,
>                                int src1Stride, int src2Stride, int dstStride);
>  
> +av_cold void ff_sws_init_output_loongarch(SwsContext *c);
> +
> +void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
> +                          const int16_t **src, uint8_t *dest, int dstW,
> +                          const uint8_t *dither, int offset);
> +
>  #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.
  2022-08-29 12:30   ` Andreas Rheinhardt
@ 2022-09-06  8:12     ` Shiyou Yin
  2022-09-09  1:22       ` Shiyou Yin
  0 siblings, 1 reply; 13+ messages in thread
From: Shiyou Yin @ 2022-09-06  8:12 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: andreas.rheinhardt


> 2022年8月29日 20:30，Andreas Rheinhardt <andreas.rheinhardt@outlook.com> 写道：
> 
> Hao Chen:
>> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
>> rgb24 -y /dev/null -an
>> before: 150fps
>> after:  183fps
>> 
>> Signed-off-by: Hao Chen <chenhao@loongson.cn>
>> ---
>> libswscale/loongarch/Makefile                 |    3 +-
>> libswscale/loongarch/output_lasx.c            | 1982 +++++++++++++++++
>> libswscale/loongarch/swscale_init_loongarch.c |    3 +
>> libswscale/loongarch/swscale_loongarch.h      |    6 +
>> 4 files changed, 1993 insertions(+), 1 deletion(-)
>> create mode 100644 libswscale/loongarch/output_lasx.c
>> 
>> diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
>> index 4345971514..54d48b3de0 100644
>> --- a/libswscale/loongarch/Makefile
>> +++ b/libswscale/loongarch/Makefile
>> @@ -2,4 +2,5 @@ OBJS-$(CONFIG_SWSCALE)      += loongarch/swscale_init_loongarch.o
>> LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
>>                                loongarch/input_lasx.o   \
>>                                loongarch/yuv2rgb_lasx.o \
>> -                               loongarch/rgb2rgb_lasx.o
>> +                               loongarch/rgb2rgb_lasx.o \
>> +							   loongarch/output_lasx.o
>> diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
>> new file mode 100644
>> index 0000000000..19f82692ff
>> --- /dev/null
>> +++ b/libswscale/loongarch/output_lasx.c
>> @@ -0,0 +1,1982 @@
>> +/*
>> + * Copyright (C) 2022 Loongson Technology Corporation Limited
>> + * Contributed by Hao Chen(chenhao@loongson.cn)
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "swscale_loongarch.h"
>> +#include "libavutil/loongarch/loongson_intrinsics.h"
>> +
>> +/**
>> + * Vertical scaling to an 8-bit plane: for every output pixel x,
>> + *
>> + *   dest[x] = av_clip_uint8((dither[(x + offset) & 7] << 12
>> + *                            + sum over j of src[j][x] * filter[j]) >> 19)
>> + *
>> + * as spelled out by the scalar loop at the end.  The row is handled 16
>> + * pixels at a time with LASX, then at most one group of 8 pixels, then one
>> + * pixel at a time.
>> + *
>> + * @param filter     filterSize vertical filter coefficients
>> + * @param filterSize number of taps, i.e. number of lines in src
>> + * @param src        filterSize input lines
>> + * @param dest       output line, dstW bytes
>> + * @param dstW       number of pixels to produce
>> + * @param dither     8-entry dither table
>> + * @param offset     phase added to the pixel index when indexing dither
>> + */
>> +void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
>> +                          const int16_t **src, uint8_t *dest, int dstW,
>> +                          const uint8_t *dither, int offset)
>> +{
>> +    /* Byte-shuffle control used to pack the low byte of each 32-bit result. */
>> +    __m256i shuf = {0x1C0C180814041000, 0x1C1814100C080400,
>> +                    0x1C0C180814041000, 0x1C1814100C080400};
>> +    int dth_even[8], dth_odd[8], dth_seq[8];
>> +    __m256i bias_even, bias_odd, bias_seq;
>> +    int x, k, tap;
>> +
>> +    /*
>> +     * Dither values in the order the accumulators need them: phases
>> +     * 0,2,4,6 twice (even pixels), 1,3,5,7 twice (odd pixels) and
>> +     * 0..7 in sequence (8-pixel block).
>> +     */
>> +    for (k = 0; k < 8; k++) {
>> +        dth_even[k] = dither[(offset + 2 * (k & 3))     & 7];
>> +        dth_odd[k]  = dither[(offset + 2 * (k & 3) + 1) & 7];
>> +        dth_seq[k]  = dither[(offset + k)               & 7];
>> +    }
>> +    /* Initial accumulator values: dither << 12. */
>> +    bias_even = __lasx_xvslli_w(__lasx_xvld(dth_even, 0), 12);
>> +    bias_odd  = __lasx_xvslli_w(__lasx_xvld(dth_odd, 0), 12);
>> +    bias_seq  = __lasx_xvslli_w(__lasx_xvld(dth_seq, 0), 12);
>> +
>> +    /* 16 pixels per iteration, even and odd pixels accumulated separately. */
>> +    for (x = 0; x <= dstW - 16; x += 16) {
>> +        __m256i sum_even = bias_even;
>> +        __m256i sum_odd  = bias_odd;
>> +        __m256i packed;
>> +
>> +        for (tap = 0; tap < filterSize; tap++) {
>> +            __m256i pix  = __lasx_xvld(src[tap] + x, 0);
>> +            __m256i coef = __lasx_xvldrepl_h((filter + tap), 0);
>> +
>> +            sum_even = __lasx_xvmaddwev_w_h(sum_even, pix, coef);
>> +            sum_odd  = __lasx_xvmaddwod_w_h(sum_odd, pix, coef);
>> +        }
>> +        sum_even = __lasx_xvclip255_w(__lasx_xvsrai_w(sum_even, 19));
>> +        sum_odd  = __lasx_xvclip255_w(__lasx_xvsrai_w(sum_odd, 19));
>> +        packed   = __lasx_xvshuf_b(sum_odd, sum_even, shuf);
>> +        __lasx_xvstelm_d(packed, (dest + x), 0, 0);
>> +        __lasx_xvstelm_d(packed, (dest + x), 8, 2);
>> +    }
>> +
>> +    /* One remaining group of 8 pixels, widened to 32 bits before the MAC. */
>> +    if (dstW - x >= 8) {
>> +        __m256i sum = bias_seq;
>> +        __m256i swapped;
>> +
>> +        for (tap = 0; tap < filterSize; tap++) {
>> +            __m256i pix  = __lasx_vext2xv_w_h(__lasx_xvld(src[tap] + x, 0));
>> +            __m256i coef = __lasx_vext2xv_w_h(
>> +                               __lasx_xvldrepl_h((filter + tap), 0));
>> +
>> +            sum = __lasx_xvmadd_w(sum, pix, coef);
>> +        }
>> +        sum     = __lasx_xvclip255_w(__lasx_xvsrai_w(sum, 19));
>> +        swapped = __lasx_xvpermi_d(sum, 0x4E);
>> +        sum     = __lasx_xvshuf_b(swapped, sum, shuf);
>> +        __lasx_xvstelm_d(sum, (dest + x), 0, 1);
>> +        x += 8;
>> +    }
>> +
>> +    /* Scalar tail for the last dstW % 8 pixels. */
>> +    for (; x < dstW; x++) {
>> +        int acc = dither[(x + offset) & 7] << 12;
>> +
>> +        for (tap = 0; tap < filterSize; tap++)
>> +            acc += src[tap][x] * filter[tap];
>> +
>> +        dest[x] = av_clip_uint8(acc >> 19);
>> +    }
>> +}
>> +
>> +/*Copy from libswscale/output.c*/
>> +static av_always_inline void
>> +yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
>> +              unsigned A1, unsigned A2,
>> +              const void *_r, const void *_g, const void *_b, int y,
>> +              enum AVPixelFormat target, int hasAlpha)
>> +{
>> +    if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
>> +        target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
>> +        uint32_t *dest = (uint32_t *) _dest;
>> +        const uint32_t *r = (const uint32_t *) _r;
>> +        const uint32_t *g = (const uint32_t *) _g;
>> +        const uint32_t *b = (const uint32_t *) _b;
>> +
>> +#if CONFIG_SMALL
>> +        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
>> +        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
>> +#else
>> +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
>> +        int sh = (target == AV_PIX_FMT_RGB32_1 ||
>> +                  target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
>> +        av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
>> +#endif
>> +        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
>> +        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
>> +#endif
>> +    } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
>> +        uint8_t *dest = (uint8_t *) _dest;
>> +        const uint8_t *r = (const uint8_t *) _r;
>> +        const uint8_t *g = (const uint8_t *) _g;
>> +        const uint8_t *b = (const uint8_t *) _b;
>> +
>> +#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
>> +#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
>> +
>> +        dest[i * 6 + 0] = r_b[Y1];
>> +        dest[i * 6 + 1] =   g[Y1];
>> +        dest[i * 6 + 2] = b_r[Y1];
>> +        dest[i * 6 + 3] = r_b[Y2];
>> +        dest[i * 6 + 4] =   g[Y2];
>> +        dest[i * 6 + 5] = b_r[Y2];
>> +#undef r_b
>> +#undef b_r
>> +    } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
>> +               target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
>> +               target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
>> +        uint16_t *dest = (uint16_t *) _dest;
>> +        const uint16_t *r = (const uint16_t *) _r;
>> +        const uint16_t *g = (const uint16_t *) _g;
>> +        const uint16_t *b = (const uint16_t *) _b;
>> +        int dr1, dg1, db1, dr2, dg2, db2;
>> +
>> +        if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
>> +            dr1 = ff_dither_2x2_8[ y & 1     ][0];
>> +            dg1 = ff_dither_2x2_4[ y & 1     ][0];
>> +            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
>> +            dr2 = ff_dither_2x2_8[ y & 1     ][1];
>> +            dg2 = ff_dither_2x2_4[ y & 1     ][1];
>> +            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
>> +    } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
>> +            dr1 = ff_dither_2x2_8[ y & 1     ][0];
>> +            dg1 = ff_dither_2x2_8[ y & 1     ][1];
>> +            db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
>> +            dr2 = ff_dither_2x2_8[ y & 1     ][1];
>> +            dg2 = ff_dither_2x2_8[ y & 1     ][0];
>> +            db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
>> +        } else {
>> +            dr1 = ff_dither_4x4_16[ y & 3     ][0];
>> +            dg1 = ff_dither_4x4_16[ y & 3     ][1];
>> +            db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
>> +            dr2 = ff_dither_4x4_16[ y & 3     ][1];
>> +            dg2 = ff_dither_4x4_16[ y & 3     ][0];
>> +            db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
>> +        }
>> +
>> +        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
>> +        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
>> +    } else /* 8/4 bits */ {
>> +        uint8_t *dest = (uint8_t *) _dest;
>> +        const uint8_t *r = (const uint8_t *) _r;
>> +        const uint8_t *g = (const uint8_t *) _g;
>> +        const uint8_t *b = (const uint8_t *) _b;
>> +        int dr1, dg1, db1, dr2, dg2, db2;
>> +
>> +        if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
>> +            const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
>> +            const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
>> +            dr1 = dg1 = d32[(i * 2 + 0) & 7];
>> +            db1 =       d64[(i * 2 + 0) & 7];
>> +            dr2 = dg2 = d32[(i * 2 + 1) & 7];
>> +            db2 =       d64[(i * 2 + 1) & 7];
>> +        } else {
>> +            const uint8_t * const d64  = ff_dither_8x8_73 [y & 7];
>> +            const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
>> +            dr1 = db1 = d128[(i * 2 + 0) & 7];
>> +            dg1 =        d64[(i * 2 + 0) & 7];
>> +            dr2 = db2 = d128[(i * 2 + 1) & 7];
>> +            dg2 =        d64[(i * 2 + 1) & 7];
>> +        }
>> +
>> +        if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
>> +            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
>> +                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
>> +        } else {
>> +            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
>> +            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
>> +        }
>> +    }
>> +}
>> +
>> +#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4)    \
>> +{                                                                      \
>> +    Y1 = __lasx_xvpickve2gr_w(vec_y1, t1);                             \
>> +    Y2 = __lasx_xvpickve2gr_w(vec_y2, t2);                             \
>> +    U  = __lasx_xvpickve2gr_w(vec_u, t3);                              \
>> +    V  = __lasx_xvpickve2gr_w(vec_v, t4);                              \
>> +    r  =  c->table_rV[V];                                              \
>> +    g  = (c->table_gU[U] + c->table_gV[V]);                            \
>> +    b  =  c->table_bU[U];                                              \
>> +    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                           \
>> +                  r, g, b, y, target, 0);                              \
>> +    count++;                                                           \
>> +}
>> +
>> +static void
>> +yuv2rgb_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
>> +                        const int16_t **lumSrc, int lumFilterSize,
>> +                        const int16_t *chrFilter, const int16_t **chrUSrc,
>> +                        const int16_t **chrVSrc, int chrFilterSize,
>> +                        const int16_t **alpSrc, uint8_t *dest, int dstW,
>> +                        int y, enum AVPixelFormat target, int hasAlpha)
>> +{
>> +    int i, j;
>> +    int count = 0;
>> +    int t     = 1 << 18;
>> +    int len   = dstW >> 6;
>> +    int res   = dstW & 63;
>> +    int len_count = (dstW + 1) >> 1;
>> +    const void *r, *g, *b;
>> +    int head = YUVRGB_TABLE_HEADROOM;
>> +    __m256i headroom  = __lasx_xvreplgr2vr_w(head);
>> +
>> +    for (i = 0; i < len; i++) {
>> +        int Y1, Y2, U, V, count_lum = count << 1;
>> +        __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
>> +        __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
>> +        __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;
>> +
>> +        yl1_ev = __lasx_xvldrepl_w(&t, 0);
>> +        yl1_od = yl1_ev;
>> +        yh1_ev = yl1_ev;
>> +        yh1_od = yl1_ev;
>> +        u1_ev  = yl1_ev;
>> +        v1_ev  = yl1_ev;
>> +        u1_od  = yl1_ev;
>> +        v1_od  = yl1_ev;
>> +        yl2_ev = yl1_ev;
>> +        yl2_od = yl1_ev;
>> +        yh2_ev = yl1_ev;
>> +        yh2_od = yl1_ev;
>> +        u2_ev  = yl1_ev;
>> +        v2_ev  = yl1_ev;
>> +        u2_od  = yl1_ev;
>> +        v2_od  = yl1_ev;
>> +        for (j = 0; j < lumFilterSize; j++) {
>> +            int16_t *src_lum = lumSrc[j] + count_lum;
>> +            temp    = __lasx_xvldrepl_h((lumFilter + j), 0);
>> +            DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
>> +                      src_lum, 96, l_src1, l_src2, l_src3, l_src4);
>> +
>> +            yl1_ev  = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1);
>> +            yl1_od  = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1);
>> +            yh1_ev  = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
>> +            yh1_od  = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
>> +            yl2_ev  = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
>> +            yl2_od  = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
>> +            yh2_ev  = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
>> +            yh2_od  = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);
>> +        }
>> +        for (j = 0; j < chrFilterSize; j++) {
>> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
>> +                      u_src1, u_src2);
>> +            DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
>> +                      v_src1, v_src2);
>> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
>> +            u1_ev  = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
>> +            u1_od  = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
>> +            v1_ev  = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
>> +            v1_od  = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
>> +            u2_ev  = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
>> +            u2_od  = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
>> +            v2_ev  = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
>> +            v2_od  = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);
>> +        }
>> +        yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
>> +        yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
>> +        yl1_od = __lasx_xvsrai_w(yl1_od, 19);
>> +        yh1_od = __lasx_xvsrai_w(yh1_od, 19);
>> +        u1_ev  = __lasx_xvsrai_w(u1_ev, 19);
>> +        v1_ev  = __lasx_xvsrai_w(v1_ev, 19);
>> +        u1_od  = __lasx_xvsrai_w(u1_od, 19);
>> +        v1_od  = __lasx_xvsrai_w(v1_od, 19);
>> +        yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
>> +        yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
>> +        yl2_od = __lasx_xvsrai_w(yl2_od, 19);
>> +        yh2_od = __lasx_xvsrai_w(yh2_od, 19);
>> +        u2_ev  = __lasx_xvsrai_w(u2_ev, 19);
>> +        v2_ev  = __lasx_xvsrai_w(v2_ev, 19);
>> +        u2_od  = __lasx_xvsrai_w(u2_od, 19);
>> +        v2_od  = __lasx_xvsrai_w(v2_od, 19);
>> +        u1_ev  = __lasx_xvadd_w(u1_ev, headroom);
>> +        v1_ev  = __lasx_xvadd_w(v1_ev, headroom);
>> +        u1_od  = __lasx_xvadd_w(u1_od, headroom);
>> +        v1_od  = __lasx_xvadd_w(v1_od, headroom);
>> +        u2_ev  = __lasx_xvadd_w(u2_ev, headroom);
>> +        v2_ev  = __lasx_xvadd_w(v2_ev, headroom);
>> +        u2_od  = __lasx_xvadd_w(u2_od, headroom);
>> +        v2_od  = __lasx_xvadd_w(v2_od, headroom);
>> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0);
>> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0);
>> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1);
>> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1);
>> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2);
>> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2);
>> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3);
>> +        WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3);
>> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4);
>> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4);
>> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5);
>> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5);
>> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6);
>> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6);
>> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7);
>> +        WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7);
>> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0);
>> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0);
>> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1);
>> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1);
>> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2);
>> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2);
>> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3);
>> +        WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3);
>> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4);
>> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4);
>> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5);
>> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5);
>> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6);
>> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6);
>> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7);
>> +        WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7);
>> +    }
>> +    if (res >= 32) {
>> +        int Y1, Y2, U, V, count_lum = count << 1;
>> +        __m256i l_src1, l_src2, u_src, v_src;
>> +        __m256i yl_ev, yl_od, yh_ev, yh_od;
>> +        __m256i u_ev, u_od, v_ev, v_od, temp;
>> +
>> +        yl_ev = __lasx_xvldrepl_w(&t, 0);
>> +        yl_od = yl_ev;
>> +        yh_ev = yl_ev;
>> +        yh_od = yl_ev;
>> +        u_ev  = yl_ev;
>> +        v_ev  = yl_ev;
>> +        u_od  = yl_ev;
>> +        v_od  = yl_ev;
>> +        for (j = 0; j < lumFilterSize; j++) {
>> +            temp   = __lasx_xvldrepl_h((lumFilter + j), 0);
>> +            DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
>> +                      32, l_src1, l_src2);
>> +            yl_ev  = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
>> +            yl_od  = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
>> +            yh_ev  = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
>> +            yh_od  = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);
>> +        }
>> +        for (j = 0; j < chrFilterSize; j++) {
>> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
>> +                      u_src, v_src);
>> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
>> +            u_ev  = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
>> +            u_od  = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
>> +            v_ev  = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
>> +            v_od  = __lasx_xvmaddwod_w_h(v_od, temp, v_src);
>> +        }
>> +        yl_ev = __lasx_xvsrai_w(yl_ev, 19);
>> +        yh_ev = __lasx_xvsrai_w(yh_ev, 19);
>> +        yl_od = __lasx_xvsrai_w(yl_od, 19);
>> +        yh_od = __lasx_xvsrai_w(yh_od, 19);
>> +        u_ev  = __lasx_xvsrai_w(u_ev, 19);
>> +        v_ev  = __lasx_xvsrai_w(v_ev, 19);
>> +        u_od  = __lasx_xvsrai_w(u_od, 19);
>> +        v_od  = __lasx_xvsrai_w(v_od, 19);
>> +        u_ev  = __lasx_xvadd_w(u_ev, headroom);
>> +        v_ev  = __lasx_xvadd_w(v_ev, headroom);
>> +        u_od  = __lasx_xvadd_w(u_od, headroom);
>> +        v_od  = __lasx_xvadd_w(v_od, headroom);
>> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
>> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
>> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
>> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
>> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2);
>> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2);
>> +        WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3);
>> +        WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3);
>> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4);
>> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4);
>> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5);
>> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5);
>> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6);
>> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6);
>> +        WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7);
>> +        WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7);
>> +        res -= 32;
>> +    }
>> +    if (res >= 16) {
>> +        int Y1, Y2, U, V;
>> +        int count_lum = count << 1;
>> +        __m256i l_src, u_src, v_src;
>> +        __m256i y_ev, y_od, u, v, temp;
>> +
>> +        y_ev = __lasx_xvldrepl_w(&t, 0);
>> +        y_od = y_ev;
>> +        u    = y_ev;
>> +        v    = y_ev;
>> +        for (j = 0; j < lumFilterSize; j++) {
>> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
>> +            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
>> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
>> +            y_od  = __lasx_xvmaddwod_w_h(y_od, temp, l_src);
>> +        }
>> +        for (j = 0; j < chrFilterSize; j++) {
>> +            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
>> +                      0, u_src, v_src);
>> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
>> +            u_src = __lasx_vext2xv_w_h(u_src);
>> +            v_src = __lasx_vext2xv_w_h(v_src);
>> +            u     = __lasx_xvmaddwev_w_h(u, temp, u_src);
>> +            v     = __lasx_xvmaddwev_w_h(v, temp, v_src);
>> +        }
>> +        y_ev = __lasx_xvsrai_w(y_ev, 19);
>> +        y_od = __lasx_xvsrai_w(y_od, 19);
>> +        u    = __lasx_xvsrai_w(u, 19);
>> +        v    = __lasx_xvsrai_w(v, 19);
>> +        u    = __lasx_xvadd_w(u, headroom);
>> +        v    = __lasx_xvadd_w(v, headroom);
>> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0);
>> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1);
>> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2);
>> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3);
>> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4);
>> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5);
>> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6);
>> +        WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7);
>> +        res -= 16;
>> +    }
>> +    if (res >= 8) {
>> +        int Y1, Y2, U, V;
>> +        int count_lum = count << 1;
>> +        __m256i l_src, u_src, v_src;
>> +        __m256i y_ev, uv, temp;
>> +
>> +        y_ev = __lasx_xvldrepl_w(&t, 0);
>> +        uv   = y_ev;
>> +        for (j = 0; j < lumFilterSize; j++) {
>> +            temp  = __lasx_xvldrepl_h((lumFilter + j), 0);
>> +            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
>> +            l_src = __lasx_vext2xv_w_h(l_src);
>> +            y_ev  = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
>> +        }
>> +        for (j = 0; j < chrFilterSize; j++) {
>> +            u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
>> +            v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
>> +            temp  = __lasx_xvldrepl_h((chrFilter + j), 0);
>> +            u_src = __lasx_xvilvl_d(v_src, u_src);
>> +            u_src = __lasx_vext2xv_w_h(u_src);
>> +            uv    = __lasx_xvmaddwev_w_h(uv, temp, u_src);
>> +        }
>> +        y_ev = __lasx_xvsrai_w(y_ev, 19);
>> +        uv   = __lasx_xvsrai_w(uv, 19);
>> +        uv   = __lasx_xvadd_w(uv, headroom);
>> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4);
>> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5);
>> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6);
>> +        WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7);
>> +    }
>> +    for (; count < len_count; count++) {
>> +        int Y1 = 1 << 18;
>> +        int Y2 = Y1;
>> +        int U  = Y1;
>> +        int V  = Y1;
>> +
>> +        for (j = 0; j < lumFilterSize; j++) {
>> +            Y1 += lumSrc[j][count * 2]     * lumFilter[j];
>> +            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
>> +        }
>> +        for (j = 0; j < chrFilterSize; j++) {
>> +            U += chrUSrc[j][count] * chrFilter[j];
>> +            V += chrVSrc[j][count] * chrFilter[j];
>> +        }
>> +        Y1 >>= 19;
>> +        Y2 >>= 19;
>> +        U  >>= 19;
>> +        V  >>= 19;
>> +        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM];
>> +        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
>> +             c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
>> +        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
>> +
>> +        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
>> +                      r, g, b, y, target, 0);
>> +    }
>> +}
>> +
>> +static void
>> +yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
>> +                        const int16_t *ubuf[2], const int16_t *vbuf[2],
>> +                        const int16_t *abuf[2], uint8_t *dest, int dstW,
>> +                        int yalpha, int uvalpha, int y,
>> +                        enum AVPixelFormat target, int hasAlpha)
>> +{
>> +    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
>> +                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
>> +                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
>> +    int yalpha1   = 4096 - yalpha;
>> +    int uvalpha1  = 4096 - uvalpha;
>> +    int i, count  = 0;
>> +    int len       = dstW - 15;
>> +    int len_count = (dstW + 1) >> 1;
>> +    const void *r, *g, *b;
>> +    int head  = YUVRGB_TABLE_HEADROOM;
>> +    __m256i v_yalpha1  = __lasx_xvreplgr2vr_w(yalpha1);
>> +    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
>> +    __m256i v_yalpha   = __lasx_xvreplgr2vr_w(yalpha);
>> +    __m256i v_uvalpha  = __lasx_xvreplgr2vr_w(uvalpha);
>> +    __m256i headroom   = __lasx_xvreplgr2vr_w(head);
>> +
>> +    for (i = 0; i < len; i += 16) {
>> +        int Y1, Y2, U, V;
>> +        int i_dex = i << 1;
>> +        int c_dex = count << 1;
>> +        __m256i y0_h, y0_l, y0, u0, v0;
>> +        __m256i y1_h, y1_l, y1, u1, v1;
>> +        __m256i y_l, y_h, u, v;
>> +
>> +        DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
>> +                  buf1, i_dex, y0, u0, v0, y1);
>> +        DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
>> +        DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
>> +        DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
>> +        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
>> +        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
>> +        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
>> +        u0   = __lasx_xvmul_w(u0, v_uvalpha1);
>> +        v0   = __lasx_xvmul_w(v0, v_uvalpha1);
>> +        y_l  = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
>> +        y_h  = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
>> +        u    = __lasx_xvmadd_w(u0, v_uvalpha, u1);
>> +        v    = __lasx_xvmadd_w(v0, v_uvalpha, v1);
>> +        y_l  = __lasx_xvsrai_w(y_l, 19);
>> +        y_h  = __lasx_xvsrai_w(y_h, 19);
>> +        u    = __lasx_xvsrai_w(u, 19);
>> +        v    = __lasx_xvsrai_w(v, 19);
>> +        u    = __lasx_xvadd_w(u, headroom);
>> +        v    = __lasx_xvadd_w(v, headroom);
>> +        WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
>> +        WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
>> +        WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
>> +        WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
>> +        WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
>> +        WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
>> +        WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
>> +        WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
>> +    }
>> +    if (dstW - i >= 8) {
>> +        int Y1, Y2, U, V;
>> +        int i_dex = i << 1;
>> +        __m256i y0_l, y0, u0, v0;
>> +        __m256i y1_l, y1, u1, v1;
>> +        __m256i y_l, u, v;
>> +
>> +        y0   = __lasx_xvldx(buf0, i_dex);
> 
> 1. Not long ago, I tried to constify the src pointer of several asm
> functions and noticed that they produced new warnings for loongarch
> (according to patchwork:
> https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/),
> even though I was sure that the code is const-correct. After finding
> (via https://github.com/opencv/opencv/pull/21833) a toolchain
> (https://gitee.com/wenux/cross-compiler-la-on-x86) that can build the
> lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support
> at the moment; at least, my self-compiled loongarch-GCC did not support
> lsx and lasx) the issue was clear: lsxintrin.h and lasxintrin.h do not
> use const at all, even for functions that only read data (I presume the
> vl in __lsx_vldx stands for "vector load"?).
> So I sent another iteration
> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html of
> that patchset that now added wrappers for __lsx_vldx() and
> __lasx_xvldx() and cc'ed you and some other developers from loongson to
> alert you of the issue in the hope that you fix the headers, so that my
> wrappers wouldn't need to be applied. That didn't work, as my mails
> could not be delivered to you. So I applied the patchset.
> 2. You use __lasx_xvldx() to read from a const int16_t. This will give
> new warnings unless the above issue has been fixed. Has it?
> 3. I don't know whether it has, as patchwork's fate tests don't work for
> a few days already. Given that the mails I receive from patchwork when
> it doesn't like a commit message arrive from "Patchwork
> <yinshiyou-hf@loongson.cn>" I presume that loongson is now somehow
> running patchwork, so you should be able to inform the right people to
> fix it.
> 4. If you fixed the const-issue, can you please make an updated
> toolchain with lsx and lasx support enabled available to us?
> 
> - Andreas
> 

Hi Andreas,

Sorry for the late reply.
This issue will be fixed by using const for v1 of __lsx_vldx, and I will update the toolchain of the LoongArch patchwork runner ASAP.
Thank you very much for analyzing this problem and giving suggestions.

Thanks,
Shiyou
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.
  2022-09-06  8:12     ` Shiyou Yin
@ 2022-09-09  1:22       ` Shiyou Yin
  2022-09-09 13:11         ` Andreas Rheinhardt
  0 siblings, 1 reply; 13+ messages in thread
From: Shiyou Yin @ 2022-09-09  1:22 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: andreas.rheinhardt



> 2022年9月6日 16:12，Shiyou Yin <yinshiyou-hf@loongson.cn> 写道：
> 
>> 
>> 2022年8月29日 20:30，Andreas Rheinhardt <andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com>> 写道：
>> 
>> Hao Chen:
>>> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
>>> rgb24 -y /dev/null -an
>>> before: 150fps
>>> after: 183fps
>>> 
>>> Signed-off-by: Hao Chen <chenhao@loongson.cn>
>>> ---
>>> libswscale/loongarch/Makefile | 3 +-
>>> libswscale/loongarch/output_lasx.c | 1982 +++++++++++++++++
>>> libswscale/loongarch/swscale_init_loongarch.c | 3 +
>>> libswscale/loongarch/swscale_loongarch.h | 6 +
>>> 4 files changed, 1993 insertions(+), 1 deletion(-)
>>> create mode 100644 libswscale/loongarch/output_lasx.c
>>> 

>>> +static void
>>> +yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
>>> + const int16_t *ubuf[2], const int16_t *vbuf[2],
>>> + const int16_t *abuf[2], uint8_t *dest, int dstW,
>>> + int yalpha, int uvalpha, int y,
>>> + enum AVPixelFormat target, int hasAlpha)
>>> +{
>>> + const int16_t *buf0 = buf[0], *buf1 = buf[1],
>>> + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
>>> + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
>>> + int yalpha1 = 4096 - yalpha;
>>> + int uvalpha1 = 4096 - uvalpha;
>>> + int i, count = 0;
>>> + int len = dstW - 15;
>>> + int len_count = (dstW + 1) >> 1;
>>> + const void *r, *g, *b;
>>> + int head = YUVRGB_TABLE_HEADROOM;
>>> + __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
>>> + __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
>>> + __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
>>> + __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
>>> + __m256i headroom = __lasx_xvreplgr2vr_w(head);
>>> +
>>> + for (i = 0; i < len; i += 16) {
>>> + int Y1, Y2, U, V;
>>> + int i_dex = i << 1;
>>> + int c_dex = count << 1;
>>> + __m256i y0_h, y0_l, y0, u0, v0;
>>> + __m256i y1_h, y1_l, y1, u1, v1;
>>> + __m256i y_l, y_h, u, v;
>>> +
>>> + DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
>>> + buf1, i_dex, y0, u0, v0, y1);
>>> + DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
>>> + DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
>>> + DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
>>> + DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
>>> + y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
>>> + y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
>>> + u0 = __lasx_xvmul_w(u0, v_uvalpha1);
>>> + v0 = __lasx_xvmul_w(v0, v_uvalpha1);
>>> + y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
>>> + y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
>>> + u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
>>> + v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
>>> + y_l = __lasx_xvsrai_w(y_l, 19);
>>> + y_h = __lasx_xvsrai_w(y_h, 19);
>>> + u = __lasx_xvsrai_w(u, 19);
>>> + v = __lasx_xvsrai_w(v, 19);
>>> + u = __lasx_xvadd_w(u, headroom);
>>> + v = __lasx_xvadd_w(v, headroom);
>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
>>> + }
>>> + if (dstW - i >= 8) {
>>> + int Y1, Y2, U, V;
>>> + int i_dex = i << 1;
>>> + __m256i y0_l, y0, u0, v0;
>>> + __m256i y1_l, y1, u1, v1;
>>> + __m256i y_l, u, v;
>>> +
>>> + y0 = __lasx_xvldx(buf0, i_dex);
>> 
>> 1. Not long ago, I tried to constify the src pointer of several asm
>> functions and noticed that they produced new warnings for loongarch
>> (according to patchwork:
>> https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/),
>> even though I was sure that the code is const-correct. After finding
>> (via https://github.com/opencv/opencv/pull/21833) a toolchain
>> (https://gitee.com/wenux/cross-compiler-la-on-x86) that can build the
>> lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support
>> at the moment; at least, my self-compiled loongarch-GCC did not support
>> lsx and lasx) the issue was clear: lsxintrin.h and lasxintrin.h do not
>> use const at all, even for functions that only read data (I presume the
>> vl in __lsx_vldx stands for "vector load"?).
>> So I sent another iteration
>> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html of
>> that patchset that now added wrappers for __lsx_vldx() and
>> __lasx_xvldx() and cc'ed you and some other developers from loongson to
>> alert you of the issue in the hope that you fix the headers, so that my
>> wrappers wouldn't need to be applied. That didn't work, as my mails
>> could not be delivered to you. So I applied the patchset.
>> 2. You use __lasx_xvldx() to read from a const int16_t. This will give
>> new warnings unless the above issue has been fixed. Has it?
>> 3. I don't know whether it has, as patchwork's fate tests don't work for
>> a few days already. Given that the mails I receive from patchwork when
>> it doesn't like a commit message arrive from "Patchwork
>> <yinshiyou-hf@loongson.cn>" I presume that loongson is now somehow
>> running patchwork, so you should be able to inform the right people to
>> fix it.
>> 4. If you fixed the const-issue, can you please make an updated
>> toolchain with lsx and lasx support enabled available to us?
>> 
>> - Andreas
>> 
> 
> Hi Andreas,
> 
> Sorry for the late reply.
> This issue will be fixed by using const for v1 of __lsx_vldx, and I will update toolchain of LoongArch patchwork runner ASAP.
> Thank you very much for analyzing this problem and giving suggestion. 
> 
> Thanks,
> Shiyou

Hi Andreas,
I have updated the patchwork runner environment; you can now revert the patchset that added the wrappers for __lsx_vldx() and __lasx_xvldx().

Shiyou
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.
  2022-09-09  1:22       ` Shiyou Yin
@ 2022-09-09 13:11         ` Andreas Rheinhardt
  2022-09-11  2:06           ` Shiyou Yin
  0 siblings, 1 reply; 13+ messages in thread
From: Andreas Rheinhardt @ 2022-09-09 13:11 UTC (permalink / raw)
  To: ffmpeg-devel

Shiyou Yin:
> 
> 
>> 2022年9月6日 16:12，Shiyou Yin <yinshiyou-hf@loongson.cn> 写道：
>>
>>>
>>> 2022年8月29日 20:30，Andreas Rheinhardt <andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com>> 写道：
>>>
>>> Hao Chen:
>>>> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
>>>> rgb24 -y /dev/null -an
>>>> before: 150fps
>>>> after: 183fps
>>>>
>>>> Signed-off-by: Hao Chen <chenhao@loongson.cn>
>>>> ---
>>>> libswscale/loongarch/Makefile | 3 +-
>>>> libswscale/loongarch/output_lasx.c | 1982 +++++++++++++++++
>>>> libswscale/loongarch/swscale_init_loongarch.c | 3 +
>>>> libswscale/loongarch/swscale_loongarch.h | 6 +
>>>> 4 files changed, 1993 insertions(+), 1 deletion(-)
>>>> create mode 100644 libswscale/loongarch/output_lasx.c
>>>>
> 
>>>> +static void
>>>> +yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
>>>> + const int16_t *ubuf[2], const int16_t *vbuf[2],
>>>> + const int16_t *abuf[2], uint8_t *dest, int dstW,
>>>> + int yalpha, int uvalpha, int y,
>>>> + enum AVPixelFormat target, int hasAlpha)
>>>> +{
>>>> + const int16_t *buf0 = buf[0], *buf1 = buf[1],
>>>> + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
>>>> + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
>>>> + int yalpha1 = 4096 - yalpha;
>>>> + int uvalpha1 = 4096 - uvalpha;
>>>> + int i, count = 0;
>>>> + int len = dstW - 15;
>>>> + int len_count = (dstW + 1) >> 1;
>>>> + const void *r, *g, *b;
>>>> + int head = YUVRGB_TABLE_HEADROOM;
>>>> + __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
>>>> + __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
>>>> + __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
>>>> + __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
>>>> + __m256i headroom = __lasx_xvreplgr2vr_w(head);
>>>> +
>>>> + for (i = 0; i < len; i += 16) {
>>>> + int Y1, Y2, U, V;
>>>> + int i_dex = i << 1;
>>>> + int c_dex = count << 1;
>>>> + __m256i y0_h, y0_l, y0, u0, v0;
>>>> + __m256i y1_h, y1_l, y1, u1, v1;
>>>> + __m256i y_l, y_h, u, v;
>>>> +
>>>> + DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
>>>> + buf1, i_dex, y0, u0, v0, y1);
>>>> + DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
>>>> + DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
>>>> + DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
>>>> + DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
>>>> + y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
>>>> + y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
>>>> + u0 = __lasx_xvmul_w(u0, v_uvalpha1);
>>>> + v0 = __lasx_xvmul_w(v0, v_uvalpha1);
>>>> + y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
>>>> + y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
>>>> + u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
>>>> + v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
>>>> + y_l = __lasx_xvsrai_w(y_l, 19);
>>>> + y_h = __lasx_xvsrai_w(y_h, 19);
>>>> + u = __lasx_xvsrai_w(u, 19);
>>>> + v = __lasx_xvsrai_w(v, 19);
>>>> + u = __lasx_xvadd_w(u, headroom);
>>>> + v = __lasx_xvadd_w(v, headroom);
>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
>>>> + }
>>>> + if (dstW - i >= 8) {
>>>> + int Y1, Y2, U, V;
>>>> + int i_dex = i << 1;
>>>> + __m256i y0_l, y0, u0, v0;
>>>> + __m256i y1_l, y1, u1, v1;
>>>> + __m256i y_l, u, v;
>>>> +
>>>> + y0 = __lasx_xvldx(buf0, i_dex);
>>>
>>> 1. Not long ago, I tried to constify the src pointer of several asm
>>> functions and noticed that they produced new warnings for loongarch
>>> (according to patchwork:
>>> https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/),
>>> even though I was sure that the code is const-correct. After finding
>>> (via https://github.com/opencv/opencv/pull/21833) a toolchain
>>> (https://gitee.com/wenux/cross-compiler-la-on-x86) that can build the
>>> lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support
>>> at the moment; at least, my self-compiled loongarch-GCC did not support
>>> lsx and lasx) the issue was clear: lsxintrin.h and lasxintrin.h do not
>>> use const at all, even for functions that only read data (I presume the
>>> vl in __lsx_vldx stands for "vector load"?).
>>> So I sent another iteration
>>> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html of
>>> that patchset that now added wrappers for __lsx_vldx() and
>>> __lasx_xvldx() and cc'ed you and some other developers from loongson to
>>> alert you of the issue in the hope that you fix the headers, so that my
>>> wrappers wouldn't need to be applied. That didn't work, as my mails
>>> could not be delivered to you. So I applied the patchset.
>>> 2. You use __lasx_xvldx() to read from a const int16_t. This will give
>>> new warnings unless the above issue has been fixed. Has it?
>>> 3. I don't know whether it has, as patchwork's fate tests don't work for
>>> a few days already. Given that the mails I receive from patchwork when
>>> it doesn't like a commit message arrive from "Patchwork
>>> <yinshiyou-hf@loongson.cn>" I presume that loongson is now somehow
>>> running patchwork, so you should be able to inform the right people to
>>> fix it.
>>> 4. If you fixed the const-issue, can you please make an updated
>>> toolchain with lsx and lasx support enabled available to us?
>>>
>>> - Andreas
>>>
>>
>> Hi Andreas,
>>
>> Sorry for the late reply.
>> This issue will be fixed by using const for v1 of __lsx_vldx, and I will update toolchain of LoongArch patchwork runner ASAP.
>> Thank you very much for analyzing this problem and giving suggestion. 
>>
>> Thanks,
>> Shiyou
> 
> Hi Andreas,
> Has updated patchwork runner environment, you can revert the patchset which add wrappers for __lsx_[x]vldx.
> 

Just sent the patches:
https://ffmpeg.org/pipermail/ffmpeg-devel/2022-September/301201.html
Can you provide an updated toolchain (with x86 host) able to compile lsx
and lasx?

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.
  2022-09-09 13:11         ` Andreas Rheinhardt
@ 2022-09-11  2:06           ` Shiyou Yin
  2022-09-21  3:03             ` Shiyou Yin
  0 siblings, 1 reply; 13+ messages in thread
From: Shiyou Yin @ 2022-09-11  2:06 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



> 2022年9月9日 21:11，Andreas Rheinhardt <andreas.rheinhardt@outlook.com> 写道：
> 
> Shiyou Yin:
>> 
>> 
>>> 2022年9月6日 16:12，Shiyou Yin <yinshiyou-hf@loongson.cn> 写道：
>>> 
>>>> 
>>>> 2022年8月29日 20:30，Andreas Rheinhardt <andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com>> 写道：
>>>> 
>>>> Hao Chen:
>>>>> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
>>>>> rgb24 -y /dev/null -an
>>>>> before: 150fps
>>>>> after: 183fps
>>>>> 
>>>>> Signed-off-by: Hao Chen <chenhao@loongson.cn>
>>>>> ---
>>>>> libswscale/loongarch/Makefile | 3 +-
>>>>> libswscale/loongarch/output_lasx.c | 1982 +++++++++++++++++
>>>>> libswscale/loongarch/swscale_init_loongarch.c | 3 +
>>>>> libswscale/loongarch/swscale_loongarch.h | 6 +
>>>>> 4 files changed, 1993 insertions(+), 1 deletion(-)
>>>>> create mode 100644 libswscale/loongarch/output_lasx.c
>>>>> 
>> 
>>>>> +static void
>>>>> +yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
>>>>> + const int16_t *ubuf[2], const int16_t *vbuf[2],
>>>>> + const int16_t *abuf[2], uint8_t *dest, int dstW,
>>>>> + int yalpha, int uvalpha, int y,
>>>>> + enum AVPixelFormat target, int hasAlpha)
>>>>> +{
>>>>> + const int16_t *buf0 = buf[0], *buf1 = buf[1],
>>>>> + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
>>>>> + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
>>>>> + int yalpha1 = 4096 - yalpha;
>>>>> + int uvalpha1 = 4096 - uvalpha;
>>>>> + int i, count = 0;
>>>>> + int len = dstW - 15;
>>>>> + int len_count = (dstW + 1) >> 1;
>>>>> + const void *r, *g, *b;
>>>>> + int head = YUVRGB_TABLE_HEADROOM;
>>>>> + __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
>>>>> + __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
>>>>> + __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
>>>>> + __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
>>>>> + __m256i headroom = __lasx_xvreplgr2vr_w(head);
>>>>> +
>>>>> + for (i = 0; i < len; i += 16) {
>>>>> + int Y1, Y2, U, V;
>>>>> + int i_dex = i << 1;
>>>>> + int c_dex = count << 1;
>>>>> + __m256i y0_h, y0_l, y0, u0, v0;
>>>>> + __m256i y1_h, y1_l, y1, u1, v1;
>>>>> + __m256i y_l, y_h, u, v;
>>>>> +
>>>>> + DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
>>>>> + buf1, i_dex, y0, u0, v0, y1);
>>>>> + DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
>>>>> + DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
>>>>> + DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
>>>>> + DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
>>>>> + y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
>>>>> + y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
>>>>> + u0 = __lasx_xvmul_w(u0, v_uvalpha1);
>>>>> + v0 = __lasx_xvmul_w(v0, v_uvalpha1);
>>>>> + y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
>>>>> + y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
>>>>> + u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
>>>>> + v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
>>>>> + y_l = __lasx_xvsrai_w(y_l, 19);
>>>>> + y_h = __lasx_xvsrai_w(y_h, 19);
>>>>> + u = __lasx_xvsrai_w(u, 19);
>>>>> + v = __lasx_xvsrai_w(v, 19);
>>>>> + u = __lasx_xvadd_w(u, headroom);
>>>>> + v = __lasx_xvadd_w(v, headroom);
>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
>>>>> + }
>>>>> + if (dstW - i >= 8) {
>>>>> + int Y1, Y2, U, V;
>>>>> + int i_dex = i << 1;
>>>>> + __m256i y0_l, y0, u0, v0;
>>>>> + __m256i y1_l, y1, u1, v1;
>>>>> + __m256i y_l, u, v;
>>>>> +
>>>>> + y0 = __lasx_xvldx(buf0, i_dex);
>>>> 
>>>> 1. Not long ago, I tried to constify the src pointer of several asm
>>>> functions and noticed that they produced new warnings for loongarch
>>>> (according to patchwork:
>>>> https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/),
>>>> even though I was sure that the code is const-correct. After finding
>>>> (via https://github.com/opencv/opencv/pull/21833) a toolchain
>>>> (https://gitee.com/wenux/cross-compiler-la-on-x86) that can build the
>>>> lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support
>>>> at the moment; at least, my self-compiled loongarch-GCC did not support
>>>> lsx and lasx) the issue was clear: lsxintrin.h and lasxintrin.h do not
>>>> use const at all, even for functions that only read data (I presume the
>>>> vl in __lsx_vldx stands for "vector load"?).
>>>> So I sent another iteration
>>>> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html of
>>>> that patchset that now added wrappers for __lsx_vldx() and
>>>> __lasx_xvldx() and cc'ed you and some other developers from loongson to
>>>> alert you of the issue in the hope that you fix the headers, so that my
>>>> wrappers wouldn't need to be applied. That didn't work, as my mails
>>>> could not be delivered to you. So I applied the patchset.
>>>> 2. You use __lasx_xvldx() to read from a const int16_t. This will give
>>>> new warnings unless the above issue has been fixed. Has it?
>>>> 3. I don't know whether it has, as patchwork's fate tests don't work for
>>>> a few days already. Given that the mails I receive from patchwork when
>>>> it doesn't like a commit message arrive from "Patchwork
>>>> <yinshiyou-hf@loongson.cn>" I presume that loongson is now somehow
>>>> running patchwork, so you should be able to inform the right people to
>>>> fix it.
>>>> 4. If you fixed the const-issue, can you please make an updated
>>>> toolchain with lsx and lasx support enabled available to us?
>>>> 
>>>> - Andreas
>>>> 
>>> 
>>> Hi Andreas,
>>> 
>>> Sorry for the late reply.
>>> This issue will be fixed by using const for v1 of __lsx_vldx, and I will update toolchain of LoongArch patchwork runner ASAP.
>>> Thank you very much for analyzing this problem and giving suggestion. 
>>> 
>>> Thanks,
>>> Shiyou
>> 
>> Hi Andreas,
>> Has updated patchwork runner environment, you can revert the patchset which add wrappers for __lsx_[x]vldx.
>> 
> 
> Just sent the patches:
> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-September/301201.html <https://ffmpeg.org/pipermail/ffmpeg-devel/2022-September/301201.html>
> Can you provide an updated toolchain (with x86 host) able to compile lsx
> and lasx?
> 
> - Andreas

The compiler team hasn't released this version yet (maybe on Sep 13); you can check here:
http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/
If you want to try it now, you can update using the attached file first.



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.
  2022-09-11  2:06           ` Shiyou Yin
@ 2022-09-21  3:03             ` Shiyou Yin
  2022-09-21 10:50               ` Andreas Rheinhardt
  0 siblings, 1 reply; 13+ messages in thread
From: Shiyou Yin @ 2022-09-21  3:03 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: andreas.rheinhardt



> 2022年9月11日 10:06，Shiyou Yin <yinshiyou-hf@loongson.cn> 写道：
> 
> 
> 
>> 2022年9月9日 21:11，Andreas Rheinhardt <andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com>> 写道：
>> 
>> Shiyou Yin:
>>> 
>>> 
>>>> 2022年9月6日 16:12，Shiyou Yin <yinshiyou-hf@loongson.cn <mailto:yinshiyou-hf@loongson.cn>> 写道：
>>>> 
>>>>> 
>>>>> 2022年8月29日 20:30，Andreas Rheinhardt <andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com> <mailto:andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com>>> 写道：
>>>>> 
>>>>> Hao Chen:
>>>>>> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
>>>>>> rgb24 -y /dev/null -an
>>>>>> before: 150fps
>>>>>> after: 183fps
>>>>>> 
>>>>>> Signed-off-by: Hao Chen <chenhao@loongson.cn <mailto:chenhao@loongson.cn>>
>>>>>> ---
>>>>>> libswscale/loongarch/Makefile | 3 +-
>>>>>> libswscale/loongarch/output_lasx.c | 1982 +++++++++++++++++
>>>>>> libswscale/loongarch/swscale_init_loongarch.c | 3 +
>>>>>> libswscale/loongarch/swscale_loongarch.h | 6 +
>>>>>> 4 files changed, 1993 insertions(+), 1 deletion(-)
>>>>>> create mode 100644 libswscale/loongarch/output_lasx.c
>>>>>> 
>>> 
>>>>>> +static void
>>>>>> +yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
>>>>>> + const int16_t *ubuf[2], const int16_t *vbuf[2],
>>>>>> + const int16_t *abuf[2], uint8_t *dest, int dstW,
>>>>>> + int yalpha, int uvalpha, int y,
>>>>>> + enum AVPixelFormat target, int hasAlpha)
>>>>>> +{
>>>>>> + const int16_t *buf0 = buf[0], *buf1 = buf[1],
>>>>>> + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
>>>>>> + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
>>>>>> + int yalpha1 = 4096 - yalpha;
>>>>>> + int uvalpha1 = 4096 - uvalpha;
>>>>>> + int i, count = 0;
>>>>>> + int len = dstW - 15;
>>>>>> + int len_count = (dstW + 1) >> 1;
>>>>>> + const void *r, *g, *b;
>>>>>> + int head = YUVRGB_TABLE_HEADROOM;
>>>>>> + __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
>>>>>> + __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
>>>>>> + __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
>>>>>> + __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
>>>>>> + __m256i headroom = __lasx_xvreplgr2vr_w(head);
>>>>>> +
>>>>>> + for (i = 0; i < len; i += 16) {
>>>>>> + int Y1, Y2, U, V;
>>>>>> + int i_dex = i << 1;
>>>>>> + int c_dex = count << 1;
>>>>>> + __m256i y0_h, y0_l, y0, u0, v0;
>>>>>> + __m256i y1_h, y1_l, y1, u1, v1;
>>>>>> + __m256i y_l, y_h, u, v;
>>>>>> +
>>>>>> + DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
>>>>>> + buf1, i_dex, y0, u0, v0, y1);
>>>>>> + DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
>>>>>> + DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
>>>>>> + DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
>>>>>> + DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
>>>>>> + y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
>>>>>> + y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
>>>>>> + u0 = __lasx_xvmul_w(u0, v_uvalpha1);
>>>>>> + v0 = __lasx_xvmul_w(v0, v_uvalpha1);
>>>>>> + y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
>>>>>> + y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
>>>>>> + u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
>>>>>> + v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
>>>>>> + y_l = __lasx_xvsrai_w(y_l, 19);
>>>>>> + y_h = __lasx_xvsrai_w(y_h, 19);
>>>>>> + u = __lasx_xvsrai_w(u, 19);
>>>>>> + v = __lasx_xvsrai_w(v, 19);
>>>>>> + u = __lasx_xvadd_w(u, headroom);
>>>>>> + v = __lasx_xvadd_w(v, headroom);
>>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
>>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
>>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
>>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
>>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
>>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
>>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
>>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
>>>>>> + }
>>>>>> + if (dstW - i >= 8) {
>>>>>> + int Y1, Y2, U, V;
>>>>>> + int i_dex = i << 1;
>>>>>> + __m256i y0_l, y0, u0, v0;
>>>>>> + __m256i y1_l, y1, u1, v1;
>>>>>> + __m256i y_l, u, v;
>>>>>> +
>>>>>> + y0 = __lasx_xvldx(buf0, i_dex);
>>>>> 
>>>>> 1. Not long ago, I tried to constify the src pointer of several asm
>>>>> functions and noticed that they produced new warnings for loongarch
>>>>> (according to patchwork:
>>>>> https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/ <https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/>),
>>>>> even though I was sure that the code is const-correct. After finding
>>>>> (via https://github.com/opencv/opencv/pull/21833 <https://github.com/opencv/opencv/pull/21833>) a toolchain
>>>>> (https://gitee.com/wenux/cross-compiler-la-on-x86 <https://gitee.com/wenux/cross-compiler-la-on-x86>) that can build the
>>>>> lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support
>>>>> at the moment; at least, my self-compiled loongarch-GCC did not support
>>>>> lsx and lasx) the issue was clear: lsxintrin.h and lasxintrin.h do not
>>>>> use const at all, even for functions that only read data (I presume the
>>>>> vl in __lsx_vldx stands for "vector load"?).
>>>>> So I sent another iteration
>>>>> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html <https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html> of
>>>>> that patchset that now added wrappers for __lsx_vldx() and
>>>>> __lasx_xvldx() and cc'ed you and some other developers from loongson to
>>>>> alert you of the issue in the hope that you fix the headers, so that my
>>>>> wrappers wouldn't need to be applied. That didn't work, as my mails
>>>>> could not be delivered to you. So I applied the patchset.
>>>>> 2. You use __lasx_xvldx() to read from a const int16_t. This will give
>>>>> new warnings unless the above issue has been fixed. Has it?
>>>>> 3. I don't know whether it has, as patchwork's fate tests don't work for
>>>>> a few days already. Given that the mails I receive from patchwork when
>>>>> it doesn't like a commit message arrive from "Patchwork
>>>>> <yinshiyou-hf@loongson.cn <mailto:yinshiyou-hf@loongson.cn>>" I presume that loongson is now somehow
>>>>> running patchwork, so you should be able to inform the right people to
>>>>> fix it.
>>>>> 4. If you fixed the const-issue, can you please make an updated
>>>>> toolchain with lsx and lasx support enabled available to us?
>>>>> 
>>>>> - Andreas
>>>>> 
>>>> 
>>>> Hi Andreas,
>>>> 
>>>> Sorry for the late reply.
>>>> This issue will be fixed by using const for v1 of __lsx_vldx, and I will update toolchain of LoongArch patchwork runner ASAP.
>>>> Thank you very much for analyzing this problem and giving suggestion. 
>>>> 
>>>> Thanks,
>>>> Shiyou
>>> 
>>> Hi Andreas,
>>> Has updated patchwork runner environment, you can revert the patchset which add wrappers for __lsx_[x]vldx.
>>> 
>> 
>> Just sent the patches:
>> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-September/301201.html <https://ffmpeg.org/pipermail/ffmpeg-devel/2022-September/301201.html>
>> Can you provide an updated toolchain (with x86 host) able to compile lsx
>> and lasx?
>> 
>> - Andreas
> 
> Compiler team haven’t release this version yet(maybe at Sep 13), you can check here :
> http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/ <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/>
> If you want to try it now, you can update attachment file first.
> 
> 
The toolchain has been uploaded to http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/ <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/>
loongson-gnu-toolchain-8.3-src-loongarch64-linux-gnu-rc1.1.tar <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/loongson-gnu-toolchain-8.3-src-loongarch64-linux-gnu-rc1.1.tar>  
loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1.tar.xz <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1.tar.xz>
loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1.tar.xz.md5 <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1.tar.xz.md5>  

Thanks,
Shiyou
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file.
  2022-09-21  3:03             ` Shiyou Yin
@ 2022-09-21 10:50               ` Andreas Rheinhardt
  0 siblings, 0 replies; 13+ messages in thread
From: Andreas Rheinhardt @ 2022-09-21 10:50 UTC (permalink / raw)
  To: Shiyou Yin, FFmpeg development discussions and patches

Shiyou Yin:
> 
> 
>> 2022年9月11日 10:06，Shiyou Yin <yinshiyou-hf@loongson.cn> 写道：
>>
>>
>>
>>> 2022年9月9日 21:11，Andreas Rheinhardt <andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com>> 写道：
>>>
>>> Shiyou Yin:
>>>>
>>>>
>>>>> 2022年9月6日 16:12，Shiyou Yin <yinshiyou-hf@loongson.cn <mailto:yinshiyou-hf@loongson.cn>> 写道：
>>>>>
>>>>>>
>>>>>> 2022年8月29日 20:30，Andreas Rheinhardt <andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com> <mailto:andreas.rheinhardt@outlook.com <mailto:andreas.rheinhardt@outlook.com>>> 写道：
>>>>>>
>>>>>> Hao Chen:
>>>>>>> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt
>>>>>>> rgb24 -y /dev/null -an
>>>>>>> before: 150fps
>>>>>>> after: 183fps
>>>>>>>
>>>>>>> Signed-off-by: Hao Chen <chenhao@loongson.cn <mailto:chenhao@loongson.cn>>
>>>>>>> ---
>>>>>>> libswscale/loongarch/Makefile | 3 +-
>>>>>>> libswscale/loongarch/output_lasx.c | 1982 +++++++++++++++++
>>>>>>> libswscale/loongarch/swscale_init_loongarch.c | 3 +
>>>>>>> libswscale/loongarch/swscale_loongarch.h | 6 +
>>>>>>> 4 files changed, 1993 insertions(+), 1 deletion(-)
>>>>>>> create mode 100644 libswscale/loongarch/output_lasx.c
>>>>>>>
>>>>
>>>>>>> +static void
>>>>>>> +yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
>>>>>>> + const int16_t *ubuf[2], const int16_t *vbuf[2],
>>>>>>> + const int16_t *abuf[2], uint8_t *dest, int dstW,
>>>>>>> + int yalpha, int uvalpha, int y,
>>>>>>> + enum AVPixelFormat target, int hasAlpha)
>>>>>>> +{
>>>>>>> + const int16_t *buf0 = buf[0], *buf1 = buf[1],
>>>>>>> + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
>>>>>>> + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
>>>>>>> + int yalpha1 = 4096 - yalpha;
>>>>>>> + int uvalpha1 = 4096 - uvalpha;
>>>>>>> + int i, count = 0;
>>>>>>> + int len = dstW - 15;
>>>>>>> + int len_count = (dstW + 1) >> 1;
>>>>>>> + const void *r, *g, *b;
>>>>>>> + int head = YUVRGB_TABLE_HEADROOM;
>>>>>>> + __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
>>>>>>> + __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
>>>>>>> + __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
>>>>>>> + __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
>>>>>>> + __m256i headroom = __lasx_xvreplgr2vr_w(head);
>>>>>>> +
>>>>>>> + for (i = 0; i < len; i += 16) {
>>>>>>> + int Y1, Y2, U, V;
>>>>>>> + int i_dex = i << 1;
>>>>>>> + int c_dex = count << 1;
>>>>>>> + __m256i y0_h, y0_l, y0, u0, v0;
>>>>>>> + __m256i y1_h, y1_l, y1, u1, v1;
>>>>>>> + __m256i y_l, y_h, u, v;
>>>>>>> +
>>>>>>> + DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
>>>>>>> + buf1, i_dex, y0, u0, v0, y1);
>>>>>>> + DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
>>>>>>> + DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
>>>>>>> + DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
>>>>>>> + DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
>>>>>>> + y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
>>>>>>> + y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
>>>>>>> + u0 = __lasx_xvmul_w(u0, v_uvalpha1);
>>>>>>> + v0 = __lasx_xvmul_w(v0, v_uvalpha1);
>>>>>>> + y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
>>>>>>> + y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
>>>>>>> + u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
>>>>>>> + v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
>>>>>>> + y_l = __lasx_xvsrai_w(y_l, 19);
>>>>>>> + y_h = __lasx_xvsrai_w(y_h, 19);
>>>>>>> + u = __lasx_xvsrai_w(u, 19);
>>>>>>> + v = __lasx_xvsrai_w(v, 19);
>>>>>>> + u = __lasx_xvadd_w(u, headroom);
>>>>>>> + v = __lasx_xvadd_w(v, headroom);
>>>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
>>>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
>>>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
>>>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
>>>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
>>>>>>> + WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
>>>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
>>>>>>> + WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
>>>>>>> + }
>>>>>>> + if (dstW - i >= 8) {
>>>>>>> + int Y1, Y2, U, V;
>>>>>>> + int i_dex = i << 1;
>>>>>>> + __m256i y0_l, y0, u0, v0;
>>>>>>> + __m256i y1_l, y1, u1, v1;
>>>>>>> + __m256i y_l, u, v;
>>>>>>> +
>>>>>>> + y0 = __lasx_xvldx(buf0, i_dex);
>>>>>>
>>>>>> 1. Not long ago, I tried to constify the src pointer of several asm
>>>>>> functions and noticed that they produced new warnings for loongarch
>>>>>> (according to patchwork:
>>>>>> https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/ <https://patchwork.ffmpeg.org/project/ffmpeg/patch/DB6PR0101MB2214178D3E6B8DCA5B86F8198F9A9@DB6PR0101MB2214.eurprd01.prod.exchangelabs.com/>),
>>>>>> even though I was sure that the code is const-correct. After finding
>>>>>> (via https://github.com/opencv/opencv/pull/21833 <https://github.com/opencv/opencv/pull/21833>) a toolchain
>>>>>> (https://gitee.com/wenux/cross-compiler-la-on-x86 <https://gitee.com/wenux/cross-compiler-la-on-x86>) that can build the
>>>>>> lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support
>>>>>> at the moment; at least, my self-compiled loongarch-GCC did not support
>>>>>> lsx and lasx) the issue was clear: lsxintrin.h and lasxintrin.h do not
>>>>>> use const at all, even for functions that only read data (I presume the
>>>>>> vl in __lsx_vldx stands for "vector load"?).
>>>>>> So I sent another iteration
>>>>>> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html <https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html> of
>>>>>> that patchset that now added wrappers for __lsx_vldx() and
>>>>>> __lasx_xvldx() and cc'ed you and some other developers from loongson to
>>>>>> alert you of the issue in the hope that you fix the headers, so that my
>>>>>> wrappers wouldn't need to be applied. That didn't work, as my mails
>>>>>> could not be delivered to you. So I applied the patchset.
>>>>>> 2. You use __lasx_xvldx() to read from a const int16_t. This will give
>>>>>> new warnings unless the above issue has been fixed. Has it?
>>>>>> 3. I don't know whether it has, as patchwork's fate tests don't work for
>>>>>> a few days already. Given that the mails I receive from patchwork when
>>>>>> it doesn't like a commit message arrive from "Patchwork
>>>>>> <yinshiyou-hf@loongson.cn <mailto:yinshiyou-hf@loongson.cn>>" I presume that loongson is now somehow
>>>>>> running patchwork, so you should be able to inform the right people to
>>>>>> fix it.
>>>>>> 4. If you fixed the const-issue, can you please make an updated
>>>>>> toolchain with lsx and lasx support enabled available to us?
>>>>>>
>>>>>> - Andreas
>>>>>>
>>>>>
>>>>> Hi Andreas,
>>>>>
>>>>> Sorry for the late reply.
>>>>> This issue will be fixed by using const for v1 of __lsx_vldx, and I will update toolchain of LoongArch patchwork runner ASAP.
>>>>> Thank you very much for analyzing this problem and giving suggestion. 
>>>>>
>>>>> Thanks,
>>>>> Shiyou
>>>>
>>>> Hi Andreas,
>>>> Has updated patchwork runner environment, you can revert the patchset which add wrappers for __lsx_[x]vldx.
>>>>
>>>
>>> Just sent the patches:
>>> https://ffmpeg.org/pipermail/ffmpeg-devel/2022-September/301201.html <https://ffmpeg.org/pipermail/ffmpeg-devel/2022-September/301201.html>
>>> Can you provide an updated toolchain (with x86 host) able to compile lsx
>>> and lasx?
>>>
>>> - Andreas
>>
>> Compiler team haven’t release this version yet(maybe at Sep 13), you can check here :
>> http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/ <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/>
>> If you want to try it now, you can update attachment file first.
>>
>>
> toolchain has been uploaded to http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/ <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/>
> loongson-gnu-toolchain-8.3-src-loongarch64-linux-gnu-rc1.1.tar <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/loongson-gnu-toolchain-8.3-src-loongarch64-linux-gnu-rc1.1.tar>  
> loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1.tar.xz <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1.tar.xz>
> loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1.tar.xz.md5 <http://ftp.loongnix.cn/toolchain/gcc/release/loongarch/gcc8/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1.tar.xz.md5>  
> 
> Thanks,
> Shiyou

Yes, I have already noticed this and have therefore already reverted my
wrapper patches. Thanks.

- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx.
  2022-08-29 11:51 ` [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx Hao Chen
@ 2022-08-31  8:42   ` Shiyou Yin
  0 siblings, 0 replies; 13+ messages in thread
From: Shiyou Yin @ 2022-08-31  8:42 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


> +
> +void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
> +                           int width, int32_t *rgb2yuv)
> +{
> +    int i;
> +    uint16_t *dstU   = (uint16_t *)_dstU;
> +    uint16_t *dstV   = (uint16_t *)_dstV;
> +    int set          = 0x4001 << (RGB2YUV_SHIFT - 7);
> +    int len          = width - 15;
> +    int32_t tem_ru   = rgb2yuv[RU_IDX], tem_gu = rgb2yuv[GU_IDX];
> +    int32_t tem_bu = rgb2yuv[BU_IDX], tem_rv   = rgb2yuv[RV_IDX];
> +    int32_t tem_gv = rgb2yuv[GV_IDX], tem_bv = rgb2yuv[BV_IDX];
> +    int shift        = RGB2YUV_SHIFT - 6;
> +    const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
> +    __m256i ru, gu, bu, rv, gv, bv;
> +    __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
> +                    0x0D0C090805040100, 0x1D1C191815141110};
> +    __m256i temp = __lasx_xvreplgr2vr_w(set);
> +    __m256i sra  = __lasx_xvreplgr2vr_w(shift);
> +
> +    ru = __lasx_xvreplgr2vr_w(tem_ru);
> +    gu = __lasx_xvreplgr2vr_w(tem_gu);
> +    bu = __lasx_xvreplgr2vr_w(tem_bu);
> +    rv = __lasx_xvreplgr2vr_w(tem_rv);
> +    gv = __lasx_xvreplgr2vr_w(tem_gv);
> +    bv = __lasx_xvreplgr2vr_w(tem_bv);
> +    for (i = 0; i < len; i += 16) {
> +        __m256i _g, _b, _r;
> +        __m256i g_l, g_h, b_l, b_h, r_l, r_h;
> +        __m256i v_l, v_h, u_l, u_h, u_lh, v_lh;
> +
> +        DUP2_ARG2(__lasx_xvld, src0 + i, 0, src1 + i, 0, _g, _b);
> +        _r   = __lasx_xvld(src2 + i, 0);

In this case, it’s more readable not to use DUP2.
The same applies to the extensions of r, g and b that follow.

> +        g_l = __lasx_vext2xv_wu_bu(_g);
> +        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
> +        _g   = __lasx_xvpermi_d(_g, 0x01);
> +        _b   = __lasx_xvpermi_d(_b, 0x01);
> +        _r   = __lasx_xvpermi_d(_r, 0x01);
> +        g_h = __lasx_vext2xv_wu_bu(_g);
> +        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_h, r_h);
> +        u_l  = __lasx_xvmadd_w(temp, ru, r_l);
> +        u_h  = __lasx_xvmadd_w(temp, ru, r_h);
> +        v_l  = __lasx_xvmadd_w(temp, rv, r_l);
> +        v_h  = __lasx_xvmadd_w(temp, rv, r_h);
> +        u_l  = __lasx_xvmadd_w(u_l, gu, g_l);
> +        u_l  = __lasx_xvmadd_w(u_l, bu, b_l);
> +        u_h  = __lasx_xvmadd_w(u_h, gu, g_h);
> +        u_h  = __lasx_xvmadd_w(u_h, bu, b_h);
> +        v_l  = __lasx_xvmadd_w(v_l, gv, g_l);
> +        v_l  = __lasx_xvmadd_w(v_l, bv, b_l);
> +        v_h  = __lasx_xvmadd_w(v_h, gv, g_h);
> +        v_h  = __lasx_xvmadd_w(v_h, bv, b_h);
> +        u_l  = __lasx_xvsra_w(u_l, sra);
> +        u_h  = __lasx_xvsra_w(u_h, sra);
> +        v_l  = __lasx_xvsra_w(v_l, sra);
> +        v_h  = __lasx_xvsra_w(v_h, sra);
> +        DUP2_ARG3(__lasx_xvshuf_b, u_h, u_l, mask, v_h, v_l, mask, u_lh, v_lh);
> +        u_lh = __lasx_xvpermi_d(u_lh, 0xD8);
> +        v_lh = __lasx_xvpermi_d(v_lh, 0xD8);
> +        __lasx_xvst(u_lh, (dstU + i), 0);
> +        __lasx_xvst(v_lh, (dstV + i), 0);
> +    }
> +    if (width - i >= 8) {
> +        __m256i _g, _b, _r;
> +        __m256i g_l, b_l, r_l;
> +        __m256i v_l, u_l, u, v;
> +
> +        _g  = __lasx_xvldrepl_d((src0 + i), 0);
> +        _b  = __lasx_xvldrepl_d((src1 + i), 0);
> +        _r  = __lasx_xvldrepl_d((src2 + i), 0);
> +        g_l = __lasx_vext2xv_wu_bu(_g);
> +        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
> +        u_l = __lasx_xvmadd_w(temp, ru, r_l);
> +        v_l = __lasx_xvmadd_w(temp, rv, r_l);
> +        u_l = __lasx_xvmadd_w(u_l, gu, g_l);
> +        u_l = __lasx_xvmadd_w(u_l, bu, b_l);
> +        v_l = __lasx_xvmadd_w(v_l, gv, g_l);
> +        v_l = __lasx_xvmadd_w(v_l, bv, b_l);
> +        u_l = __lasx_xvsra_w(u_l, sra);
> +        v_l = __lasx_xvsra_w(v_l, sra);
> +        DUP2_ARG3(__lasx_xvshuf_b, u_l, u_l, mask, v_l, v_l, mask, u, v);
> +        __lasx_xvstelm_d(u, (dstU + i), 0, 0);
> +        __lasx_xvstelm_d(u, (dstU + i), 8, 2);
> +        __lasx_xvstelm_d(v, (dstV + i), 0, 0);
> +        __lasx_xvstelm_d(v, (dstV + i), 8, 2);
> +        i += 8;
> +    }
> +    for (; i < width; i++) {
> +        int g = src[0][i];
> +        int b = src[1][i];
> +        int r = src[2][i];
> +
> +        dstU[i] = (tem_ru * r + tem_gu * g + tem_bu * b + set) >> shift;
> +        dstV[i] = (tem_rv * r + tem_gv * g + tem_bv * b + set) >> shift;
> +    }

I suggest using the following loop structure instead:
for (i = width; i >= 16; i -= 16) {}
if (i >= 8) {}
while (i--) {}

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx.
  2022-08-29 11:51 [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib Hao Chen
@ 2022-08-29 11:51 ` Hao Chen
  2022-08-31  8:42   ` Shiyou Yin
  0 siblings, 1 reply; 13+ messages in thread
From: Hao Chen @ 2022-08-29 11:51 UTC (permalink / raw)
  To: ffmpeg-devel

ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -y /dev/null -an
before: 101fps
after:  138fps

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 libswscale/loongarch/Makefile                 |   3 +
 libswscale/loongarch/input_lasx.c             | 192 ++++
 libswscale/loongarch/swscale_init_loongarch.c |  50 +
 libswscale/loongarch/swscale_lasx.c           | 959 ++++++++++++++++++
 libswscale/loongarch/swscale_loongarch.h      |  50 +
 libswscale/swscale.c                          |   2 +
 libswscale/swscale_internal.h                 |   2 +
 libswscale/utils.c                            |  13 +-
 8 files changed, 1270 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/loongarch/Makefile
 create mode 100644 libswscale/loongarch/input_lasx.c
 create mode 100644 libswscale/loongarch/swscale_init_loongarch.c
 create mode 100644 libswscale/loongarch/swscale_lasx.c
 create mode 100644 libswscale/loongarch/swscale_loongarch.h

diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
new file mode 100644
index 0000000000..586a1717b6
--- /dev/null
+++ b/libswscale/loongarch/Makefile
@@ -0,0 +1,3 @@
+OBJS-$(CONFIG_SWSCALE)      += loongarch/swscale_init_loongarch.o
+LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
+                               loongarch/input_lasx.o   \
diff --git a/libswscale/loongarch/input_lasx.c b/libswscale/loongarch/input_lasx.c
new file mode 100644
index 0000000000..6fcb998bc0
--- /dev/null
+++ b/libswscale/loongarch/input_lasx.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+                           int width, int32_t *rgb2yuv)
+{
+    int i;
+    uint16_t *dstU   = (uint16_t *)_dstU;
+    uint16_t *dstV   = (uint16_t *)_dstV;
+    int set          = 0x4001 << (RGB2YUV_SHIFT - 7);
+    int len          = width - 15;
+    int32_t tem_ru   = rgb2yuv[RU_IDX], tem_gu = rgb2yuv[GU_IDX];
+    int32_t tem_bu = rgb2yuv[BU_IDX], tem_rv   = rgb2yuv[RV_IDX];
+    int32_t tem_gv = rgb2yuv[GV_IDX], tem_bv = rgb2yuv[BV_IDX];
+    int shift        = RGB2YUV_SHIFT - 6;
+    const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
+    __m256i ru, gu, bu, rv, gv, bv;
+    __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
+                    0x0D0C090805040100, 0x1D1C191815141110};
+    __m256i temp = __lasx_xvreplgr2vr_w(set);
+    __m256i sra  = __lasx_xvreplgr2vr_w(shift);
+
+    ru = __lasx_xvreplgr2vr_w(tem_ru);
+    gu = __lasx_xvreplgr2vr_w(tem_gu);
+    bu = __lasx_xvreplgr2vr_w(tem_bu);
+    rv = __lasx_xvreplgr2vr_w(tem_rv);
+    gv = __lasx_xvreplgr2vr_w(tem_gv);
+    bv = __lasx_xvreplgr2vr_w(tem_bv);
+    for (i = 0; i < len; i += 16) {
+        __m256i _g, _b, _r;
+        __m256i g_l, g_h, b_l, b_h, r_l, r_h;
+        __m256i v_l, v_h, u_l, u_h, u_lh, v_lh;
+
+        DUP2_ARG2(__lasx_xvld, src0 + i, 0, src1 + i, 0, _g, _b);
+        _r   = __lasx_xvld(src2 + i, 0);
+        g_l = __lasx_vext2xv_wu_bu(_g);
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
+        _g   = __lasx_xvpermi_d(_g, 0x01);
+        _b   = __lasx_xvpermi_d(_b, 0x01);
+        _r   = __lasx_xvpermi_d(_r, 0x01);
+        g_h = __lasx_vext2xv_wu_bu(_g);
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_h, r_h);
+        u_l  = __lasx_xvmadd_w(temp, ru, r_l);
+        u_h  = __lasx_xvmadd_w(temp, ru, r_h);
+        v_l  = __lasx_xvmadd_w(temp, rv, r_l);
+        v_h  = __lasx_xvmadd_w(temp, rv, r_h);
+        u_l  = __lasx_xvmadd_w(u_l, gu, g_l);
+        u_l  = __lasx_xvmadd_w(u_l, bu, b_l);
+        u_h  = __lasx_xvmadd_w(u_h, gu, g_h);
+        u_h  = __lasx_xvmadd_w(u_h, bu, b_h);
+        v_l  = __lasx_xvmadd_w(v_l, gv, g_l);
+        v_l  = __lasx_xvmadd_w(v_l, bv, b_l);
+        v_h  = __lasx_xvmadd_w(v_h, gv, g_h);
+        v_h  = __lasx_xvmadd_w(v_h, bv, b_h);
+        u_l  = __lasx_xvsra_w(u_l, sra);
+        u_h  = __lasx_xvsra_w(u_h, sra);
+        v_l  = __lasx_xvsra_w(v_l, sra);
+        v_h  = __lasx_xvsra_w(v_h, sra);
+        DUP2_ARG3(__lasx_xvshuf_b, u_h, u_l, mask, v_h, v_l, mask, u_lh, v_lh);
+        u_lh = __lasx_xvpermi_d(u_lh, 0xD8);
+        v_lh = __lasx_xvpermi_d(v_lh, 0xD8);
+        __lasx_xvst(u_lh, (dstU + i), 0);
+        __lasx_xvst(v_lh, (dstV + i), 0);
+    }
+    if (width - i >= 8) {
+        __m256i _g, _b, _r;
+        __m256i g_l, b_l, r_l;
+        __m256i v_l, u_l, u, v;
+
+        _g  = __lasx_xvldrepl_d((src0 + i), 0);
+        _b  = __lasx_xvldrepl_d((src1 + i), 0);
+        _r  = __lasx_xvldrepl_d((src2 + i), 0);
+        g_l = __lasx_vext2xv_wu_bu(_g);
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
+        u_l = __lasx_xvmadd_w(temp, ru, r_l);
+        v_l = __lasx_xvmadd_w(temp, rv, r_l);
+        u_l = __lasx_xvmadd_w(u_l, gu, g_l);
+        u_l = __lasx_xvmadd_w(u_l, bu, b_l);
+        v_l = __lasx_xvmadd_w(v_l, gv, g_l);
+        v_l = __lasx_xvmadd_w(v_l, bv, b_l);
+        u_l = __lasx_xvsra_w(u_l, sra);
+        v_l = __lasx_xvsra_w(v_l, sra);
+        DUP2_ARG3(__lasx_xvshuf_b, u_l, u_l, mask, v_l, v_l, mask, u, v);
+        __lasx_xvstelm_d(u, (dstU + i), 0, 0);
+        __lasx_xvstelm_d(u, (dstU + i), 8, 2);
+        __lasx_xvstelm_d(v, (dstV + i), 0, 0);
+        __lasx_xvstelm_d(v, (dstV + i), 8, 2);
+        i += 8;
+    }
+    for (; i < width; i++) {
+        int g = src[0][i];
+        int b = src[1][i];
+        int r = src[2][i];
+
+        dstU[i] = (tem_ru * r + tem_gu * g + tem_bu * b + set) >> shift;
+        dstV[i] = (tem_rv * r + tem_gv * g + tem_bv * b + set) >> shift;
+    }
+}
+
+void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
+                          int32_t *rgb2yuv)
+{
+    int i;
+    int shift        = (RGB2YUV_SHIFT - 6);
+    int set          = 0x801 << (RGB2YUV_SHIFT - 7);
+    int len          = width - 15;
+    uint16_t *dst    = (uint16_t *)_dst;
+    int32_t tem_ry   = rgb2yuv[RY_IDX], tem_gy = rgb2yuv[GY_IDX];
+    int32_t tem_by   = rgb2yuv[BY_IDX];
+    const uint8_t *src0 = src[0], *src1 = src[1], *src2 = src[2];
+    __m256i mask = {0x0D0C090805040100, 0x1D1C191815141110,
+                    0x0D0C090805040100, 0x1D1C191815141110};
+    __m256i temp = __lasx_xvreplgr2vr_w(set);
+    __m256i sra  = __lasx_xvreplgr2vr_w(shift);
+    __m256i ry   = __lasx_xvreplgr2vr_w(tem_ry);
+    __m256i gy   = __lasx_xvreplgr2vr_w(tem_gy);
+    __m256i by   = __lasx_xvreplgr2vr_w(tem_by);
+
+    for (i = 0; i < len; i += 16) {
+        __m256i _g, _b, _r;
+        __m256i g_l, g_h, b_l, b_h, r_l, r_h;
+        __m256i y_l, y_h, y_lh;
+
+        DUP2_ARG2(__lasx_xvld, src0 + i, 0, src1 + i, 0, _g, _b);
+        _r   = __lasx_xvld(src2 + i, 0);
+        g_l = __lasx_vext2xv_wu_bu(_g);
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
+        _g   = __lasx_xvpermi_d(_g, 0x01);
+        _b   = __lasx_xvpermi_d(_b, 0x01);
+        _r   = __lasx_xvpermi_d(_r, 0x01);
+        g_h  = __lasx_vext2xv_wu_bu(_g);
+        DUP2_ARG1(__lasx_vext2xv_wu_bu,_b, _r, b_h, r_h);
+        y_l  = __lasx_xvmadd_w(temp, ry, r_l);
+        y_h  = __lasx_xvmadd_w(temp, ry, r_h);
+        y_l  = __lasx_xvmadd_w(y_l, gy, g_l);
+        y_l  = __lasx_xvmadd_w(y_l, by, b_l);
+        y_h  = __lasx_xvmadd_w(y_h, gy, g_h);
+        y_h  = __lasx_xvmadd_w(y_h, by, b_h);
+        y_l  = __lasx_xvsra_w(y_l, sra);
+        y_h  = __lasx_xvsra_w(y_h, sra);
+        y_lh = __lasx_xvshuf_b(y_h, y_l, mask);
+        y_lh = __lasx_xvpermi_d(y_lh, 0xD8);
+        __lasx_xvst(y_lh, (dst + i), 0);
+    }
+    if (width - i >= 8) {
+        __m256i _g, _b, _r;
+        __m256i g_l, b_l, r_l;
+        __m256i y_l, y;
+
+        _g  = __lasx_xvldrepl_d((src0 + i), 0);
+        _b  = __lasx_xvldrepl_d((src1 + i), 0);
+        _r  = __lasx_xvldrepl_d((src2 + i), 0);
+        g_l = __lasx_vext2xv_wu_bu(_g);
+        DUP2_ARG1(__lasx_vext2xv_wu_bu, _b, _r, b_l, r_l);
+        y_l = __lasx_xvmadd_w(temp, ry, r_l);
+        y_l = __lasx_xvmadd_w(y_l, gy, g_l);
+        y_l = __lasx_xvmadd_w(y_l, by, b_l);
+        y_l = __lasx_xvsra_w(y_l, sra);
+        y = __lasx_xvshuf_b(y_l, y_l, mask);
+        __lasx_xvstelm_d(y, (dst + i), 0, 0);
+        __lasx_xvstelm_d(y, (dst + i), 8, 2);
+        i += 8;
+    }
+    for (; i < width; i++) {
+        int g = src[0][i];
+        int b = src[1][i];
+        int r = src[2][i];
+
+        dst[i] = (tem_ry * r + tem_gy * g + tem_by * b + set) >> shift;
+    }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
new file mode 100644
index 0000000000..197dc6e1e7
--- /dev/null
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/loongarch/cpu.h"
+
+av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (have_lasx(cpu_flags)) {
+        if (c->srcBpc == 8) {
+            if (c->dstBpc <= 14) {
+                c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
+            } else {
+                c->hyScale = c->hcScale = ff_hscale_8_to_19_lasx;
+            }
+        } else {
+            c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lasx
+                                                     : ff_hscale_16_to_15_lasx;
+        }
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_GBRAP:
+        case AV_PIX_FMT_GBRP:
+            {
+                c->readChrPlanar = planar_rgb_to_uv_lasx;
+                c->readLumPlanar = planar_rgb_to_y_lasx;
+            }
+            break;
+        }
+    }
+}
diff --git a/libswscale/loongarch/swscale_lasx.c b/libswscale/loongarch/swscale_lasx.c
new file mode 100644
index 0000000000..53bfdf7e5c
--- /dev/null
+++ b/libswscale/loongarch/swscale_lasx.c
@@ -0,0 +1,959 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "libavutil/intreadwrite.h"
+
+#define SCALE_8_16(_sh)                                               \
+{                                                                     \
+    src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
+    src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);               \
+    src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);               \
+    src4    = __lasx_xvldrepl_d(src + filterPos[4], 0);               \
+    src5    = __lasx_xvldrepl_d(src + filterPos[5], 0);               \
+    src6    = __lasx_xvldrepl_d(src + filterPos[6], 0);               \
+    src7    = __lasx_xvldrepl_d(src + filterPos[7], 0);               \
+    src8    = __lasx_xvldrepl_d(src + filterPos[8], 0);               \
+    src9    = __lasx_xvldrepl_d(src + filterPos[9], 0);               \
+    src10   = __lasx_xvldrepl_d(src + filterPos[10], 0);              \
+    src11   = __lasx_xvldrepl_d(src + filterPos[11], 0);              \
+    src12   = __lasx_xvldrepl_d(src + filterPos[12], 0);              \
+    src13   = __lasx_xvldrepl_d(src + filterPos[13], 0);              \
+    src14   = __lasx_xvldrepl_d(src + filterPos[14], 0);              \
+    src15   = __lasx_xvldrepl_d(src + filterPos[15], 0);              \
+    DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
+              filter, 96, filter0, filter1, filter2, filter3);        \
+    DUP4_ARG2(__lasx_xvld, filter, 128, filter, 160,                  \
+              filter, 192, filter, 224, filter4,                      \
+              filter5, filter6, filter7);                             \
+    DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2,                \
+              src5, src4, src7, src6, src0, src2, src4, src6);        \
+    DUP4_ARG2(__lasx_xvilvl_d, src9, src8, src11, src10,              \
+              src13, src12, src15, src14, src8, src10, src12, src14); \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6,           \
+              src0, src2, src4, src6);                                \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, src8, src10, src12,               \
+              src14, src8, src10, src12, src14);                      \
+    DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2,         \
+              filter2, src4, filter3, src6, src0, src1, src2, src3);  \
+    DUP4_ARG2(__lasx_xvdp2_w_h, filter4, src8, filter5, src10,        \
+              filter6, src12, filter7, src14, src4, src5, src6, src7);\
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
+    src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
+    src4 = __lasx_xvhaddw_d_w(src4, src4);                            \
+    src5 = __lasx_xvhaddw_d_w(src5, src5);                            \
+    src6 = __lasx_xvhaddw_d_w(src6, src6);                            \
+    src7 = __lasx_xvhaddw_d_w(src7, src7);                            \
+    DUP4_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2,              \
+              src5, src4, src7, src6, src0, src1, src2, src3);        \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
+    src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
+    DUP2_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, src0, src1); \
+    DUP2_ARG2(__lasx_xvsrai_w, src0, _sh, src1, _sh, src0, src1);     \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    src1 = __lasx_xvmin_w(src1, vmax);                                \
+    src0 = __lasx_xvperm_w(src0, shuf);                               \
+    src1 = __lasx_xvperm_w(src1, shuf);                               \
+    src0 = __lasx_xvpickev_h(src1, src0);                             \
+    src0 = __lasx_xvpermi_d(src0, 0xd8);                              \
+    __lasx_xvst(src0, dst, 0);                                        \
+    filterPos += 16;                                                  \
+    filter    += 128;                                                 \
+    dst       += 16;                                                  \
+}
+
+#define SCALE_8_8(_sh)                                                \
+{                                                                     \
+    src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
+    src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);               \
+    src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);               \
+    src4    = __lasx_xvldrepl_d(src + filterPos[4], 0);               \
+    src5    = __lasx_xvldrepl_d(src + filterPos[5], 0);               \
+    src6    = __lasx_xvldrepl_d(src + filterPos[6], 0);               \
+    src7    = __lasx_xvldrepl_d(src + filterPos[7], 0);               \
+    DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
+              filter, 96, filter0, filter1, filter2, filter3);        \
+    DUP4_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2,                \
+              src5, src4, src7, src6, src0, src2, src4, src6);        \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src4, src6,           \
+              src0, src2, src4, src6);                                \
+    DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2,         \
+              filter2, src4, filter3, src6, src0, src1, src2,src3);   \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
+    src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
+    DUP2_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, src0, src1); \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src0 = __lasx_xvpickev_w(src1, src0);                             \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    src0 = __lasx_xvperm_w(src0, shuf);                               \
+}
+
+#define SCALE_8_4(_sh)                                                    \
+{                                                                         \
+    src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);                   \
+    src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);                   \
+    src2    = __lasx_xvldrepl_d(src + filterPos[2], 0);                   \
+    src3    = __lasx_xvldrepl_d(src + filterPos[3], 0);                   \
+    DUP2_ARG2(__lasx_xvld, filter, 0, filter, 32, filter0, filter1);      \
+    DUP2_ARG2(__lasx_xvilvl_d, src1, src0, src3, src2, src0, src2);       \
+    DUP2_ARG1(__lasx_vext2xv_hu_bu, src0, src2, src0, src2);              \
+    DUP2_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src2, src0, src1);\
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                                \
+    src0 = __lasx_xvpickev_w(src1, src0);                                 \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
+    src0 = __lasx_xvpickev_w(src0, src0);                                 \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                    \
+    src0 = __lasx_xvmin_w(src0, vmax);                                    \
+    src0 = __lasx_xvperm_w(src0, shuf);                                   \
+}
+
+#define SCALE_8_2(_sh)                                                \
+{                                                                     \
+    src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_d(src + filterPos[1], 0);               \
+    filter0 = __lasx_xvld(filter, 0);                                 \
+    src0 = __lasx_xvilvl_d(src1, src0);                               \
+    src0 = __lasx_vext2xv_hu_bu(src0);                                \
+    src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src0 = __lasx_xvhaddw_q_d(src0, src0);                            \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    dst[0] = __lasx_xvpickve2gr_w(src0, 0);                           \
+    dst[1] = __lasx_xvpickve2gr_w(src0, 4);                           \
+    filterPos += 2;                                                   \
+    filter    += 16;                                                  \
+    dst       += 2;                                                   \
+}
+
+#define SCALE_4_16(_sh)                                               \
+{                                                                     \
+    src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
+    src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);               \
+    src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);               \
+    src4    = __lasx_xvldrepl_w(src + filterPos[4], 0);               \
+    src5    = __lasx_xvldrepl_w(src + filterPos[5], 0);               \
+    src6    = __lasx_xvldrepl_w(src + filterPos[6], 0);               \
+    src7    = __lasx_xvldrepl_w(src + filterPos[7], 0);               \
+    src8    = __lasx_xvldrepl_w(src + filterPos[8], 0);               \
+    src9    = __lasx_xvldrepl_w(src + filterPos[9], 0);               \
+    src10   = __lasx_xvldrepl_w(src + filterPos[10], 0);              \
+    src11   = __lasx_xvldrepl_w(src + filterPos[11], 0);              \
+    src12   = __lasx_xvldrepl_w(src + filterPos[12], 0);              \
+    src13   = __lasx_xvldrepl_w(src + filterPos[13], 0);              \
+    src14   = __lasx_xvldrepl_w(src + filterPos[14], 0);              \
+    src15   = __lasx_xvldrepl_w(src + filterPos[15], 0);              \
+    DUP4_ARG2(__lasx_xvld, filter, 0, filter, 32, filter, 64,         \
+              filter, 96, filter0, filter1, filter2, filter3);        \
+    DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5,          \
+              src4, src7, src6, src0, src2, src4, src6);              \
+    DUP4_ARG2(__lasx_xvilvl_w, src9, src8, src11, src10, src13,       \
+              src12, src15, src14, src8, src10, src12, src14);        \
+    DUP4_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src10,         \
+                   src8, src14, src12, src0, src1, src2, src3);       \
+    DUP4_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src2, src3,           \
+              src0, src1, src2, src3);                                \
+    DUP4_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1,         \
+              filter2, src2, filter3, src3, src0, src1, src2, src3);  \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                            \
+    src2 = __lasx_xvhaddw_d_w(src2, src2);                            \
+    src3 = __lasx_xvhaddw_d_w(src3, src3);                            \
+    DUP2_ARG2(__lasx_xvpickev_w, src1, src0, src3, src2, src0, src1); \
+    DUP2_ARG2(__lasx_xvsrai_w, src0, _sh, src1, _sh, src0, src1);     \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    src1 = __lasx_xvmin_w(src1, vmax);                                \
+    src0 = __lasx_xvpickev_h(src1, src0);                             \
+    src0 = __lasx_xvperm_w(src0, shuf);                               \
+    __lasx_xvst(src0, dst, 0);                                        \
+    filterPos += 16;                                                  \
+    filter    += 64;                                                  \
+    dst       += 16;                                                  \
+}
+
+/*
+ * SCALE_4_8(_sh): one 4-tap filter step producing 8 results from 8-bit input.
+ *
+ * Names taken from the expanding scope: src, filterPos, filter, src0..src7,
+ * filter0, filter1 and vmax.
+ *
+ * One 32-bit group of source bytes is loaded for each of filterPos[0..7],
+ * together with 2 x 32 bytes of coefficients (filter + 0 and filter + 32
+ * bytes). After the dot products each 32-bit result is shifted right by
+ * _sh (arithmetic shift) and limited from above by vmax.
+ *
+ * The result is left in src0. Nothing is stored, and filterPos, filter and
+ * dst are NOT advanced here; the caller does both. src1 still holds an
+ * intermediate, unshifted sum on exit (ff_hscale_8_to_15_lasx reuses it as
+ * the second operand of its pack step).
+ * NOTE(review): the 8 results are presumably not in linear order inside
+ * src0 -- both callers in this file run __lasx_xvperm_w before storing.
+ * Confirm the exact element order against the LASX manual.
+ */
+#define SCALE_4_8(_sh)                                                    \
+{                                                                         \
+    src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);                   \
+    src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);                   \
+    src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);                   \
+    src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);                   \
+    src4    = __lasx_xvldrepl_w(src + filterPos[4], 0);                   \
+    src5    = __lasx_xvldrepl_w(src + filterPos[5], 0);                   \
+    src6    = __lasx_xvldrepl_w(src + filterPos[6], 0);                   \
+    src7    = __lasx_xvldrepl_w(src + filterPos[7], 0);                   \
+    DUP2_ARG2(__lasx_xvld, filter, 0, filter, 32, filter0, filter1);      \
+    DUP4_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src5,              \
+              src4, src7, src6, src0, src2, src4, src6);                  \
+    DUP2_ARG2(__lasx_xvilvl_d, src2, src0, src6, src4, src0, src1);       \
+                                                                          \
+    DUP2_ARG1(__lasx_vext2xv_hu_bu, src0, src1, src0, src1);              \
+    DUP2_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1, src0, src1);\
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                                \
+    src1 = __lasx_xvhaddw_d_w(src1, src1);                                \
+    src0 = __lasx_xvpickev_w(src1, src0);                                 \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                    \
+    src0 = __lasx_xvmin_w(src0, vmax);                                    \
+}
+
+/*
+ * SCALE_4_4(_sh): one 4-tap filter step producing 4 results from 8-bit input.
+ *
+ * Names taken from the expanding scope: src, filterPos, filter, src0..src3,
+ * filter0 and vmax.
+ *
+ * Loads one 32-bit group of source bytes for each of filterPos[0..3] and
+ * 32 bytes of coefficients, forms the dot products, shifts each 32-bit
+ * result right by _sh (arithmetic shift) and limits it from above by vmax.
+ * The final pick/permute pair rearranges the results inside src0.
+ *
+ * The result is left in src0. Nothing is stored, and filterPos, filter and
+ * dst are NOT advanced here; the caller does both.
+ * NOTE(review): the callers in this file store the low 64-bit elements of
+ * src0 directly (32-bit output) or after __lasx_xvpickev_h (16-bit output),
+ * so the four results are presumably in order in the low part of src0 --
+ * confirm against the LASX manual.
+ */
+#define SCALE_4_4(_sh)                                                \
+{                                                                     \
+    src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
+    src2    = __lasx_xvldrepl_w(src + filterPos[2], 0);               \
+    src3    = __lasx_xvldrepl_w(src + filterPos[3], 0);               \
+    filter0 = __lasx_xvld(filter, 0);                                 \
+    DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);   \
+                                                                      \
+    src0 = __lasx_xvilvl_d(src1, src0);                               \
+    src0 = __lasx_vext2xv_hu_bu(src0);                                \
+    src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    src0 = __lasx_xvpickev_w(src0, src0);                             \
+    src0 = __lasx_xvpermi_d(src0, 0xd8);                              \
+}
+
+/*
+ * SCALE_4_2(_sh): one 4-tap filter step producing 2 results from 8-bit input.
+ *
+ * Names taken from the expanding scope: src, filterPos, filter, dst, src0,
+ * src1, filter0 and vmax.
+ *
+ * Unlike SCALE_4_8 / SCALE_4_4 this macro is self-contained: it writes
+ * dst[0] and dst[1] (taken from 32-bit elements 0 and 2 of the result
+ * vector) and then advances filterPos by 2, filter by 8 coefficients and
+ * dst by 2 elements.
+ * The assignment to dst[] converts to whatever element type dst has in the
+ * expanding function (int16_t or int32_t in the callers in this file).
+ * NOTE(review): __lasx_xvld(filter, 0) reads a full 32-byte vector although
+ * only 8 coefficients (16 bytes) belong to these two outputs -- this
+ * presumably relies on padding of the filter buffer; verify with the caller.
+ */
+#define SCALE_4_2(_sh)                                                \
+{                                                                     \
+    src0    = __lasx_xvldrepl_w(src + filterPos[0], 0);               \
+    src1    = __lasx_xvldrepl_w(src + filterPos[1], 0);               \
+    filter0 = __lasx_xvld(filter, 0);                                 \
+    src0 = __lasx_xvilvl_w(src1, src0);                               \
+    src0 = __lasx_vext2xv_hu_bu(src0);                                \
+    src0 = __lasx_xvdp2_w_h(filter0, src0);                           \
+    src0 = __lasx_xvhaddw_d_w(src0, src0);                            \
+    src0 = __lasx_xvsrai_w(src0, _sh);                                \
+    src0 = __lasx_xvmin_w(src0, vmax);                                \
+    dst[0] = __lasx_xvpickve2gr_w(src0, 0);                           \
+    dst[1] = __lasx_xvpickve2gr_w(src0, 2);                           \
+    filterPos += 2;                                                   \
+    filter    += 8;                                                   \
+    dst       += 2;                                                   \
+}
+
+/*
+ * SCALE_16 (8-bit source variant): accumulate 8 taps for 4 output samples.
+ *
+ * Used inside a "for (j = 0; ...; j += 8)" loop by the filterSize > 8 paths
+ * of ff_hscale_8_to_15_lasx and ff_hscale_8_to_19_lasx.
+ *
+ * Names taken from the expanding scope: j, srcPos1..srcPos4,
+ * filterStart1..filterStart4, src0..src3, filter0..filter3, out0, out1,
+ * out and zero.
+ *
+ * For each of the four outputs it loads 8 source bytes at srcPosN + j and
+ * the coefficients at filterStartN, offset dex = j << 1 (the int16_t
+ * coefficients are two bytes each, so dex is presumably the byte offset of
+ * coefficient j). The source bytes are widened by interleaving with zero,
+ * multiplied with the coefficients, reduced horizontally and added to the
+ * running accumulator out.
+ *
+ * The callers read the four sums from 32-bit elements 0, 4, 2 and 6 of out
+ * (outputs 1, 2, 3 and 4 respectively).
+ * This definition is #undef'd after the 8-bit functions and redefined for
+ * 16-bit sources.
+ */
+#define SCALE_16                                                      \
+{                                                                     \
+    int dex  = j << 1;                                                \
+    src0     = __lasx_xvldrepl_d((srcPos1 + j), 0);                   \
+    src1     = __lasx_xvldrepl_d((srcPos2 + j), 0);                   \
+    src2     = __lasx_xvldrepl_d((srcPos3 + j), 0);                   \
+    src3     = __lasx_xvldrepl_d((srcPos4 + j), 0);                   \
+    DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex,     \
+              filterStart3, dex, filterStart4, dex, filter0,          \
+              filter1, filter2, filter3);                             \
+    src0     = __lasx_xvpermi_q(src0, src1, 0x02);                    \
+    src1     = __lasx_xvpermi_q(src2, src3, 0x02);                    \
+    filter0  = __lasx_xvpermi_q(filter0, filter1, 0x02);              \
+    filter1  = __lasx_xvpermi_q(filter2, filter3, 0x02);              \
+    DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0, src1);   \
+    DUP2_ARG2(__lasx_xvdp2_w_h, filter0, src0, filter1, src1,         \
+              out0, out1);                                            \
+    src0     = __lasx_xvhaddw_d_w(out0, out0);                        \
+    src1     = __lasx_xvhaddw_d_w(out1, out1);                        \
+    out0     = __lasx_xvpackev_d(src1, src0);                         \
+    out1     = __lasx_xvpackod_d(src1, src0);                         \
+    out0     = __lasx_xvadd_w(out0, out1);                            \
+    out      = __lasx_xvadd_w(out, out0);                             \
+}
+
+/**
+ * Horizontally scale 8-bit samples into 15-bit int16_t intermediates (LASX).
+ *
+ * The scalar fallback at the end of the function defines the result:
+ *     dst[i] = FFMIN((sum_j src[filterPos[i] + j] * filter[i * filterSize + j])
+ *                    >> 7, (1 << 15) - 1)
+ * The filterSize == 8, == 4 and > 8 branches are vectorised versions that
+ * are intended to produce the same values.
+ *
+ * @param c          scaler context; not referenced directly in this body
+ * @param dst        output buffer, dstW int16_t values are written
+ * @param dstW       number of output samples
+ * @param src        8-bit input samples
+ * @param filter     coefficients, filterSize consecutive int16_t per output
+ * @param filterPos  for each output, index of its first input sample in src
+ * @param filterSize number of taps per output sample
+ *
+ * NOTE(review): several vector loads read a full register even when fewer
+ * bytes are needed for the last outputs; this presumably relies on padding
+ * of src/filter provided by the caller -- confirm.
+ */
+void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{
+    int i;
+    int max = (1 << 15) - 1;
+
+    if (filterSize == 8) {
+        /* The vector locals below are referenced by name from the SCALE_8_*
+         * macros (defined earlier in this file), so their names must not
+         * change. */
+        __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+        __m256i src8, src9, src10, src11, src12, src13, src14, src15;
+        __m256i filter0, filter1, filter2, filter3;
+        __m256i filter4, filter5, filter6, filter7;
+        __m256i vmax = __lasx_xvreplgr2vr_w(max);
+        __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+                        0x0000000600000002, 0x0000000700000003};
+        /* Main loop handles 16 outputs per pass; the remainder (0..15) is
+         * consumed bit by bit below: 8, then 4, then 2, then 1. */
+        int len = dstW >> 4;
+        int res = dstW & 15;
+        while (len--) {
+            /* The loop body is the macro alone, so SCALE_8_16 presumably
+             * stores its results and advances dst/filter/filterPos itself. */
+            SCALE_8_16(7);
+        }
+        if (res & 8) {
+            SCALE_8_8(7);
+            /* Narrow the 32-bit results to 16 bit and store 2 x 4 values. */
+            src0 = __lasx_xvpickev_h(src0, src0);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            __lasx_xvstelm_d(src0, dst, 8, 2);
+            filterPos += 8;
+            filter    += 64;
+            dst       += 8;
+        }
+        if (res & 4) {
+            SCALE_8_4(7);
+            src0 = __lasx_xvpickev_h(src0, src0);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            filterPos += 4;
+            filter    += 32;
+            dst       += 4;
+        }
+        if (res & 2) {
+            /* Nothing follows the macro here, so SCALE_8_2 presumably stores
+             * and advances the pointers itself (as SCALE_4_2 does). */
+            SCALE_8_2(7);
+        }
+        if (res & 1) {
+            /* Last single output: 8 source bytes times 8 coefficients,
+             * reduced horizontally to one 32-bit sum. */
+            int val = 0;
+            src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);
+            filter0 = __lasx_xvld(filter, 0);
+            src0 = __lasx_vext2xv_hu_bu(src0);
+            src0 = __lasx_xvdp2_w_h(filter0, src0);
+            src0    = __lasx_xvhaddw_d_w(src0, src0);
+            src0    = __lasx_xvhaddw_q_d(src0, src0);
+            val     = __lasx_xvpickve2gr_w(src0, 0);
+            dst[0]  = FFMIN(val >> 7, max);
+        }
+    } else if (filterSize == 4) {
+        /* Same structure as the 8-tap branch, with the SCALE_4_* macros.
+         * The locals are again referenced by name from those macros. */
+        __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+        __m256i src8, src9, src10, src11, src12, src13, src14, src15;
+        __m256i filter0, filter1, filter2, filter3;
+        __m256i vmax = __lasx_xvreplgr2vr_w(max);
+        __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+                        0x0000000600000002, 0x0000000700000003};
+        int len = dstW >> 4;
+        int res = dstW & 15;
+        while (len--) {
+            SCALE_4_16(7);
+        }
+        if (res & 8) {
+            SCALE_4_8(7);
+            /* SCALE_4_8 leaves the results in src0 (src1 is a leftover
+             * intermediate); pack to 16 bit, reorder with shuf and store
+             * the two low 64-bit elements, i.e. 8 int16_t values. */
+            src0 = __lasx_xvpickev_h(src1, src0);
+            src0 = __lasx_xvperm_w(src0, shuf);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            __lasx_xvstelm_d(src0, dst, 8, 1);
+            filterPos += 8;
+            filter    += 32;
+            dst       += 8;
+        }
+        if (res & 4) {
+            SCALE_4_4(7);
+            src0 = __lasx_xvpickev_h(src0, src0);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            filterPos += 4;
+            filter    += 16;
+            dst       += 4;
+        }
+        if (res & 2) {
+            /* SCALE_4_2 stores dst[0..1] and advances the pointers itself. */
+            SCALE_4_2(7);
+        }
+        if (res & 1) {
+            /* Last single output is computed with plain scalar code. */
+            int val = 0;
+            const uint8_t *srcPos = src + filterPos[0];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[0] = FFMIN(val >> 7, max);
+        }
+    } else if (filterSize > 8) {
+        /* j < filterSize - 7 is the same as j + 8 <= filterSize, i.e. the
+         * vector loops only run while 8 whole taps remain; the leftover
+         * taps are handled by scalar loops. */
+        int filterlen = filterSize - 7;
+        int len = dstW >> 2;
+        int res = dstW & 3;
+        __m256i zero = __lasx_xvldi(0);
+
+        /* Four outputs per pass. */
+        while (len--) {
+            /* Locals used by name inside SCALE_16. */
+            __m256i src0, src1, src2, src3;
+            __m256i filter0, filter1, filter2, filter3, out0, out1;
+            __m256i out = zero;
+            const uint8_t *srcPos1 = src + filterPos[0];
+            const uint8_t *srcPos2 = src + filterPos[1];
+            const uint8_t *srcPos3 = src + filterPos[2];
+            const uint8_t *srcPos4 = src + filterPos[3];
+            const int16_t *filterStart1 = filter;
+            const int16_t *filterStart2 = filterStart1 + filterSize;
+            const int16_t *filterStart3 = filterStart2 + filterSize;
+            const int16_t *filterStart4 = filterStart3 + filterSize;
+            int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                SCALE_16
+            }
+            /* The sums for outputs 1..4 sit in 32-bit elements 0, 4, 2, 6
+             * of the accumulator (note the 4/2 swap). */
+            val1 = __lasx_xvpickve2gr_w(out, 0);
+            val2 = __lasx_xvpickve2gr_w(out, 4);
+            val3 = __lasx_xvpickve2gr_w(out, 2);
+            val4 = __lasx_xvpickve2gr_w(out, 6);
+            /* Remaining (filterSize % 8) taps, scalar; j continues where
+             * the vector loop stopped. */
+            for (; j < filterSize; j++) {
+                val1 += ((int)srcPos1[j]) * filterStart1[j];
+                val2 += ((int)srcPos2[j]) * filterStart2[j];
+                val3 += ((int)srcPos3[j]) * filterStart3[j];
+                val4 += ((int)srcPos4[j]) * filterStart4[j];
+            }
+            dst[0] = FFMIN(val1 >> 7, max);
+            dst[1] = FFMIN(val2 >> 7, max);
+            dst[2] = FFMIN(val3 >> 7, max);
+            dst[3] = FFMIN(val4 >> 7, max);
+            dst       += 4;
+            filterPos += 4;
+            filter     = filterStart4 + filterSize;
+        }
+        /* Up to 3 leftover outputs, one at a time. dst and filterPos are
+         * indexed with i here, filter is advanced per output. */
+        for(i = 0; i < res; i++) {
+            int j, val = 0;
+            const uint8_t *srcPos = src + filterPos[i];
+            __m256i src1, filter0, out0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                src1   = __lasx_xvldrepl_d((srcPos + j), 0);
+                filter0 = __lasx_xvld(filter + j, 0);
+                src1 = __lasx_xvilvl_b(zero, src1);
+                out0 = __lasx_xvdp2_w_h(filter0, src1);
+                out0 = __lasx_xvhaddw_d_w(out0, out0);
+                out0 = __lasx_xvhaddw_q_d(out0, out0);
+                val += __lasx_xvpickve2gr_w(out0, 0);
+            }
+            for (; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> 7, max);
+            filter += filterSize;
+        }
+    } else {
+        /* Any other filter size (below 8 and not 4): scalar reference loop. */
+        for (i = 0; i < dstW; i++) {
+            int val = 0;
+            const uint8_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> 7, max);
+            filter += filterSize;
+        }
+    }
+}
+
+/**
+ * Horizontally scale 8-bit samples into 19-bit int32_t intermediates (LASX).
+ *
+ * The scalar fallback at the end of the function defines the result:
+ *     dst[i] = FFMIN((sum_j src[filterPos[i] + j] * filter[i * filterSize + j])
+ *                    >> 3, (1 << 19) - 1)
+ * The filterSize == 8, == 4 and > 8 branches are vectorised versions that
+ * are intended to produce the same values.
+ *
+ * @param c          scaler context; not referenced directly in this body
+ * @param _dst       output buffer; declared int16_t * but written through
+ *                   an int32_t * cast, so it must hold dstW int32_t values
+ * @param dstW       number of output samples
+ * @param src        8-bit input samples
+ * @param filter     coefficients, filterSize consecutive int16_t per output
+ * @param filterPos  for each output, index of its first input sample in src
+ * @param filterSize number of taps per output sample
+ */
+void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize)
+{
+    int i;
+    int max = (1 << 19) - 1;
+    int32_t *dst = (int32_t *) _dst;
+
+    if (filterSize == 8) {
+        /* Locals are referenced by name from the SCALE_8_* macros (defined
+         * earlier in this file). */
+        __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+        __m256i filter0, filter1, filter2, filter3;
+        __m256i vmax = __lasx_xvreplgr2vr_w(max);
+        __m256i shuf = {0x0000000400000000, 0x0000000500000001,
+                        0x0000000600000002, 0x0000000700000003};
+        /* 8 outputs per pass (one full vector of int32_t); the remainder
+         * (0..7) is consumed as 4, then 2, then 1. */
+        int len = dstW >> 3;
+        int res = dstW & 7;
+        while (len--) {
+            SCALE_8_8(3);
+            __lasx_xvst(src0, dst, 0);
+            filterPos += 8;
+            filter    += 64;
+            dst       += 8;
+        }
+        if (res & 4) {
+            SCALE_8_4(3);
+            /* Store the two low 64-bit elements: 4 int32_t values. */
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            __lasx_xvstelm_d(src0, dst, 8, 1);
+            filterPos += 4;
+            filter    += 32;
+            dst       += 4;
+        }
+        if (res & 2) {
+            /* Nothing follows the macro here, so SCALE_8_2 presumably stores
+             * and advances the pointers itself (as SCALE_4_2 does). */
+            SCALE_8_2(3);
+        }
+        if (res & 1) {
+            /* Last single output; these declarations shadow the outer
+             * src0/filter0 of this branch. */
+            int val = 0;
+            __m256i src0, filter0, out0;
+
+            src0    = __lasx_xvldrepl_d(src + filterPos[0], 0);
+            filter0 = __lasx_xvld(filter, 0);
+            src0 = __lasx_vext2xv_hu_bu(src0);
+            out0 = __lasx_xvdp2_w_h(filter0, src0);
+            out0    = __lasx_xvhaddw_d_w(out0, out0);
+            out0    = __lasx_xvhaddw_q_d(out0, out0);
+            val     = __lasx_xvpickve2gr_w(out0, 0);
+            dst[0]  = FFMIN(val >> 3, max);
+        }
+    } else if (filterSize == 4) {
+        /* Locals are referenced by name from the SCALE_4_* macros. */
+        __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+        __m256i filter0, filter1;
+        __m256i vmax = __lasx_xvreplgr2vr_w(max);
+        /* Word permutation applied to the SCALE_4_8 result before storing;
+         * note that it differs from the shuf constant of the 8-tap branch. */
+        __m256i shuf = {0x0000000100000000, 0x0000000500000004,
+                        0x0000000300000002, 0x0000000700000006};
+        int len = dstW >> 3;
+        int res = dstW & 7;
+        while (len--) {
+            SCALE_4_8(3);
+            src0 = __lasx_xvperm_w(src0, shuf);
+            __lasx_xvst(src0, dst, 0);
+            filterPos += 8;
+            filter    += 32;
+            dst       += 8;
+        }
+        if (res & 4) {
+            SCALE_4_4(3);
+            __lasx_xvstelm_d(src0, dst, 0, 0);
+            __lasx_xvstelm_d(src0, dst, 8, 1);
+            filterPos += 4;
+            filter    += 16;
+            dst       += 4;
+        }
+        if (res & 2) {
+            /* SCALE_4_2 stores dst[0..1] and advances the pointers itself. */
+            SCALE_4_2(3);
+        }
+        if (res & 1) {
+            /* Last single output is computed with plain scalar code. */
+            int val = 0;
+            const uint8_t *srcPos = src + filterPos[0];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[0] = FFMIN(val >> 3, max);
+        }
+    } else if (filterSize > 8) {
+        int len = dstW >> 2;
+        int res = dstW & 3;
+        /* j < filterSize - 7 is the same as j + 8 <= filterSize, i.e. the
+         * vector loops only run while 8 whole taps remain. */
+        int filterlen = filterSize - 7;
+        __m256i zero = __lasx_xvldi(0);
+
+        /* Four outputs per pass. */
+        while (len--) {
+            /* Locals used by name inside SCALE_16. */
+            __m256i src0, src1, src2, src3;
+            __m256i filter0, filter1, filter2, filter3, out0, out1;
+            __m256i out = zero;
+            const uint8_t *srcPos1 = src + filterPos[0];
+            const uint8_t *srcPos2 = src + filterPos[1];
+            const uint8_t *srcPos3 = src + filterPos[2];
+            const uint8_t *srcPos4 = src + filterPos[3];
+            const int16_t *filterStart1 = filter;
+            const int16_t *filterStart2 = filterStart1 + filterSize;
+            const int16_t *filterStart3 = filterStart2 + filterSize;
+            const int16_t *filterStart4 = filterStart3 + filterSize;
+            int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                SCALE_16
+            }
+            /* The sums for outputs 1..4 sit in 32-bit elements 0, 4, 2, 6
+             * of the accumulator (note the 4/2 swap). */
+            val1 = __lasx_xvpickve2gr_w(out, 0);
+            val2 = __lasx_xvpickve2gr_w(out, 4);
+            val3 = __lasx_xvpickve2gr_w(out, 2);
+            val4 = __lasx_xvpickve2gr_w(out, 6);
+            /* Remaining (filterSize % 8) taps, scalar. */
+            for (; j < filterSize; j++) {
+                val1 += ((int)srcPos1[j]) * filterStart1[j];
+                val2 += ((int)srcPos2[j]) * filterStart2[j];
+                val3 += ((int)srcPos3[j]) * filterStart3[j];
+                val4 += ((int)srcPos4[j]) * filterStart4[j];
+            }
+            dst[0] = FFMIN(val1 >> 3, max);
+            dst[1] = FFMIN(val2 >> 3, max);
+            dst[2] = FFMIN(val3 >> 3, max);
+            dst[3] = FFMIN(val4 >> 3, max);
+            dst       += 4;
+            filterPos += 4;
+            filter     = filterStart4 + filterSize;
+        }
+        /* Up to 3 leftover outputs, one at a time. dst and filterPos are
+         * indexed with i here, filter is advanced per output. */
+        for (i = 0; i < res; i++) {
+            int j, val = 0;
+            const uint8_t *srcPos = src + filterPos[i];
+            __m256i src1, filter0, out0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                src1   = __lasx_xvldrepl_d((srcPos + j), 0);
+                filter0 = __lasx_xvld(filter + j, 0);
+                src1 = __lasx_xvilvl_b(zero, src1);
+                out0 = __lasx_xvdp2_w_h(filter0, src1);
+                out0 = __lasx_xvhaddw_d_w(out0, out0);
+                out0 = __lasx_xvhaddw_q_d(out0, out0);
+                val += __lasx_xvpickve2gr_w(out0, 0);
+            }
+            for (; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i] = FFMIN(val >> 3, max);
+            filter += filterSize;
+        }
+    } else {
+        /* Any other filter size (below 8 and not 4): scalar reference loop. */
+        for (i = 0; i < dstW; i++) {
+            int val = 0;
+            const uint8_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> 3, max);
+            filter += filterSize;
+        }
+    }
+}
+
+#undef SCALE_16
+
+/*
+ * SCALE_8: 8-tap filter step for 4 output samples from 16-bit input.
+ *
+ * Names taken from the expanding scope: src (const uint16_t *), filterPos,
+ * filter, dst, shift (vector holding the per-element shift count) and max.
+ * All vector and scalar temporaries are declared inside the macro's own
+ * block, so nothing else is clobbered.
+ *
+ * Loads the source samples at filterPos[0..3] and 2 x 32 bytes of
+ * coefficients, multiplies unsigned samples with signed coefficients
+ * (__lasx_xvdp2_w_hu_h), reduces horizontally, shifts right by shift and
+ * reads the four results from 32-bit elements 0, 4, 2 and 6. Each result
+ * is limited with FFMIN(..., max) and written to dst[0..3].
+ *
+ * Advances filterPos by 4, filter by 32 coefficients and dst by 4
+ * elements. dst keeps the element type of the expanding function
+ * (int16_t in ff_hscale_16_to_15_lasx, int32_t in ff_hscale_16_to_19_lasx).
+ */
+#define SCALE_8                                                              \
+{                                                                            \
+    int val1, val2, val3, val4;                                              \
+    __m256i src0, src1, src2, src3, filter0, filter1, out0, out1;            \
+    DUP4_ARG2(__lasx_xvld, src + filterPos[0], 0, src + filterPos[1], 0,     \
+              src + filterPos[2], 0, src + filterPos[3], 0, src0, src1, src2,\
+              src3);                                                         \
+    DUP2_ARG2(__lasx_xvld, filter, 0, filter, 32, filter0, filter1);         \
+    src0    = __lasx_xvpermi_q(src0, src1, 0x02);                            \
+    src2    = __lasx_xvpermi_q(src2, src3, 0x02);                            \
+    DUP2_ARG2(__lasx_xvdp2_w_hu_h, src0, filter0, src2, filter1, out0, out1);\
+    src0    = __lasx_xvhaddw_d_w(out0, out0);                                \
+    src1    = __lasx_xvhaddw_d_w(out1, out1);                                \
+    out0    = __lasx_xvpackev_d(src1, src0);                                 \
+    out1    = __lasx_xvpackod_d(src1, src0);                                 \
+    out0    = __lasx_xvadd_w(out0, out1);                                    \
+    out0    = __lasx_xvsra_w(out0, shift);                                   \
+    val1    = __lasx_xvpickve2gr_w(out0, 0);                                 \
+    val2    = __lasx_xvpickve2gr_w(out0, 4);                                 \
+    val3    = __lasx_xvpickve2gr_w(out0, 2);                                 \
+    val4    = __lasx_xvpickve2gr_w(out0, 6);                                 \
+    dst[0]  = FFMIN(val1, max);                                              \
+    dst[1]  = FFMIN(val2, max);                                              \
+    dst[2]  = FFMIN(val3, max);                                              \
+    dst[3]  = FFMIN(val4, max);                                              \
+    filterPos += 4;                                                          \
+    filter += 32;                                                            \
+    dst += 4;                                                                \
+}
+
+/*
+ * SCALE_16 (16-bit source variant): accumulate 8 taps for 4 output samples.
+ *
+ * Used inside a "for (j = 0; ...; j += 8)" loop by the filterSize > 8 paths
+ * of ff_hscale_16_to_15_lasx and ff_hscale_16_to_19_lasx.
+ *
+ * Names taken from the expanding scope: j, srcPos1..srcPos4,
+ * filterStart1..filterStart4, src0..src3, filter0..filter3, out0, out1
+ * and out.
+ *
+ * Both the uint16_t samples and the int16_t coefficients are loaded at
+ * offset dex = j << 1 (two bytes per element, so dex is presumably the byte
+ * offset of element j). Unsigned samples are multiplied with signed
+ * coefficients (__lasx_xvdp2_w_hu_h), reduced horizontally and added to
+ * the running accumulator out.
+ *
+ * The callers read the four sums from 32-bit elements 0, 4, 2 and 6 of out
+ * (outputs 1, 2, 3 and 4 respectively).
+ */
+#define SCALE_16                                                             \
+{                                                                            \
+    int dex = j << 1;                                                        \
+    DUP4_ARG2(__lasx_xvldx, srcPos1, dex, srcPos2, dex, srcPos3, dex,        \
+              srcPos4, dex, src0, src1, src2, src3);                         \
+    DUP4_ARG2(__lasx_xvldx, filterStart1, dex, filterStart2, dex,            \
+              filterStart3, dex, filterStart4, dex, filter0,                 \
+              filter1, filter2, filter3);                                    \
+    src0     = __lasx_xvpermi_q(src0, src1, 0x02);                           \
+    src1     = __lasx_xvpermi_q(src2, src3, 0x02);                           \
+    filter0  = __lasx_xvpermi_q(filter0, filter1, 0x02);                     \
+    filter1  = __lasx_xvpermi_q(filter2, filter3, 0x02);                     \
+    DUP2_ARG2(__lasx_xvdp2_w_hu_h, src0, filter0, src1, filter1, out0, out1);\
+    src0     = __lasx_xvhaddw_d_w(out0, out0);                               \
+    src1     = __lasx_xvhaddw_d_w(out1, out1);                               \
+    out0     = __lasx_xvpackev_d(src1, src0);                                \
+    out1     = __lasx_xvpackod_d(src1, src0);                                \
+    out0     = __lasx_xvadd_w(out0, out1);                                   \
+    out      = __lasx_xvadd_w(out, out0);                                    \
+}
+
+/**
+ * Horizontally scale 16-bit samples into 15-bit int16_t intermediates (LASX).
+ *
+ * The scalar fallback at the end of the function defines the result:
+ *     dst[i] = FFMIN((sum_j src[filterPos[i] + j] * filter[i * filterSize + j])
+ *                    >> sh, (1 << 15) - 1)
+ * The filterSize == 8, == 4 and > 8 branches are vectorised versions that
+ * are intended to produce the same values.
+ *
+ * The shift sh starts as (component depth - 1) of c->srcFormat and is then
+ * adjusted:
+ *   - sh < 15: 13 for RGB-like or PAL8 source formats, otherwise unchanged;
+ *   - sh >= 15 and the descriptor has AV_PIX_FMT_FLAG_FLOAT set: 15.
+ *
+ * @param c          scaler context; only c->srcFormat is read
+ * @param dst        output buffer, dstW int16_t values are written
+ * @param dstW       number of output samples
+ * @param _src       input samples; declared uint8_t * but read as uint16_t
+ * @param filter     coefficients, filterSize consecutive int16_t per output
+ * @param filterPos  for each output, index (in uint16_t units) of its first
+ *                   input sample
+ * @param filterSize number of taps per output sample
+ */
+void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+                             const uint8_t *_src, const int16_t *filter,
+                             const int32_t *filterPos, int filterSize)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int i;
+    const uint16_t *src = (const uint16_t *) _src;
+    int sh              = desc->comp[0].depth - 1;
+    int max = (1 << 15) - 1;
+    /* Vector paths produce 4 outputs per pass; res is the 0..3 remainder. */
+    int len = dstW >> 2;
+    int res = dstW & 3;
+    __m256i shift;
+    __m256i zero = __lasx_xvldi(0);
+
+    if (sh < 15) {
+        sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
+                      (desc->comp[0].depth - 1);
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
+        /* Bit test on the descriptor flags, as in ff_hscale_16_to_19_lasx.
+         * A logical && here would be true for every descriptor with any
+         * flag set. For depth 16 both forms give sh == 15; they differ only
+         * for deeper, non-float descriptors. */
+        sh = 15;
+    }
+    /* Same shift count replicated into every 32-bit element for the
+     * vector shifts in SCALE_8 and the 4-tap loop. */
+    shift = __lasx_xvreplgr2vr_w(sh);
+
+    if (filterSize == 8) {
+        for (i = 0; i < len; i++) {
+            /* Stores 4 results and advances dst/filter/filterPos itself. */
+            SCALE_8
+        }
+        /* Leftover outputs: filterPos and dst are indexed with i, filter is
+         * advanced per output. */
+        for (i = 0; i < res; i++) {
+            int val = 0;
+            __m256i src0, filter0, out0;
+
+            src0    = __lasx_xvld(src + filterPos[i], 0);
+            filter0 = __lasx_xvld(filter, 0);
+            out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+            out0    = __lasx_xvhaddw_d_w(out0, out0);
+            out0    = __lasx_xvhaddw_q_d(out0, out0);
+            val     = __lasx_xvpickve2gr_w(out0, 0);
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += 8;
+        }
+    } else if (filterSize == 4) {
+        for (i = 0; i < len; i++) {
+            __m256i src1, src2, src3, src4, src0, filter0, out0;
+
+            /* Gather 4 x 4 samples into one vector, one 64-bit element
+             * per output, matching the 16 coefficients in filter0. */
+            src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
+            src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
+            src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
+            src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
+            filter0 = __lasx_xvld(filter, 0);
+            src1 = __lasx_xvextrins_d(src1, src2, 0x10);
+            src3 = __lasx_xvextrins_d(src3, src4, 0x10);
+            src0 = __lasx_xvpermi_q(src1, src3, 0x02);
+            out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+            out0 = __lasx_xvhaddw_d_w(out0, out0);
+            out0 = __lasx_xvsra_w(out0, shift);
+            /* Results are read from the even 32-bit elements. */
+            dst[0] = FFMIN((__lasx_xvpickve2gr_w(out0, 0)), max);
+            dst[1] = FFMIN((__lasx_xvpickve2gr_w(out0, 2)), max);
+            dst[2] = FFMIN((__lasx_xvpickve2gr_w(out0, 4)), max);
+            dst[3] = FFMIN((__lasx_xvpickve2gr_w(out0, 6)), max);
+            dst       += 4;
+            filterPos += 4;
+            filter    += 16;
+        }
+        for (i = 0; i < res; i++) {
+            int val = 0;
+            const uint16_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += 4;
+        }
+    } else if (filterSize > 8) {
+        /* j < filterSize - 7 is the same as j + 8 <= filterSize, i.e. the
+         * vector loops only run while 8 whole taps remain. */
+        int filterlen = filterSize - 7;
+
+        for (i = 0; i < len; i++) {
+            /* Locals used by name inside SCALE_16. */
+            __m256i src0, src1, src2, src3;
+            __m256i filter0, filter1, filter2, filter3, out0, out1;
+            __m256i out = zero;
+            const uint16_t *srcPos1 = src + filterPos[0];
+            const uint16_t *srcPos2 = src + filterPos[1];
+            const uint16_t *srcPos3 = src + filterPos[2];
+            const uint16_t *srcPos4 = src + filterPos[3];
+            const int16_t *filterStart1 = filter;
+            const int16_t *filterStart2 = filterStart1 + filterSize;
+            const int16_t *filterStart3 = filterStart2 + filterSize;
+            const int16_t *filterStart4 = filterStart3 + filterSize;
+            int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                SCALE_16
+            }
+            /* The sums for outputs 1..4 sit in 32-bit elements 0, 4, 2, 6
+             * of the accumulator (note the 4/2 swap). */
+            val1 = __lasx_xvpickve2gr_w(out, 0);
+            val2 = __lasx_xvpickve2gr_w(out, 4);
+            val3 = __lasx_xvpickve2gr_w(out, 2);
+            val4 = __lasx_xvpickve2gr_w(out, 6);
+            /* Remaining (filterSize % 8) taps, scalar. */
+            for (; j < filterSize; j++) {
+                val1 += ((int)srcPos1[j]) * filterStart1[j];
+                val2 += ((int)srcPos2[j]) * filterStart2[j];
+                val3 += ((int)srcPos3[j]) * filterStart3[j];
+                val4 += ((int)srcPos4[j]) * filterStart4[j];
+            }
+            dst[0] = FFMIN(val1 >> sh, max);
+            dst[1] = FFMIN(val2 >> sh, max);
+            dst[2] = FFMIN(val3 >> sh, max);
+            dst[3] = FFMIN(val4 >> sh, max);
+            dst       += 4;
+            filterPos += 4;
+            filter     = filterStart4 + filterSize;
+        }
+        for (i = 0; i < res; i++) {
+            int j, val = 0;
+            const uint16_t *srcPos      = src + filterPos[i];
+            __m256i src0, filter0, out0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                int dex = j << 1;
+                DUP2_ARG2(__lasx_xvldx, srcPos, dex, filter, dex, src0, filter0);
+                out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+                out0    = __lasx_xvhaddw_d_w(out0, out0);
+                out0    = __lasx_xvhaddw_q_d(out0, out0);
+                val    += __lasx_xvpickve2gr_w(out0, 0);
+            }
+            for (; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += filterSize;
+        }
+    } else {
+        /* Any other filter size (below 8 and not 4): scalar reference loop. */
+        for (i = 0; i < dstW; i++) {
+            int val = 0;
+            const uint16_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += filterSize;
+        }
+    }
+}
+
+/**
+ * Horizontally scale 16-bit samples into 19-bit int32_t intermediates (LASX).
+ *
+ * The scalar fallback at the end of the function defines the result:
+ *     dst[i] = FFMIN((sum_j src[filterPos[i] + j] * filter[i * filterSize + j])
+ *                    >> sh, (1 << 19) - 1)
+ * The filterSize == 8, == 4 and > 8 branches are vectorised versions that
+ * are intended to produce the same values.
+ *
+ * The shift sh starts as (component depth - 5) of c->srcFormat and is then
+ * overridden:
+ *   - 9 for RGB-like or PAL8 source formats with depth below 16;
+ *   - otherwise 11 when the descriptor has AV_PIX_FMT_FLAG_FLOAT set.
+ *
+ * @param c          scaler context; only c->srcFormat is read
+ * @param _dst       output buffer; declared int16_t * but written through
+ *                   an int32_t * cast, so it must hold dstW int32_t values
+ * @param dstW       number of output samples
+ * @param _src       input samples; declared uint8_t * but read as uint16_t
+ * @param filter     coefficients, filterSize consecutive int16_t per output
+ * @param filterPos  for each output, index (in uint16_t units) of its first
+ *                   input sample
+ * @param filterSize number of taps per output sample
+ */
+void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+                             const uint8_t *_src, const int16_t *filter,
+                             const int32_t *filterPos, int filterSize)
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
+    int i;
+    int32_t *dst        = (int32_t *) _dst;
+    const uint16_t *src = (const uint16_t *) _src;
+    int sh              = desc->comp[0].depth - 5;
+    int max = (1 << 19) - 1;
+    /* Vector paths produce 4 outputs per pass; res is the 0..3 remainder. */
+    int len = dstW >> 2;
+    int res = dstW & 3;
+    __m256i shift;
+    __m256i zero = __lasx_xvldi(0);
+
+    if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8)
+         && desc->comp[0].depth<16) {
+        sh = 9;
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) {
+        sh = 11;
+    }
+    /* Same shift count replicated into every 32-bit element for the
+     * vector shifts in SCALE_8 and the 4-tap loop. */
+    shift = __lasx_xvreplgr2vr_w(sh);
+
+    if (filterSize == 8) {
+        for (i = 0; i < len; i++) {
+            /* Stores 4 results and advances dst/filter/filterPos itself. */
+            SCALE_8
+        }
+        /* Leftover outputs: filterPos and dst are indexed with i, filter is
+         * advanced per output. */
+        for (i = 0; i < res; i++) {
+            int val = 0;
+            __m256i src0, filter0, out0;
+
+            DUP2_ARG2(__lasx_xvld, src + filterPos[i], 0, filter, 0,
+                      src0, filter0);
+            out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+            out0 = __lasx_xvhaddw_d_w(out0, out0);
+            out0 = __lasx_xvhaddw_q_d(out0, out0);
+            val  = __lasx_xvpickve2gr_w(out0, 0);
+            dst[i] = FFMIN(val >> sh, max);
+            filter += 8;
+        }
+    } else if (filterSize == 4) {
+        for (i = 0; i < len; i++) {
+            __m256i src1, src2, src3, src4, src0, filter0, out0;
+
+            /* Gather 4 x 4 samples into one vector, one 64-bit element
+             * per output, matching the 16 coefficients in filter0. */
+            src1 = __lasx_xvldrepl_d(src + filterPos[0], 0);
+            src2 = __lasx_xvldrepl_d(src + filterPos[1], 0);
+            src3 = __lasx_xvldrepl_d(src + filterPos[2], 0);
+            src4 = __lasx_xvldrepl_d(src + filterPos[3], 0);
+            filter0 = __lasx_xvld(filter, 0);
+            src1 = __lasx_xvextrins_d(src1, src2, 0x10);
+            src3 = __lasx_xvextrins_d(src3, src4, 0x10);
+            src0 = __lasx_xvpermi_q(src1, src3, 0x02);
+            out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+            out0 = __lasx_xvhaddw_d_w(out0, out0);
+            out0 = __lasx_xvsra_w(out0, shift);
+            /* Results are read from the even 32-bit elements. */
+            dst[0] = FFMIN((__lasx_xvpickve2gr_w(out0, 0)), max);
+            dst[1] = FFMIN((__lasx_xvpickve2gr_w(out0, 2)), max);
+            dst[2] = FFMIN((__lasx_xvpickve2gr_w(out0, 4)), max);
+            dst[3] = FFMIN((__lasx_xvpickve2gr_w(out0, 6)), max);
+            dst       += 4;
+            filterPos += 4;
+            filter    += 16;
+        }
+        for (i = 0; i < res; i++) {
+            int val = 0;
+            const uint16_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += 4;
+        }
+    } else if (filterSize > 8) {
+        /* j < filterSize - 7 is the same as j + 8 <= filterSize, i.e. the
+         * vector loops only run while 8 whole taps remain. */
+        int filterlen = filterSize - 7;
+
+        for (i = 0; i < len; i ++) {
+            /* Locals used by name inside SCALE_16. */
+            __m256i src0, src1, src2, src3;
+            __m256i filter0, filter1, filter2, filter3, out0, out1;
+            __m256i out = zero;
+            const uint16_t *srcPos1 = src + filterPos[0];
+            const uint16_t *srcPos2 = src + filterPos[1];
+            const uint16_t *srcPos3 = src + filterPos[2];
+            const uint16_t *srcPos4 = src + filterPos[3];
+            const int16_t *filterStart1 = filter;
+            const int16_t *filterStart2 = filterStart1 + filterSize;
+            const int16_t *filterStart3 = filterStart2 + filterSize;
+            const int16_t *filterStart4 = filterStart3 + filterSize;
+            int j, val1 = 0, val2 = 0, val3 = 0, val4 = 0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                SCALE_16
+            }
+            /* The sums for outputs 1..4 sit in 32-bit elements 0, 4, 2, 6
+             * of the accumulator (note the 4/2 swap). */
+            val1 = __lasx_xvpickve2gr_w(out, 0);
+            val2 = __lasx_xvpickve2gr_w(out, 4);
+            val3 = __lasx_xvpickve2gr_w(out, 2);
+            val4 = __lasx_xvpickve2gr_w(out, 6);
+            /* Remaining (filterSize % 8) taps, scalar. */
+            for (; j < filterSize; j++) {
+                val1 += ((int)srcPos1[j]) * filterStart1[j];
+                val2 += ((int)srcPos2[j]) * filterStart2[j];
+                val3 += ((int)srcPos3[j]) * filterStart3[j];
+                val4 += ((int)srcPos4[j]) * filterStart4[j];
+            }
+            dst[0] = FFMIN(val1 >> sh, max);
+            dst[1] = FFMIN(val2 >> sh, max);
+            dst[2] = FFMIN(val3 >> sh, max);
+            dst[3] = FFMIN(val4 >> sh, max);
+            dst       += 4;
+            filterPos += 4;
+            filter     = filterStart4 + filterSize;
+        }
+        for (i = 0; i < res; i++) {
+            int j, val = 0;
+            const uint16_t *srcPos      = src + filterPos[i];
+            __m256i src0, filter0, out0;
+
+            for (j = 0; j < filterlen; j += 8) {
+                int dex = j << 1;
+                DUP2_ARG2(__lasx_xvldx, srcPos, dex, filter, dex,
+                          src0, filter0);
+                out0 = __lasx_xvdp2_w_hu_h(src0, filter0);
+                out0    = __lasx_xvhaddw_d_w(out0, out0);
+                out0    = __lasx_xvhaddw_q_d(out0, out0);
+                val    += __lasx_xvpickve2gr_w(out0, 0);
+            }
+            for (; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += filterSize;
+        }
+    } else {
+        /* Any other filter size (below 8 and not 4): scalar reference loop. */
+        for (i = 0; i < dstW; i++) {
+            int val = 0;
+            const uint16_t *srcPos = src + filterPos[i];
+
+            for (int j = 0; j < filterSize; j++) {
+                val += ((int)srcPos[j]) * filter[j];
+            }
+            dst[i]  = FFMIN(val >> sh, max);
+            filter += filterSize;
+        }
+    }
+}
+
+#undef SCALE_8
+#undef SCALE_16
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
new file mode 100644
index 0000000000..67e24d544c
--- /dev/null
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen(chenhao@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H
+#define SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize);
+
+void ff_hscale_8_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_19_lasx(SwsContext *c, int16_t *_dst, int dstW,
+                             const uint8_t *_src, const int16_t *filter,
+                             const int32_t *filterPos, int filterSize);
+
+void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
+                             const uint8_t *_src, const int16_t *filter,
+                             const int32_t *filterPos, int filterSize);
+
+void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
+                           int width, int32_t *rgb2yuv);
+
+void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
+                          int32_t *rgb2yuv);
+
+#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 7b40f49da4..367d045a02 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -598,6 +598,8 @@ void ff_sws_init_scale(SwsContext *c)
     ff_sws_init_swscale_aarch64(c);
 #elif ARCH_ARM
     ff_sws_init_swscale_arm(c);
+#elif ARCH_LOONGARCH64
+    ff_sws_init_swscale_loongarch(c);
 #endif
 }
 
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 6c14ce8536..abeebbb002 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ av_cold void ff_sws_init_range_convert(SwsContext *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
+SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c);
 
 static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
 {
@@ -983,6 +984,7 @@ void ff_sws_init_swscale_vsx(SwsContext *c);
 void ff_sws_init_swscale_x86(SwsContext *c);
 void ff_sws_init_swscale_aarch64(SwsContext *c);
 void ff_sws_init_swscale_arm(SwsContext *c);
+void ff_sws_init_swscale_loongarch(SwsContext *c);
 
 void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
                        const uint8_t *src, int srcW, int xInc);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index a621a35862..f762fba1df 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -53,6 +53,7 @@
 #include "libavutil/ppc/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
+#include "libavutil/loongarch/cpu.h"
 
 #include "rgb2rgb.h"
 #include "swscale.h"
@@ -654,6 +655,15 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
             filterAlign = 1;
     }
 
+    if (have_lasx(cpu_flags)) {
+        int reNum = minFilterSize & (0x07);
+
+        if (minFilterSize < 5)
+            filterAlign = 4;
+        if (reNum < 3)
+            filterAlign = 1;
+    }
+
     av_assert0(minFilterSize > 0);
     filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
     av_assert0(filterSize > 0);
@@ -1839,7 +1849,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
         {
             const int filterAlign = X86_MMX(cpu_flags)     ? 4 :
                                     PPC_ALTIVEC(cpu_flags) ? 8 :
-                                    have_neon(cpu_flags)   ? 4 : 1;
+                                    have_neon(cpu_flags)   ? 4 :
+                                    have_lasx(cpu_flags)   ? 8 : 1;
 
             if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
                            &c->hLumFilterSize, c->lumXInc,
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-09-21 10:50 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-29 11:26 [FFmpeg-devel] Add loongarch SIMD optimization in swscale lib Hao Chen
2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx Hao Chen
2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 2/3] swscale/la: Add yuv2rgb_lasx.c and rgb2rgb_lasx.c files Hao Chen
2022-08-29 11:26 ` [FFmpeg-devel] [PATCH v1 3/3] swscale/la: Add output_lasx.c file Hao Chen
2022-08-29 12:30   ` Andreas Rheinhardt
2022-09-06  8:12     ` Shiyou Yin
2022-09-09  1:22       ` Shiyou Yin
2022-09-09 13:11         ` Andreas Rheinhardt
2022-09-11  2:06           ` Shiyou Yin
2022-09-21  3:03             ` Shiyou Yin
2022-09-21 10:50               ` Andreas Rheinhardt
2022-08-29 11:51 [FFmpeg-devel] Add LoongArch SIMD optimization in swscale lib Hao Chen
2022-08-29 11:51 ` [FFmpeg-devel] [PATCH v1 1/3] swscale/la: Optimize hscale functions with lasx Hao Chen
2022-08-31  8:42   ` Shiyou Yin

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git