Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange
@ 2024-06-11 12:28 Ramiro Polla
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-11 12:28 UTC (permalink / raw)
  To: ffmpeg-devel

---
 tests/checkasm/Makefile           |   2 +-
 tests/checkasm/checkasm.c         |   1 +
 tests/checkasm/checkasm.h         |   1 +
 tests/checkasm/sw_range_convert.c | 134 ++++++++++++++++++++++++++++++
 4 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/sw_range_convert.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 6eb94d10d5..f20732b37a 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -63,7 +63,7 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
 # swscale tests
-SWSCALEOBJS                             += sw_gbrp.o sw_rgb.o sw_scale.o
+SWSCALEOBJS                             += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o
 
 CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 2329e2e1bc..56232ab1e0 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -251,6 +251,7 @@ static const struct {
 #endif
 #if CONFIG_SWSCALE
     { "sw_gbrp", checkasm_check_sw_gbrp },
+    { "sw_range_convert", checkasm_check_sw_range_convert },
     { "sw_rgb", checkasm_check_sw_rgb },
     { "sw_scale", checkasm_check_sw_scale },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 211d7f52e6..e544007b67 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -119,6 +119,7 @@ void checkasm_check_rv40dsp(void);
 void checkasm_check_svq1enc(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_gbrp(void);
+void checkasm_check_sw_range_convert(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_sw_scale(void);
 void checkasm_check_takdsp(void);
diff --git a/tests/checkasm/sw_range_convert.c b/tests/checkasm/sw_range_convert.c
new file mode 100644
index 0000000000..08029103d1
--- /dev/null
+++ b/tests/checkasm/sw_range_convert.c
@@ -0,0 +1,134 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#include "checkasm.h"
+
+static void check_lumConvertRange(int from)
+{
+    const char *func_str = from ? "lumRangeFromJpeg" : "lumRangeToJpeg";
+#define LARGEST_INPUT_SIZE 512
+#define INPUT_SIZES 6
+    static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
+    struct SwsContext *ctx;
+
+    LOCAL_ALIGNED_32(int16_t, dst0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, dst1, [LARGEST_INPUT_SIZE]);
+
+    declare_func(void, int16_t *dst, int width);
+
+    ctx = sws_alloc_context();
+    if (sws_init_context(ctx, NULL, NULL) < 0)
+        fail();
+
+    ctx->srcFormat = from ? AV_PIX_FMT_YUVJ444P : AV_PIX_FMT_YUV444P;
+    ctx->dstFormat = from ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+    ctx->srcRange = from;
+    ctx->dstRange = !from;
+
+    for (int dstWi = 0; dstWi < INPUT_SIZES; dstWi++) {
+        int width = input_sizes[dstWi];
+        for (int i = 0; i < width; i++) {
+            uint8_t r = rnd();
+            dst0[i] = (int16_t) r << 7;
+            dst1[i] = (int16_t) r << 7;
+        }
+        ff_sws_init_scale(ctx);
+        if (check_func(ctx->lumConvertRange, "%s_%d", func_str, width)) {
+            call_ref(dst0, width);
+            call_new(dst1, width);
+            if (memcmp(dst0, dst1, width * sizeof(int16_t)))
+                fail();
+            bench_new(dst1, width);
+        }
+    }
+
+    sws_freeContext(ctx);
+}
+#undef LARGEST_INPUT_SIZE
+#undef INPUT_SIZES
+
+static void check_chrConvertRange(int from)
+{
+    const char *func_str = from ? "chrRangeFromJpeg" : "chrRangeToJpeg";
+#define LARGEST_INPUT_SIZE 512
+#define INPUT_SIZES 6
+    static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
+    struct SwsContext *ctx;
+
+    LOCAL_ALIGNED_32(int16_t, dstU0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, dstV0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, dstU1, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, dstV1, [LARGEST_INPUT_SIZE]);
+
+    declare_func(void, int16_t *dstU, int16_t *dstV, int width);
+
+    ctx = sws_alloc_context();
+    if (sws_init_context(ctx, NULL, NULL) < 0)
+        fail();
+
+    ctx->srcFormat = from ? AV_PIX_FMT_YUVJ444P : AV_PIX_FMT_YUV444P;
+    ctx->dstFormat = from ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+    ctx->srcRange = from;
+    ctx->dstRange = !from;
+
+    for (int dstWi = 0; dstWi < INPUT_SIZES; dstWi++) {
+        int width = input_sizes[dstWi];
+        for (int i = 0; i < width; i++) {
+            uint8_t r = rnd();
+            dstU0[i] = (int16_t) r << 7;
+            dstV0[i] = (int16_t) r << 7;
+            dstU1[i] = (int16_t) r << 7;
+            dstV1[i] = (int16_t) r << 7;
+        }
+        ff_sws_init_scale(ctx);
+        if (check_func(ctx->chrConvertRange, "%s_%d", func_str, width)) {
+            call_ref(dstU0, dstV0, width);
+            call_new(dstU1, dstV1, width);
+            if (memcmp(dstU0, dstU1, width * sizeof(int16_t)) ||
+                memcmp(dstV0, dstV1, width * sizeof(int16_t)))
+                fail();
+            bench_new(dstU1, dstV1, width);
+        }
+    }
+
+    sws_freeContext(ctx);
+}
+#undef LARGEST_INPUT_SIZE
+#undef INPUT_SIZES
+
+void checkasm_check_sw_range_convert(void)
+{
+    check_lumConvertRange(1);
+    report("lumRangeFromJpeg");
+    check_chrConvertRange(1);
+    report("chrRangeFromJpeg");
+    check_lumConvertRange(0);
+    report("lumRangeToJpeg");
+    check_chrConvertRange(0);
+    report("chrRangeToJpeg");
+}
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
  2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
@ 2024-06-11 12:28 ` Ramiro Polla
  2024-06-11 12:32   ` James Almer
  2024-06-11 18:26   ` Michael Niedermayer
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 3/4] swscale/x86: add avx2 " Ramiro Polla
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-11 12:28 UTC (permalink / raw)
  To: ffmpeg-devel

chrRangeFromJpeg_8_c: 28.7
chrRangeFromJpeg_8_sse4: 16.2
chrRangeFromJpeg_24_c: 152.7
chrRangeFromJpeg_24_sse4: 29.7
chrRangeFromJpeg_128_c: 366.5
chrRangeFromJpeg_128_sse4: 233.0
chrRangeFromJpeg_144_c: 408.0
chrRangeFromJpeg_144_sse4: 182.5
chrRangeFromJpeg_256_c: 698.7
chrRangeFromJpeg_256_sse4: 325.5
chrRangeFromJpeg_512_c: 1348.7
chrRangeFromJpeg_512_sse4: 660.2
chrRangeToJpeg_8_c: 37.7
chrRangeToJpeg_8_sse4: 16.2
chrRangeToJpeg_24_c: 115.7
chrRangeToJpeg_24_sse4: 36.2
chrRangeToJpeg_128_c: 631.2
chrRangeToJpeg_128_sse4: 163.7
chrRangeToJpeg_144_c: 710.7
chrRangeToJpeg_144_sse4: 183.0
chrRangeToJpeg_256_c: 1253.0
chrRangeToJpeg_256_sse4: 343.5
chrRangeToJpeg_512_c: 2491.2
chrRangeToJpeg_512_sse4: 654.2
lumRangeFromJpeg_8_c: 11.7
lumRangeFromJpeg_8_sse4: 10.5
lumRangeFromJpeg_24_c: 38.5
lumRangeFromJpeg_24_sse4: 19.0
lumRangeFromJpeg_128_c: 237.5
lumRangeFromJpeg_128_sse4: 79.2
lumRangeFromJpeg_144_c: 255.7
lumRangeFromJpeg_144_sse4: 90.5
lumRangeFromJpeg_256_c: 441.5
lumRangeFromJpeg_256_sse4: 161.7
lumRangeFromJpeg_512_c: 879.0
lumRangeFromJpeg_512_sse4: 333.2
lumRangeToJpeg_8_c: 20.0
lumRangeToJpeg_8_sse4: 11.7
lumRangeToJpeg_24_c: 61.5
lumRangeToJpeg_24_sse4: 17.7
lumRangeToJpeg_128_c: 357.5
lumRangeToJpeg_128_sse4: 80.0
lumRangeToJpeg_144_c: 371.5
lumRangeToJpeg_144_sse4: 93.2
lumRangeToJpeg_256_c: 651.5
lumRangeToJpeg_256_sse4: 164.5
lumRangeToJpeg_512_c: 1279.0
lumRangeToJpeg_512_sse4: 333.7
---
 libswscale/swscale_internal.h    |   1 +
 libswscale/utils.c               |   2 +
 libswscale/x86/Makefile          |   1 +
 libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
 libswscale/x86/swscale.c         |  36 +++++++++
 5 files changed, 170 insertions(+)
 create mode 100644 libswscale/x86/range_convert.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 5007dd422f..d5e7b5e71c 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
 av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 476a24fea5..8dfa57b5ff 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
         ff_sws_init_range_convert(c);
 #if ARCH_LOONGARCH64
         ff_sws_init_range_convert_loongarch(c);
+#elif ARCH_X86
+        ff_sws_init_range_convert_x86(c);
 #endif
     }
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 68391494be..f00154941d 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -12,6 +12,7 @@ X86ASM-OBJS                     += x86/input.o                          \
                                    x86/output.o                         \
                                    x86/scale.o                          \
                                    x86/scale_avx2.o                          \
+                                   x86/range_convert.o                  \
                                    x86/rgb_2_rgb.o                      \
                                    x86/yuv_2_rgb.o                      \
                                    x86/yuv2yuvX.o                       \
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
new file mode 100644
index 0000000000..13983a386b
--- /dev/null
+++ b/libswscale/x86/range_convert.asm
@@ -0,0 +1,130 @@
+;******************************************************************************
+;* Copyright (c) 2024 Ramiro Polla
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+chr_to_mult:        times 4 dd 4663
+chr_to_offset:      times 4 dd -9289992
+%define chr_to_shift 12
+
+chr_from_mult:      times 4 dd 1799
+chr_from_offset:    times 4 dd 4081085
+%define chr_from_shift 11
+
+lum_to_mult:        times 4 dd 19077
+lum_to_offset:      times 4 dd -39057361
+%define lum_to_shift 14
+
+lum_from_mult:      times 4 dd 14071
+lum_from_offset:    times 4 dd 33561947
+%define lum_from_shift 14
+
+SECTION .text
+
+; NOTE: there is no need to clamp the input when converting to jpeg range
+;       (like we do in the C code) because packssdw will saturate the output.
+
+;-----------------------------------------------------------------------------
+; lumConvertRange
+;
+; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
+; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro LUMCONVERTRANGE 4
+cglobal %1, 2, 3, 3, dst, width, x
+    movsxdifnidn widthq, widthd
+    xor              xq, xq
+    mova             m4, [%2]
+    mova             m5, [%3]
+.loop:
+    pmovsxwd         m0, [dstq+xq*2]
+    pmovsxwd         m1, [dstq+xq*2+mmsize/2]
+    pmulld           m0, m4
+    pmulld           m1, m4
+    paddd            m0, m5
+    paddd            m1, m5
+    psrad            m0, %4
+    psrad            m1, %4
+    packssdw         m0, m0
+    packssdw         m1, m1
+    movq    [dstq+xq*2], m0
+    movq    [dstq+xq*2+mmsize/2], m1
+    add              xq, mmsize / 2
+    cmp              xd, widthd
+    jl .loop
+    RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; chrConvertRange
+;
+; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro CHRCONVERTRANGE 4
+cglobal %1, 3, 4, 4, dstU, dstV, width, x
+    movsxdifnidn widthq, widthd
+    xor              xq, xq
+    mova             m4, [%2]
+    mova             m5, [%3]
+.loop:
+    pmovsxwd         m0, [dstUq+xq*2]
+    pmovsxwd         m1, [dstUq+xq*2+mmsize/2]
+    pmovsxwd         m2, [dstVq+xq*2]
+    pmovsxwd         m3, [dstVq+xq*2+mmsize/2]
+    pmulld           m0, m4
+    pmulld           m1, m4
+    pmulld           m2, m4
+    pmulld           m3, m4
+    paddd            m0, m5
+    paddd            m1, m5
+    paddd            m2, m5
+    paddd            m3, m5
+    psrad            m0, %4
+    psrad            m1, %4
+    psrad            m2, %4
+    psrad            m3, %4
+    packssdw         m0, m0
+    packssdw         m1, m1
+    packssdw         m2, m2
+    packssdw         m3, m3
+    movq   [dstUq+xq*2], m0
+    movq   [dstUq+xq*2+mmsize/2], m1
+    movq   [dstVq+xq*2], m2
+    movq   [dstVq+xq*2+mmsize/2], m3
+    add              xq, mmsize / 2
+    cmp              xd, widthd
+    jl .loop
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+LUMCONVERTRANGE lumRangeToJpeg,   lum_to_mult,   lum_to_offset,   lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg,   chr_to_mult,   chr_to_offset,   chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 5a9da23265..8f477b7b72 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -453,6 +453,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
 INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
 #endif
 
+#if ARCH_X86_64
+#define RANGE_CONVERT_FUNCS(opt) do {                                       \
+    if (c->dstBpc <= 14) {                                                  \
+        if (c->srcRange) {                                                  \
+            c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt;                \
+            c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt;                \
+        } else {                                                            \
+            c->lumConvertRange = ff_lumRangeToJpeg_ ##opt;                  \
+            c->chrConvertRange = ff_chrRangeToJpeg_ ##opt;                  \
+        }                                                                   \
+    }                                                                       \
+} while (0)
+
+#define RANGE_CONVERT_FUNCS_DECL(opt)                                       \
+void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width);                   \
+void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);   \
+void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width);                     \
+void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);     \
+
+RANGE_CONVERT_FUNCS_DECL(sse4);
+
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
+{
+    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+        int cpu_flags = av_get_cpu_flags();
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            RANGE_CONVERT_FUNCS(sse4);
+        }
+    }
+}
+#endif
+
 av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -820,4 +852,8 @@ switch(c->dstBpc){ \
     }
 
 #endif
+
+#if ARCH_X86_64
+    ff_sws_init_range_convert_x86(c);
+#endif
 }
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [FFmpeg-devel] [PATCH v2 3/4] swscale/x86: add avx2 {lum, chr}ConvertRange
  2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
@ 2024-06-11 12:28 ` Ramiro Polla
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon " Ramiro Polla
  2024-06-14 15:45 ` [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for " Ramiro Polla
  3 siblings, 0 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-11 12:28 UTC (permalink / raw)
  To: ffmpeg-devel

chrRangeFromJpeg_8_c: 24.1
chrRangeFromJpeg_8_sse4: 16.1
chrRangeFromJpeg_8_avx2: 19.9
chrRangeFromJpeg_24_c: 72.6
chrRangeFromJpeg_24_sse4: 34.6
chrRangeFromJpeg_24_avx2: 30.9
chrRangeFromJpeg_128_c: 341.1
chrRangeFromJpeg_128_sse4: 160.9
chrRangeFromJpeg_128_avx2: 94.1
chrRangeFromJpeg_144_c: 381.9
chrRangeFromJpeg_144_sse4: 183.6
chrRangeFromJpeg_144_avx2: 108.9
chrRangeFromJpeg_256_c: 646.1
chrRangeFromJpeg_256_sse4: 320.4
chrRangeFromJpeg_256_avx2: 190.6
chrRangeFromJpeg_512_c: 1255.9
chrRangeFromJpeg_512_sse4: 654.1
chrRangeFromJpeg_512_avx2: 392.4
chrRangeToJpeg_8_c: 36.9
chrRangeToJpeg_8_sse4: 13.9
chrRangeToJpeg_8_avx2: 20.6
chrRangeToJpeg_24_c: 113.4
chrRangeToJpeg_24_sse4: 29.6
chrRangeToJpeg_24_avx2: 28.9
chrRangeToJpeg_128_c: 632.1
chrRangeToJpeg_128_sse4: 162.4
chrRangeToJpeg_128_avx2: 94.6
chrRangeToJpeg_144_c: 709.9
chrRangeToJpeg_144_sse4: 183.9
chrRangeToJpeg_144_avx2: 108.1
chrRangeToJpeg_256_c: 2672.9
chrRangeToJpeg_256_sse4: 334.4
chrRangeToJpeg_256_avx2: 190.6
chrRangeToJpeg_512_c: 2500.9
chrRangeToJpeg_512_sse4: 654.1
chrRangeToJpeg_512_avx2: 379.6
lumRangeFromJpeg_8_c: 10.9
lumRangeFromJpeg_8_sse4: 12.4
lumRangeFromJpeg_8_avx2: 17.6
lumRangeFromJpeg_24_c: 38.4
lumRangeFromJpeg_24_sse4: 16.9
lumRangeFromJpeg_24_avx2: 20.6
lumRangeFromJpeg_128_c: 233.6
lumRangeFromJpeg_128_sse4: 79.9
lumRangeFromJpeg_128_avx2: 51.6
lumRangeFromJpeg_144_c: 263.9
lumRangeFromJpeg_144_sse4: 90.1
lumRangeFromJpeg_144_avx2: 57.6
lumRangeFromJpeg_256_c: 436.9
lumRangeFromJpeg_256_sse4: 162.1
lumRangeFromJpeg_256_avx2: 100.6
lumRangeFromJpeg_512_c: 878.4
lumRangeFromJpeg_512_sse4: 335.1
lumRangeFromJpeg_512_avx2: 199.4
lumRangeToJpeg_8_c: 19.1
lumRangeToJpeg_8_sse4: 11.6
lumRangeToJpeg_8_avx2: 17.6
lumRangeToJpeg_24_c: 56.9
lumRangeToJpeg_24_sse4: 17.6
lumRangeToJpeg_24_avx2: 21.4
lumRangeToJpeg_128_c: 335.9
lumRangeToJpeg_128_sse4: 79.1
lumRangeToJpeg_128_avx2: 48.9
lumRangeToJpeg_144_c: 372.9
lumRangeToJpeg_144_sse4: 91.6
lumRangeToJpeg_144_avx2: 55.4
lumRangeToJpeg_256_c: 651.9
lumRangeToJpeg_256_sse4: 163.6
lumRangeToJpeg_256_avx2: 99.1
lumRangeToJpeg_512_c: 1289.9
lumRangeToJpeg_512_sse4: 333.6
lumRangeToJpeg_512_avx2: 211.1
---
 libswscale/x86/range_convert.asm | 46 ++++++++++++++++++++++++++------
 libswscale/x86/swscale.c         |  5 +++-
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
index 13983a386b..54c2f64769 100644
--- a/libswscale/x86/range_convert.asm
+++ b/libswscale/x86/range_convert.asm
@@ -22,20 +22,20 @@
 
 SECTION_RODATA
 
-chr_to_mult:        times 4 dd 4663
-chr_to_offset:      times 4 dd -9289992
+chr_to_mult:        times 8 dd 4663
+chr_to_offset:      times 8 dd -9289992
 %define chr_to_shift 12
 
-chr_from_mult:      times 4 dd 1799
-chr_from_offset:    times 4 dd 4081085
+chr_from_mult:      times 8 dd 1799
+chr_from_offset:    times 8 dd 4081085
 %define chr_from_shift 11
 
-lum_to_mult:        times 4 dd 19077
-lum_to_offset:      times 4 dd -39057361
+lum_to_mult:        times 8 dd 19077
+lum_to_offset:      times 8 dd -39057361
 %define lum_to_shift 14
 
-lum_from_mult:      times 4 dd 14071
-lum_from_offset:    times 4 dd 33561947
+lum_from_mult:      times 8 dd 14071
+lum_from_offset:    times 8 dd 33561947
 %define lum_from_shift 14
 
 SECTION .text
@@ -66,10 +66,19 @@ cglobal %1, 2, 3, 3, dst, width, x
     paddd            m1, m5
     psrad            m0, %4
     psrad            m1, %4
+%if mmsize == 16
     packssdw         m0, m0
     packssdw         m1, m1
     movq    [dstq+xq*2], m0
     movq    [dstq+xq*2+mmsize/2], m1
+%else
+    vextracti128    xm7, ym0, 1
+    packssdw        xm0, xm7
+    vextracti128    xm7, ym1, 1
+    packssdw        xm1, xm7
+    movdqu  [dstq+xq*2], xm0
+    movdqu  [dstq+xq*2+mmsize/2], xm1
+%endif
     add              xq, mmsize / 2
     cmp              xd, widthd
     jl .loop
@@ -107,6 +116,7 @@ cglobal %1, 3, 4, 4, dstU, dstV, width, x
     psrad            m1, %4
     psrad            m2, %4
     psrad            m3, %4
+%if mmsize == 16
     packssdw         m0, m0
     packssdw         m1, m1
     packssdw         m2, m2
@@ -115,6 +125,20 @@ cglobal %1, 3, 4, 4, dstU, dstV, width, x
     movq   [dstUq+xq*2+mmsize/2], m1
     movq   [dstVq+xq*2], m2
     movq   [dstVq+xq*2+mmsize/2], m3
+%else
+    vextracti128    xm7, ym0, 1
+    packssdw        xm0, xm7
+    vextracti128    xm7, ym1, 1
+    packssdw        xm1, xm7
+    vextracti128    xm7, ym2, 1
+    packssdw        xm2, xm7
+    vextracti128    xm7, ym3, 1
+    packssdw        xm3, xm7
+    movdqu [dstUq+xq*2], xm0
+    movdqu [dstUq+xq*2+mmsize/2], xm1
+    movdqu [dstVq+xq*2], xm2
+    movdqu [dstVq+xq*2+mmsize/2], xm3
+%endif
     add              xq, mmsize / 2
     cmp              xd, widthd
     jl .loop
@@ -127,4 +151,10 @@ LUMCONVERTRANGE lumRangeToJpeg,   lum_to_mult,   lum_to_offset,   lum_to_shift
 CHRCONVERTRANGE chrRangeToJpeg,   chr_to_mult,   chr_to_offset,   chr_to_shift
 LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
 CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+
+INIT_YMM avx2
+LUMCONVERTRANGE lumRangeToJpeg,   lum_to_mult,   lum_to_offset,   lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg,   chr_to_mult,   chr_to_offset,   chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
 %endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 8f477b7b72..704e5f9c85 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -473,12 +473,15 @@ void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width);                     \
 void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);     \
 
 RANGE_CONVERT_FUNCS_DECL(sse4);
+RANGE_CONVERT_FUNCS_DECL(avx2);
 
 av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
 {
     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
         int cpu_flags = av_get_cpu_flags();
-        if (EXTERNAL_SSE4(cpu_flags)) {
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            RANGE_CONVERT_FUNCS(avx2);
+        } else if (EXTERNAL_SSE4(cpu_flags)) {
             RANGE_CONVERT_FUNCS(sse4);
         }
     }
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
  2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 3/4] swscale/x86: add avx2 " Ramiro Polla
@ 2024-06-11 12:28 ` Ramiro Polla
  2024-06-18 17:42   ` Ramiro Polla
  2024-06-14 15:45 ` [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for " Ramiro Polla
  3 siblings, 1 reply; 12+ messages in thread
From: Ramiro Polla @ 2024-06-11 12:28 UTC (permalink / raw)
  To: ffmpeg-devel

chrRangeFromJpeg_8_c: 29.2
chrRangeFromJpeg_8_neon: 19.5
chrRangeFromJpeg_24_c: 80.5
chrRangeFromJpeg_24_neon: 34.0
chrRangeFromJpeg_128_c: 413.7
chrRangeFromJpeg_128_neon: 156.0
chrRangeFromJpeg_144_c: 471.0
chrRangeFromJpeg_144_neon: 174.2
chrRangeFromJpeg_256_c: 842.0
chrRangeFromJpeg_256_neon: 305.5
chrRangeFromJpeg_512_c: 1699.0
chrRangeFromJpeg_512_neon: 608.0
chrRangeToJpeg_8_c: 51.7
chrRangeToJpeg_8_neon: 22.7
chrRangeToJpeg_24_c: 149.7
chrRangeToJpeg_24_neon: 38.0
chrRangeToJpeg_128_c: 761.7
chrRangeToJpeg_128_neon: 176.7
chrRangeToJpeg_144_c: 866.2
chrRangeToJpeg_144_neon: 198.7
chrRangeToJpeg_256_c: 1516.5
chrRangeToJpeg_256_neon: 348.7
chrRangeToJpeg_512_c: 3067.2
chrRangeToJpeg_512_neon: 692.7
lumRangeFromJpeg_8_c: 24.0
lumRangeFromJpeg_8_neon: 17.0
lumRangeFromJpeg_24_c: 56.7
lumRangeFromJpeg_24_neon: 21.0
lumRangeFromJpeg_128_c: 294.5
lumRangeFromJpeg_128_neon: 76.7
lumRangeFromJpeg_144_c: 332.5
lumRangeFromJpeg_144_neon: 86.7
lumRangeFromJpeg_256_c: 586.0
lumRangeFromJpeg_256_neon: 152.2
lumRangeFromJpeg_512_c: 1190.0
lumRangeFromJpeg_512_neon: 298.0
lumRangeToJpeg_8_c: 31.7
lumRangeToJpeg_8_neon: 19.5
lumRangeToJpeg_24_c: 83.5
lumRangeToJpeg_24_neon: 24.2
lumRangeToJpeg_128_c: 440.5
lumRangeToJpeg_128_neon: 91.0
lumRangeToJpeg_144_c: 504.2
lumRangeToJpeg_144_neon: 101.0
lumRangeToJpeg_256_c: 879.7
lumRangeToJpeg_256_neon: 177.2
lumRangeToJpeg_512_c: 1794.2
lumRangeToJpeg_512_neon: 354.0
---
 libswscale/aarch64/Makefile             |  1 +
 libswscale/aarch64/range_convert_neon.S | 99 +++++++++++++++++++++++++
 libswscale/aarch64/swscale.c            | 21 ++++++
 libswscale/swscale_internal.h           |  1 +
 libswscale/utils.c                      |  4 +-
 5 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/aarch64/range_convert_neon.S

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index adfd90a1b6..37ad960619 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -5,5 +5,6 @@ OBJS        += aarch64/rgb2rgb.o                \
 NEON-OBJS   += aarch64/hscale.o                 \
                aarch64/input.o                  \
                aarch64/output.o                 \
+               aarch64/range_convert_neon.o     \
                aarch64/rgb2rgb_neon.o           \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
new file mode 100644
index 0000000000..ea56dc2e32
--- /dev/null
+++ b/libswscale/aarch64/range_convert_neon.S
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro lumConvertRange name, max, mult, offset, shift
+function ff_\name, export=1
+.if \max != 0
+        mov             w3, #\max
+        dup             v24.8h, w3
+.endif
+        mov             w3, #\mult
+        dup             v25.4s, w3
+        movz            w3, \offset & 0xffff
+        movk            w3, (\offset >> 16) & 0xffff, lsl #16
+        dup             v26.4s, w3
+1:
+        ld1             {v0.8h}, [x0]
+.if \max != 0
+        smin            v0.8h, v0.8h, v24.8h
+.endif
+        mov             v16.16b, v26.16b
+        mov             v18.16b, v26.16b
+        sxtl            v20.4s, v0.4h
+        sxtl2           v22.4s, v0.8h
+        mla             v16.4s, v20.4s, v25.4s
+        mla             v18.4s, v22.4s, v25.4s
+        shrn            v0.4h, v16.4s, #\shift
+        shrn2           v0.8h, v18.4s, #\shift
+        subs            w1, w1, #8
+        st1             {v0.8h}, [x0], #16
+        b.gt            1b
+        ret
+endfunc
+.endm
+
+.macro chrConvertRange name, max, mult, offset, shift
+function ff_\name, export=1
+.if \max != 0
+        mov             w3, #\max
+        dup             v24.8h, w3
+.endif
+        mov             w3, #\mult
+        dup             v25.4s, w3
+        movz            w3, \offset & 0xffff
+        movk            w3, (\offset >> 16) & 0xffff, lsl #16
+        dup             v26.4s, w3
+1:
+        ld1             {v0.8h}, [x0]
+        ld1             {v1.8h}, [x1]
+.if \max != 0
+        smin            v0.8h, v0.8h, v24.8h
+        smin            v1.8h, v1.8h, v24.8h
+.endif
+        mov             v16.16b, v26.16b
+        mov             v17.16b, v26.16b
+        mov             v18.16b, v26.16b
+        mov             v19.16b, v26.16b
+        sxtl            v20.4s, v0.4h
+        sxtl            v21.4s, v1.4h
+        sxtl2           v22.4s, v0.8h
+        sxtl2           v23.4s, v1.8h
+        mla             v16.4s, v20.4s, v25.4s
+        mla             v17.4s, v21.4s, v25.4s
+        mla             v18.4s, v22.4s, v25.4s
+        mla             v19.4s, v23.4s, v25.4s
+        shrn            v0.4h, v16.4s, #\shift
+        shrn            v1.4h, v17.4s, #\shift
+        shrn2           v0.8h, v18.4s, #\shift
+        shrn2           v1.8h, v19.4s, #\shift
+        subs            w2, w2, #8
+        st1             {v0.8h}, [x0], #16
+        st1             {v1.8h}, [x1], #16
+        b.gt            1b
+        ret
+endfunc
+.endm
+
+lumConvertRange lumRangeToJpeg_neon,   30189, 19077, -39057361, 14
+chrConvertRange chrRangeToJpeg_neon,   30775,  4663,  -9289992, 12
+lumConvertRange lumRangeFromJpeg_neon,     0, 14071,  33561947, 14
+chrConvertRange chrRangeFromJpeg_neon,     0,  1799,   4081085, 11
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 4c4ea39dc1..e4ea3309ba 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -215,6 +215,26 @@ void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unuse
                        const uint8_t *src2, int width, uint32_t *rgb2yuv,
                        void *opq);
 
+void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
+void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
+void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+
+av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
+{
+    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+        if (c->dstBpc <= 14) {
+            if (c->srcRange) {
+                c->lumConvertRange = ff_lumRangeFromJpeg_neon;
+                c->chrConvertRange = ff_chrRangeFromJpeg_neon;
+            } else {
+                c->lumConvertRange = ff_lumRangeToJpeg_neon;
+                c->chrConvertRange = ff_chrRangeToJpeg_neon;
+            }
+        }
+    }
+}
+
 av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -237,5 +257,6 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
         default:
             break;
         }
+        ff_sws_init_range_convert_aarch64(c);
     }
 }
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index d5e7b5e71c..0818f50c7f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
 void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
+av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c);
 av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
 av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
 
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 8dfa57b5ff..12dba712c1 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1080,7 +1080,9 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
 
     if (need_reinit) {
         ff_sws_init_range_convert(c);
-#if ARCH_LOONGARCH64
+#if ARCH_AARCH64
+        ff_sws_init_range_convert_aarch64(c);
+#elif ARCH_LOONGARCH64
         ff_sws_init_range_convert_loongarch(c);
 #elif ARCH_X86
         ff_sws_init_range_convert_x86(c);
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
@ 2024-06-11 12:32   ` James Almer
  2024-06-11 18:26   ` Michael Niedermayer
  1 sibling, 0 replies; 12+ messages in thread
From: James Almer @ 2024-06-11 12:32 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/11/2024 9:28 AM, Ramiro Polla wrote:
> chrRangeFromJpeg_8_c: 28.7
> chrRangeFromJpeg_8_sse4: 16.2
> chrRangeFromJpeg_24_c: 152.7
> chrRangeFromJpeg_24_sse4: 29.7
> chrRangeFromJpeg_128_c: 366.5
> chrRangeFromJpeg_128_sse4: 233.0
> chrRangeFromJpeg_144_c: 408.0
> chrRangeFromJpeg_144_sse4: 182.5
> chrRangeFromJpeg_256_c: 698.7
> chrRangeFromJpeg_256_sse4: 325.5
> chrRangeFromJpeg_512_c: 1348.7
> chrRangeFromJpeg_512_sse4: 660.2
> chrRangeToJpeg_8_c: 37.7
> chrRangeToJpeg_8_sse4: 16.2
> chrRangeToJpeg_24_c: 115.7
> chrRangeToJpeg_24_sse4: 36.2
> chrRangeToJpeg_128_c: 631.2
> chrRangeToJpeg_128_sse4: 163.7
> chrRangeToJpeg_144_c: 710.7
> chrRangeToJpeg_144_sse4: 183.0
> chrRangeToJpeg_256_c: 1253.0
> chrRangeToJpeg_256_sse4: 343.5
> chrRangeToJpeg_512_c: 2491.2
> chrRangeToJpeg_512_sse4: 654.2
> lumRangeFromJpeg_8_c: 11.7
> lumRangeFromJpeg_8_sse4: 10.5
> lumRangeFromJpeg_24_c: 38.5
> lumRangeFromJpeg_24_sse4: 19.0
> lumRangeFromJpeg_128_c: 237.5
> lumRangeFromJpeg_128_sse4: 79.2
> lumRangeFromJpeg_144_c: 255.7
> lumRangeFromJpeg_144_sse4: 90.5
> lumRangeFromJpeg_256_c: 441.5
> lumRangeFromJpeg_256_sse4: 161.7
> lumRangeFromJpeg_512_c: 879.0
> lumRangeFromJpeg_512_sse4: 333.2
> lumRangeToJpeg_8_c: 20.0
> lumRangeToJpeg_8_sse4: 11.7
> lumRangeToJpeg_24_c: 61.5
> lumRangeToJpeg_24_sse4: 17.7
> lumRangeToJpeg_128_c: 357.5
> lumRangeToJpeg_128_sse4: 80.0
> lumRangeToJpeg_144_c: 371.5
> lumRangeToJpeg_144_sse4: 93.2
> lumRangeToJpeg_256_c: 651.5
> lumRangeToJpeg_256_sse4: 164.5
> lumRangeToJpeg_512_c: 1279.0
> lumRangeToJpeg_512_sse4: 333.7
> ---
>   libswscale/swscale_internal.h    |   1 +
>   libswscale/utils.c               |   2 +
>   libswscale/x86/Makefile          |   1 +
>   libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
>   libswscale/x86/swscale.c         |  36 +++++++++
>   5 files changed, 170 insertions(+)
>   create mode 100644 libswscale/x86/range_convert.asm
> 
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index 5007dd422f..d5e7b5e71c 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
>   
>   av_cold void ff_sws_init_range_convert(SwsContext *c);
>   av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
> +av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
>   
>   SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
>   SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index 476a24fea5..8dfa57b5ff 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
>           ff_sws_init_range_convert(c);
>   #if ARCH_LOONGARCH64
>           ff_sws_init_range_convert_loongarch(c);
> +#elif ARCH_X86
> +        ff_sws_init_range_convert_x86(c);
>   #endif
>       }
>   
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index 68391494be..f00154941d 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -12,6 +12,7 @@ X86ASM-OBJS                     += x86/input.o                          \
>                                      x86/output.o                         \
>                                      x86/scale.o                          \
>                                      x86/scale_avx2.o                          \
> +                                   x86/range_convert.o                  \
>                                      x86/rgb_2_rgb.o                      \
>                                      x86/yuv_2_rgb.o                      \
>                                      x86/yuv2yuvX.o                       \
> diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
> new file mode 100644
> index 0000000000..13983a386b
> --- /dev/null
> +++ b/libswscale/x86/range_convert.asm
> @@ -0,0 +1,130 @@
> +;******************************************************************************
> +;* Copyright (c) 2024 Ramiro Polla
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +chr_to_mult:        times 4 dd 4663
> +chr_to_offset:      times 4 dd -9289992
> +%define chr_to_shift 12
> +
> +chr_from_mult:      times 4 dd 1799
> +chr_from_offset:    times 4 dd 4081085
> +%define chr_from_shift 11
> +
> +lum_to_mult:        times 4 dd 19077
> +lum_to_offset:      times 4 dd -39057361
> +%define lum_to_shift 14
> +
> +lum_from_mult:      times 4 dd 14071
> +lum_from_offset:    times 4 dd 33561947
> +%define lum_from_shift 14
> +
> +SECTION .text
> +
> +; NOTE: there is no need to clamp the input when converting to jpeg range
> +;       (like we do in the C code) because packssdw will saturate the output.
> +
> +;-----------------------------------------------------------------------------
> +; lumConvertRange
> +;
> +; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
> +; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
> +;
> +;-----------------------------------------------------------------------------
> +
> +%macro LUMCONVERTRANGE 4
> +cglobal %1, 2, 3, 3, dst, width, x
> +    movsxdifnidn widthq, widthd
> +    xor              xq, xq
> +    mova             m4, [%2]
> +    mova             m5, [%3]
> +.loop:
> +    pmovsxwd         m0, [dstq+xq*2]
> +    pmovsxwd         m1, [dstq+xq*2+mmsize/2]
> +    pmulld           m0, m4
> +    pmulld           m1, m4

Can't you use pmaddwd without sign extending to dword instead? pmulld is 
pretty slow.

> +    paddd            m0, m5
> +    paddd            m1, m5
> +    psrad            m0, %4
> +    psrad            m1, %4
> +    packssdw         m0, m0
> +    packssdw         m1, m1
> +    movq    [dstq+xq*2], m0
> +    movq    [dstq+xq*2+mmsize/2], m1
> +    add              xq, mmsize / 2
> +    cmp              xd, widthd
> +    jl .loop
> +    RET
> +%endmacro
> +
> +;-----------------------------------------------------------------------------
> +; chrConvertRange
> +;
> +; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
> +; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
> +;
> +;-----------------------------------------------------------------------------
> +
> +%macro CHRCONVERTRANGE 4
> +cglobal %1, 3, 4, 4, dstU, dstV, width, x
> +    movsxdifnidn widthq, widthd
> +    xor              xq, xq
> +    mova             m4, [%2]
> +    mova             m5, [%3]
> +.loop:
> +    pmovsxwd         m0, [dstUq+xq*2]
> +    pmovsxwd         m1, [dstUq+xq*2+mmsize/2]
> +    pmovsxwd         m2, [dstVq+xq*2]
> +    pmovsxwd         m3, [dstVq+xq*2+mmsize/2]
> +    pmulld           m0, m4
> +    pmulld           m1, m4
> +    pmulld           m2, m4
> +    pmulld           m3, m4
> +    paddd            m0, m5
> +    paddd            m1, m5
> +    paddd            m2, m5
> +    paddd            m3, m5
> +    psrad            m0, %4
> +    psrad            m1, %4
> +    psrad            m2, %4
> +    psrad            m3, %4
> +    packssdw         m0, m0
> +    packssdw         m1, m1
> +    packssdw         m2, m2
> +    packssdw         m3, m3
> +    movq   [dstUq+xq*2], m0
> +    movq   [dstUq+xq*2+mmsize/2], m1
> +    movq   [dstVq+xq*2], m2
> +    movq   [dstVq+xq*2+mmsize/2], m3
> +    add              xq, mmsize / 2
> +    cmp              xd, widthd
> +    jl .loop
> +    RET
> +%endmacro
> +
> +%if ARCH_X86_64
> +INIT_XMM sse4
> +LUMCONVERTRANGE lumRangeToJpeg,   lum_to_mult,   lum_to_offset,   lum_to_shift
> +CHRCONVERTRANGE chrRangeToJpeg,   chr_to_mult,   chr_to_offset,   chr_to_shift
> +LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
> +CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
> +%endif
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 5a9da23265..8f477b7b72 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -453,6 +453,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
>   INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
>   #endif
>   
> +#if ARCH_X86_64
> +#define RANGE_CONVERT_FUNCS(opt) do {                                       \
> +    if (c->dstBpc <= 14) {                                                  \
> +        if (c->srcRange) {                                                  \
> +            c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt;                \
> +            c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt;                \
> +        } else {                                                            \
> +            c->lumConvertRange = ff_lumRangeToJpeg_ ##opt;                  \
> +            c->chrConvertRange = ff_chrRangeToJpeg_ ##opt;                  \
> +        }                                                                   \
> +    }                                                                       \
> +} while (0)
> +
> +#define RANGE_CONVERT_FUNCS_DECL(opt)                                       \
> +void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width);                   \
> +void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);   \
> +void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width);                     \
> +void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);     \
> +
> +RANGE_CONVERT_FUNCS_DECL(sse4);
> +
> +av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
> +{
> +    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> +        int cpu_flags = av_get_cpu_flags();
> +        if (EXTERNAL_SSE4(cpu_flags)) {
> +            RANGE_CONVERT_FUNCS(sse4);
> +        }
> +    }
> +}
> +#endif
> +
>   av_cold void ff_sws_init_swscale_x86(SwsContext *c)
>   {
>       int cpu_flags = av_get_cpu_flags();
> @@ -820,4 +852,8 @@ switch(c->dstBpc){ \
>       }
>   
>   #endif
> +
> +#if ARCH_X86_64
> +    ff_sws_init_range_convert_x86(c);
> +#endif
>   }
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
  2024-06-11 12:32   ` James Almer
@ 2024-06-11 18:26   ` Michael Niedermayer
  2024-06-11 18:43     ` James Almer
  1 sibling, 1 reply; 12+ messages in thread
From: Michael Niedermayer @ 2024-06-11 18:26 UTC (permalink / raw)
  To: FFmpeg development discussions and patches


[-- Attachment #1.1: Type: text/plain, Size: 2648 bytes --]

On Tue, Jun 11, 2024 at 02:28:56PM +0200, Ramiro Polla wrote:
> chrRangeFromJpeg_8_c: 28.7
> chrRangeFromJpeg_8_sse4: 16.2
> chrRangeFromJpeg_24_c: 152.7
> chrRangeFromJpeg_24_sse4: 29.7
> chrRangeFromJpeg_128_c: 366.5
> chrRangeFromJpeg_128_sse4: 233.0
> chrRangeFromJpeg_144_c: 408.0
> chrRangeFromJpeg_144_sse4: 182.5
> chrRangeFromJpeg_256_c: 698.7
> chrRangeFromJpeg_256_sse4: 325.5
> chrRangeFromJpeg_512_c: 1348.7
> chrRangeFromJpeg_512_sse4: 660.2
> chrRangeToJpeg_8_c: 37.7
> chrRangeToJpeg_8_sse4: 16.2
> chrRangeToJpeg_24_c: 115.7
> chrRangeToJpeg_24_sse4: 36.2
> chrRangeToJpeg_128_c: 631.2
> chrRangeToJpeg_128_sse4: 163.7
> chrRangeToJpeg_144_c: 710.7
> chrRangeToJpeg_144_sse4: 183.0
> chrRangeToJpeg_256_c: 1253.0
> chrRangeToJpeg_256_sse4: 343.5
> chrRangeToJpeg_512_c: 2491.2
> chrRangeToJpeg_512_sse4: 654.2
> lumRangeFromJpeg_8_c: 11.7
> lumRangeFromJpeg_8_sse4: 10.5
> lumRangeFromJpeg_24_c: 38.5
> lumRangeFromJpeg_24_sse4: 19.0
> lumRangeFromJpeg_128_c: 237.5
> lumRangeFromJpeg_128_sse4: 79.2
> lumRangeFromJpeg_144_c: 255.7
> lumRangeFromJpeg_144_sse4: 90.5
> lumRangeFromJpeg_256_c: 441.5
> lumRangeFromJpeg_256_sse4: 161.7
> lumRangeFromJpeg_512_c: 879.0
> lumRangeFromJpeg_512_sse4: 333.2
> lumRangeToJpeg_8_c: 20.0
> lumRangeToJpeg_8_sse4: 11.7
> lumRangeToJpeg_24_c: 61.5
> lumRangeToJpeg_24_sse4: 17.7
> lumRangeToJpeg_128_c: 357.5
> lumRangeToJpeg_128_sse4: 80.0
> lumRangeToJpeg_144_c: 371.5
> lumRangeToJpeg_144_sse4: 93.2
> lumRangeToJpeg_256_c: 651.5
> lumRangeToJpeg_256_sse4: 164.5
> lumRangeToJpeg_512_c: 1279.0
> lumRangeToJpeg_512_sse4: 333.7
> ---
>  libswscale/swscale_internal.h    |   1 +
>  libswscale/utils.c               |   2 +
>  libswscale/x86/Makefile          |   1 +
>  libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
>  libswscale/x86/swscale.c         |  36 +++++++++
>  5 files changed, 170 insertions(+)
>  create mode 100644 libswscale/x86/range_convert.asm

breaks x86-32 build

LD	ffmpeg_g
/usr/lib/gcc-cross/i686-linux-gnu/7/../../../../i686-linux-gnu/bin/ld: libswscale/libswscale.a(utils.o): in function `sws_setColorspaceDetails':
ffmpeg/linux32/src/libswscale/utils.c:1086: undefined reference to `ff_sws_init_range_convert_x86'
collect2: error: ld returned 1 exit status
make: *** [Makefile:139: ffmpeg_g] Error 1

thx

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Whats the most studid thing your enemy could do ? Blow himself up
Whats the most studid thing you could do ? Give up your rights and
freedom because your enemy blew himself up.


[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
  2024-06-11 18:26   ` Michael Niedermayer
@ 2024-06-11 18:43     ` James Almer
  2024-06-12 14:54       ` Ramiro Polla
  0 siblings, 1 reply; 12+ messages in thread
From: James Almer @ 2024-06-11 18:43 UTC (permalink / raw)
  To: ffmpeg-devel

On 6/11/2024 3:26 PM, Michael Niedermayer wrote:
> On Tue, Jun 11, 2024 at 02:28:56PM +0200, Ramiro Polla wrote:
>> chrRangeFromJpeg_8_c: 28.7
>> chrRangeFromJpeg_8_sse4: 16.2
>> chrRangeFromJpeg_24_c: 152.7
>> chrRangeFromJpeg_24_sse4: 29.7
>> chrRangeFromJpeg_128_c: 366.5
>> chrRangeFromJpeg_128_sse4: 233.0
>> chrRangeFromJpeg_144_c: 408.0
>> chrRangeFromJpeg_144_sse4: 182.5
>> chrRangeFromJpeg_256_c: 698.7
>> chrRangeFromJpeg_256_sse4: 325.5
>> chrRangeFromJpeg_512_c: 1348.7
>> chrRangeFromJpeg_512_sse4: 660.2
>> chrRangeToJpeg_8_c: 37.7
>> chrRangeToJpeg_8_sse4: 16.2
>> chrRangeToJpeg_24_c: 115.7
>> chrRangeToJpeg_24_sse4: 36.2
>> chrRangeToJpeg_128_c: 631.2
>> chrRangeToJpeg_128_sse4: 163.7
>> chrRangeToJpeg_144_c: 710.7
>> chrRangeToJpeg_144_sse4: 183.0
>> chrRangeToJpeg_256_c: 1253.0
>> chrRangeToJpeg_256_sse4: 343.5
>> chrRangeToJpeg_512_c: 2491.2
>> chrRangeToJpeg_512_sse4: 654.2
>> lumRangeFromJpeg_8_c: 11.7
>> lumRangeFromJpeg_8_sse4: 10.5
>> lumRangeFromJpeg_24_c: 38.5
>> lumRangeFromJpeg_24_sse4: 19.0
>> lumRangeFromJpeg_128_c: 237.5
>> lumRangeFromJpeg_128_sse4: 79.2
>> lumRangeFromJpeg_144_c: 255.7
>> lumRangeFromJpeg_144_sse4: 90.5
>> lumRangeFromJpeg_256_c: 441.5
>> lumRangeFromJpeg_256_sse4: 161.7
>> lumRangeFromJpeg_512_c: 879.0
>> lumRangeFromJpeg_512_sse4: 333.2
>> lumRangeToJpeg_8_c: 20.0
>> lumRangeToJpeg_8_sse4: 11.7
>> lumRangeToJpeg_24_c: 61.5
>> lumRangeToJpeg_24_sse4: 17.7
>> lumRangeToJpeg_128_c: 357.5
>> lumRangeToJpeg_128_sse4: 80.0
>> lumRangeToJpeg_144_c: 371.5
>> lumRangeToJpeg_144_sse4: 93.2
>> lumRangeToJpeg_256_c: 651.5
>> lumRangeToJpeg_256_sse4: 164.5
>> lumRangeToJpeg_512_c: 1279.0
>> lumRangeToJpeg_512_sse4: 333.7
>> ---
>>   libswscale/swscale_internal.h    |   1 +
>>   libswscale/utils.c               |   2 +
>>   libswscale/x86/Makefile          |   1 +
>>   libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
>>   libswscale/x86/swscale.c         |  36 +++++++++
>>   5 files changed, 170 insertions(+)
>>   create mode 100644 libswscale/x86/range_convert.asm
> 
> breaks x86-32 build
> 
> LD	ffmpeg_g
> /usr/lib/gcc-cross/i686-linux-gnu/7/../../../../i686-linux-gnu/bin/ld: libswscale/libswscale.a(utils.o): in function `sws_setColorspaceDetails':
> ffmpeg/linux32/src/libswscale/utils.c:1086: undefined reference to `ff_sws_init_range_convert_x86'
> collect2: error: ld returned 1 exit status
> make: *** [Makefile:139: ffmpeg_g] Error 1
> 
> thx

The functions are wrapped in ARCH_X86_64 checks for seemingly no reason, 
so they should be removed in the next iteration.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
  2024-06-11 18:43     ` James Almer
@ 2024-06-12 14:54       ` Ramiro Polla
  2024-06-14 15:46         ` Ramiro Polla
  0 siblings, 1 reply; 12+ messages in thread
From: Ramiro Polla @ 2024-06-12 14:54 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 3015 bytes --]

Hi,

On Tue, Jun 11, 2024 at 8:42 PM James Almer <jamrial@gmail.com> wrote:
>
> On 6/11/2024 3:26 PM, Michael Niedermayer wrote:
> > On Tue, Jun 11, 2024 at 02:28:56PM +0200, Ramiro Polla wrote:
> >> chrRangeFromJpeg_8_c: 28.7
> >> chrRangeFromJpeg_8_sse4: 16.2
> >> chrRangeFromJpeg_24_c: 152.7
> >> chrRangeFromJpeg_24_sse4: 29.7
> >> chrRangeFromJpeg_128_c: 366.5
> >> chrRangeFromJpeg_128_sse4: 233.0
> >> chrRangeFromJpeg_144_c: 408.0
> >> chrRangeFromJpeg_144_sse4: 182.5
> >> chrRangeFromJpeg_256_c: 698.7
> >> chrRangeFromJpeg_256_sse4: 325.5
> >> chrRangeFromJpeg_512_c: 1348.7
> >> chrRangeFromJpeg_512_sse4: 660.2
> >> chrRangeToJpeg_8_c: 37.7
> >> chrRangeToJpeg_8_sse4: 16.2
> >> chrRangeToJpeg_24_c: 115.7
> >> chrRangeToJpeg_24_sse4: 36.2
> >> chrRangeToJpeg_128_c: 631.2
> >> chrRangeToJpeg_128_sse4: 163.7
> >> chrRangeToJpeg_144_c: 710.7
> >> chrRangeToJpeg_144_sse4: 183.0
> >> chrRangeToJpeg_256_c: 1253.0
> >> chrRangeToJpeg_256_sse4: 343.5
> >> chrRangeToJpeg_512_c: 2491.2
> >> chrRangeToJpeg_512_sse4: 654.2
> >> lumRangeFromJpeg_8_c: 11.7
> >> lumRangeFromJpeg_8_sse4: 10.5
> >> lumRangeFromJpeg_24_c: 38.5
> >> lumRangeFromJpeg_24_sse4: 19.0
> >> lumRangeFromJpeg_128_c: 237.5
> >> lumRangeFromJpeg_128_sse4: 79.2
> >> lumRangeFromJpeg_144_c: 255.7
> >> lumRangeFromJpeg_144_sse4: 90.5
> >> lumRangeFromJpeg_256_c: 441.5
> >> lumRangeFromJpeg_256_sse4: 161.7
> >> lumRangeFromJpeg_512_c: 879.0
> >> lumRangeFromJpeg_512_sse4: 333.2
> >> lumRangeToJpeg_8_c: 20.0
> >> lumRangeToJpeg_8_sse4: 11.7
> >> lumRangeToJpeg_24_c: 61.5
> >> lumRangeToJpeg_24_sse4: 17.7
> >> lumRangeToJpeg_128_c: 357.5
> >> lumRangeToJpeg_128_sse4: 80.0
> >> lumRangeToJpeg_144_c: 371.5
> >> lumRangeToJpeg_144_sse4: 93.2
> >> lumRangeToJpeg_256_c: 651.5
> >> lumRangeToJpeg_256_sse4: 164.5
> >> lumRangeToJpeg_512_c: 1279.0
> >> lumRangeToJpeg_512_sse4: 333.7
> >> ---
> >>   libswscale/swscale_internal.h    |   1 +
> >>   libswscale/utils.c               |   2 +
> >>   libswscale/x86/Makefile          |   1 +
> >>   libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
> >>   libswscale/x86/swscale.c         |  36 +++++++++
> >>   5 files changed, 170 insertions(+)
> >>   create mode 100644 libswscale/x86/range_convert.asm
> >
> > breaks x86-32 build
> >
> > LD    ffmpeg_g
> > /usr/lib/gcc-cross/i686-linux-gnu/7/../../../../i686-linux-gnu/bin/ld: libswscale/libswscale.a(utils.o): in function `sws_setColorspaceDetails':
> > ffmpeg/linux32/src/libswscale/utils.c:1086: undefined reference to `ff_sws_init_range_convert_x86'
> > collect2: error: ld returned 1 exit status
> > make: *** [Makefile:139: ffmpeg_g] Error 1
> >
> > thx
>
> The functions are wrapped in ARCH_X86_64 checks for seemingly no reason,
> so they should be removed in the next iteration.

Fixed.

James walked me through on IRC to optimize and improve the functions
in a way that they work both with sse2 and avx2. New patch attached.

[-- Attachment #2: 0001-swscale-x86-add-sse2-and-avx2-lum-chr-ConvertRange.patch --]
[-- Type: text/x-patch, Size: 11420 bytes --]

From 9e49e72f6766e96cc06bec869fb776fff4c477bf Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Thu, 6 Jun 2024 18:33:34 +0200
Subject: [PATCH] swscale/x86: add sse2 and avx2 {lum,chr}ConvertRange

chrRangeFromJpeg_8_c: 22.3
chrRangeFromJpeg_8_sse2: 13.3
chrRangeFromJpeg_8_avx2: 13.3
chrRangeFromJpeg_24_c: 72.8
chrRangeFromJpeg_24_sse2: 22.3
chrRangeFromJpeg_24_avx2: 17.5
chrRangeFromJpeg_128_c: 345.5
chrRangeFromJpeg_128_sse2: 106.0
chrRangeFromJpeg_128_avx2: 57.8
chrRangeFromJpeg_144_c: 380.5
chrRangeFromJpeg_144_sse2: 118.5
chrRangeFromJpeg_144_avx2: 62.3
chrRangeFromJpeg_256_c: 646.3
chrRangeFromJpeg_256_sse2: 218.8
chrRangeFromJpeg_256_avx2: 109.0
chrRangeFromJpeg_512_c: 1461.5
chrRangeFromJpeg_512_sse2: 426.5
chrRangeFromJpeg_512_avx2: 211.5
chrRangeToJpeg_8_c: 37.8
chrRangeToJpeg_8_sse2: 10.5
chrRangeToJpeg_8_avx2: 14.0
chrRangeToJpeg_24_c: 114.3
chrRangeToJpeg_24_sse2: 23.5
chrRangeToJpeg_24_avx2: 16.3
chrRangeToJpeg_128_c: 633.5
chrRangeToJpeg_128_sse2: 107.5
chrRangeToJpeg_128_avx2: 55.0
chrRangeToJpeg_144_c: 758.3
chrRangeToJpeg_144_sse2: 132.0
chrRangeToJpeg_144_avx2: 64.5
chrRangeToJpeg_256_c: 1345.0
chrRangeToJpeg_256_sse2: 218.0
chrRangeToJpeg_256_avx2: 105.3
chrRangeToJpeg_512_c: 2524.0
chrRangeToJpeg_512_sse2: 417.0
chrRangeToJpeg_512_avx2: 218.8
lumRangeFromJpeg_8_c: 11.8
lumRangeFromJpeg_8_sse2: 11.0
lumRangeFromJpeg_8_avx2: 10.3
lumRangeFromJpeg_24_c: 38.5
lumRangeFromJpeg_24_sse2: 15.5
lumRangeFromJpeg_24_avx2: 12.5
lumRangeFromJpeg_128_c: 232.3
lumRangeFromJpeg_128_sse2: 60.0
lumRangeFromJpeg_128_avx2: 26.8
lumRangeFromJpeg_144_c: 259.5
lumRangeFromJpeg_144_sse2: 65.3
lumRangeFromJpeg_144_avx2: 29.0
lumRangeFromJpeg_256_c: 464.5
lumRangeFromJpeg_256_sse2: 107.5
lumRangeFromJpeg_256_avx2: 54.0
lumRangeFromJpeg_512_c: 897.5
lumRangeFromJpeg_512_sse2: 224.5
lumRangeFromJpeg_512_avx2: 109.8
lumRangeToJpeg_8_c: 17.8
lumRangeToJpeg_8_sse2: 11.0
lumRangeToJpeg_8_avx2: 11.8
lumRangeToJpeg_24_c: 56.3
lumRangeToJpeg_24_sse2: 11.0
lumRangeToJpeg_24_avx2: 12.5
lumRangeToJpeg_128_c: 333.8
lumRangeToJpeg_128_sse2: 53.3
lumRangeToJpeg_128_avx2: 26.5
lumRangeToJpeg_144_c: 375.5
lumRangeToJpeg_144_sse2: 60.8
lumRangeToJpeg_144_avx2: 29.0
lumRangeToJpeg_256_c: 652.0
lumRangeToJpeg_256_sse2: 109.5
lumRangeToJpeg_256_avx2: 53.5
lumRangeToJpeg_512_c: 1284.3
lumRangeToJpeg_512_sse2: 218.0
lumRangeToJpeg_512_avx2: 108.3
---
 libswscale/swscale_internal.h    |   1 +
 libswscale/utils.c               |   2 +
 libswscale/x86/Makefile          |   1 +
 libswscale/x86/range_convert.asm | 134 +++++++++++++++++++++++++++++++
 libswscale/x86/swscale.c         |  35 ++++++++
 5 files changed, 173 insertions(+)
 create mode 100644 libswscale/x86/range_convert.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 5007dd422f..d5e7b5e71c 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
 av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 476a24fea5..8dfa57b5ff 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
         ff_sws_init_range_convert(c);
 #if ARCH_LOONGARCH64
         ff_sws_init_range_convert_loongarch(c);
+#elif ARCH_X86
+        ff_sws_init_range_convert_x86(c);
 #endif
     }
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 68391494be..f00154941d 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -12,6 +12,7 @@ X86ASM-OBJS                     += x86/input.o                          \
                                    x86/output.o                         \
                                    x86/scale.o                          \
                                    x86/scale_avx2.o                          \
+                                   x86/range_convert.o                  \
                                    x86/rgb_2_rgb.o                      \
                                    x86/yuv_2_rgb.o                      \
                                    x86/yuv2yuvX.o                       \
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
new file mode 100644
index 0000000000..ae51e9d573
--- /dev/null
+++ b/libswscale/x86/range_convert.asm
@@ -0,0 +1,134 @@
+;******************************************************************************
+;* Copyright (c) 2024 Ramiro Polla
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+chr_to_mult:        times 4 dw 4663, 0
+chr_to_offset:      times 4 dd -9289992
+%define chr_to_shift 12
+
+chr_from_mult:      times 4 dw 1799, 0
+chr_from_offset:    times 4 dd 4081085
+%define chr_from_shift 11
+
+lum_to_mult:        times 4 dw 19077, 0
+lum_to_offset:      times 4 dd -39057361
+%define lum_to_shift 14
+
+lum_from_mult:      times 4 dw 14071, 0
+lum_from_offset:    times 4 dd 33561947
+%define lum_from_shift 14
+
+SECTION .text
+
+; NOTE: there is no need to clamp the input when converting to jpeg range
+;       (like we do in the C code) because packssdw will saturate the output.
+
+;-----------------------------------------------------------------------------
+; lumConvertRange
+;
+; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
+; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro LUMCONVERTRANGE 4
+cglobal %1, 2, 2, 7, dst, width
+    shl          widthd, 1
+    VBROADCASTI128   m4, [%2]
+    VBROADCASTI128   m5, [%3]
+    pxor             m6, m6
+    add            dstq, widthq
+    neg          widthq
+.loop:
+    movu             m0, [dstq+widthq]
+    punpckhwd        m1, m0, m6
+    punpcklwd        m0, m6
+    pmaddwd          m0, m4
+    pmaddwd          m1, m4
+    paddd            m0, m5
+    paddd            m1, m5
+    psrad            m0, %4
+    psrad            m1, %4
+    packssdw         m0, m1
+    movu  [dstq+widthq], m0
+    add          widthq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; chrConvertRange
+;
+; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro CHRCONVERTRANGE 4
+cglobal %1, 3, 3, 7, dstU, dstV, width
+    shl          widthd, 1
+    VBROADCASTI128   m4, [%2]
+    VBROADCASTI128   m5, [%3]
+    pxor             m6, m6
+    add           dstUq, widthq
+    add           dstVq, widthq
+    neg          widthq
+.loop:
+    movu             m0, [dstUq+widthq]
+    movu             m2, [dstVq+widthq]
+    punpckhwd        m1, m0, m6
+    punpckhwd        m3, m2, m6
+    punpcklwd        m0, m6
+    punpcklwd        m2, m6
+    pmaddwd          m0, m4
+    pmaddwd          m1, m4
+    pmaddwd          m2, m4
+    pmaddwd          m3, m4
+    paddd            m0, m5
+    paddd            m1, m5
+    paddd            m2, m5
+    paddd            m3, m5
+    psrad            m0, %4
+    psrad            m1, %4
+    psrad            m2, %4
+    psrad            m3, %4
+    packssdw         m0, m1
+    packssdw         m2, m3
+    movu [dstUq+widthq], m0
+    movu [dstVq+widthq], m2
+    add          widthq, mmsize
+    jl .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+LUMCONVERTRANGE lumRangeToJpeg,   lum_to_mult,   lum_to_offset,   lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg,   chr_to_mult,   chr_to_offset,   chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+
+INIT_YMM avx2
+LUMCONVERTRANGE lumRangeToJpeg,   lum_to_mult,   lum_to_offset,   lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg,   chr_to_mult,   chr_to_offset,   chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 5a9da23265..ad7f67f90e 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -453,6 +453,39 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
 INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
 #endif
 
+#define RANGE_CONVERT_FUNCS(opt) do {                                       \
+    if (c->dstBpc <= 14) {                                                  \
+        if (c->srcRange) {                                                  \
+            c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt;                \
+            c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt;                \
+        } else {                                                            \
+            c->lumConvertRange = ff_lumRangeToJpeg_ ##opt;                  \
+            c->chrConvertRange = ff_chrRangeToJpeg_ ##opt;                  \
+        }                                                                   \
+    }                                                                       \
+} while (0)
+
+#define RANGE_CONVERT_FUNCS_DECL(opt)                                       \
+void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width);                   \
+void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);   \
+void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width);                     \
+void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);     \
+
+RANGE_CONVERT_FUNCS_DECL(sse2);
+RANGE_CONVERT_FUNCS_DECL(avx2);
+
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
+{
+    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+        int cpu_flags = av_get_cpu_flags();
+        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+            RANGE_CONVERT_FUNCS(avx2);
+        } else if (EXTERNAL_SSE2(cpu_flags)) {
+            RANGE_CONVERT_FUNCS(sse2);
+        }
+    }
+}
+
 av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -820,4 +853,6 @@ switch(c->dstBpc){ \
     }
 
 #endif
+
+    ff_sws_init_range_convert_x86(c);
 }
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange
  2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
                   ` (2 preceding siblings ...)
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon " Ramiro Polla
@ 2024-06-14 15:45 ` Ramiro Polla
  3 siblings, 0 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-14 15:45 UTC (permalink / raw)
  To: ffmpeg-devel

On Tue, Jun 11, 2024 at 2:29 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> ---
>  tests/checkasm/Makefile           |   2 +-
>  tests/checkasm/checkasm.c         |   1 +
>  tests/checkasm/checkasm.h         |   1 +
>  tests/checkasm/sw_range_convert.c | 134 ++++++++++++++++++++++++++++++
>  4 files changed, 137 insertions(+), 1 deletion(-)
>  create mode 100644 tests/checkasm/sw_range_convert.c
>
> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index 6eb94d10d5..f20732b37a 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -63,7 +63,7 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
>  CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
>
>  # swscale tests
> -SWSCALEOBJS                             += sw_gbrp.o sw_rgb.o sw_scale.o
> +SWSCALEOBJS                             += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o
>
>  CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
>
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index 2329e2e1bc..56232ab1e0 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -251,6 +251,7 @@ static const struct {
>  #endif
>  #if CONFIG_SWSCALE
>      { "sw_gbrp", checkasm_check_sw_gbrp },
> +    { "sw_range_convert", checkasm_check_sw_range_convert },
>      { "sw_rgb", checkasm_check_sw_rgb },
>      { "sw_scale", checkasm_check_sw_scale },
>  #endif
> diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> index 211d7f52e6..e544007b67 100644
> --- a/tests/checkasm/checkasm.h
> +++ b/tests/checkasm/checkasm.h
> @@ -119,6 +119,7 @@ void checkasm_check_rv40dsp(void);
>  void checkasm_check_svq1enc(void);
>  void checkasm_check_synth_filter(void);
>  void checkasm_check_sw_gbrp(void);
> +void checkasm_check_sw_range_convert(void);
>  void checkasm_check_sw_rgb(void);
>  void checkasm_check_sw_scale(void);
>  void checkasm_check_takdsp(void);
> diff --git a/tests/checkasm/sw_range_convert.c b/tests/checkasm/sw_range_convert.c
> new file mode 100644
> index 0000000000..08029103d1
> --- /dev/null
> +++ b/tests/checkasm/sw_range_convert.c
> @@ -0,0 +1,134 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +
> +#include "libavutil/common.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/mem_internal.h"
> +
> +#include "libswscale/swscale.h"
> +#include "libswscale/swscale_internal.h"
> +
> +#include "checkasm.h"
> +
> +static void check_lumConvertRange(int from)
> +{
> +    const char *func_str = from ? "lumRangeFromJpeg" : "lumRangeToJpeg";
> +#define LARGEST_INPUT_SIZE 512
> +#define INPUT_SIZES 6
> +    static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
> +    struct SwsContext *ctx;
> +
> +    LOCAL_ALIGNED_32(int16_t, dst0, [LARGEST_INPUT_SIZE]);
> +    LOCAL_ALIGNED_32(int16_t, dst1, [LARGEST_INPUT_SIZE]);
> +
> +    declare_func(void, int16_t *dst, int width);
> +
> +    ctx = sws_alloc_context();
> +    if (sws_init_context(ctx, NULL, NULL) < 0)
> +        fail();
> +
> +    ctx->srcFormat = from ? AV_PIX_FMT_YUVJ444P : AV_PIX_FMT_YUV444P;
> +    ctx->dstFormat = from ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
> +    ctx->srcRange = from;
> +    ctx->dstRange = !from;
> +
> +    for (int dstWi = 0; dstWi < INPUT_SIZES; dstWi++) {
> +        int width = input_sizes[dstWi];
> +        for (int i = 0; i < width; i++) {
> +            uint8_t r = rnd();
> +            dst0[i] = (int16_t) r << 7;
> +            dst1[i] = (int16_t) r << 7;
> +        }
> +        ff_sws_init_scale(ctx);
> +        if (check_func(ctx->lumConvertRange, "%s_%d", func_str, width)) {
> +            call_ref(dst0, width);
> +            call_new(dst1, width);
> +            if (memcmp(dst0, dst1, width * sizeof(int16_t)))
> +                fail();
> +            bench_new(dst1, width);
> +        }
> +    }
> +
> +    sws_freeContext(ctx);
> +}
> +#undef LARGEST_INPUT_SIZE
> +#undef INPUT_SIZES
> +
> +static void check_chrConvertRange(int from)
> +{
> +    const char *func_str = from ? "chrRangeFromJpeg" : "chrRangeToJpeg";
> +#define LARGEST_INPUT_SIZE 512
> +#define INPUT_SIZES 6
> +    static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
> +    struct SwsContext *ctx;
> +
> +    LOCAL_ALIGNED_32(int16_t, dstU0, [LARGEST_INPUT_SIZE]);
> +    LOCAL_ALIGNED_32(int16_t, dstV0, [LARGEST_INPUT_SIZE]);
> +    LOCAL_ALIGNED_32(int16_t, dstU1, [LARGEST_INPUT_SIZE]);
> +    LOCAL_ALIGNED_32(int16_t, dstV1, [LARGEST_INPUT_SIZE]);
> +
> +    declare_func(void, int16_t *dstU, int16_t *dstV, int width);
> +
> +    ctx = sws_alloc_context();
> +    if (sws_init_context(ctx, NULL, NULL) < 0)
> +        fail();
> +
> +    ctx->srcFormat = from ? AV_PIX_FMT_YUVJ444P : AV_PIX_FMT_YUV444P;
> +    ctx->dstFormat = from ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
> +    ctx->srcRange = from;
> +    ctx->dstRange = !from;
> +
> +    for (int dstWi = 0; dstWi < INPUT_SIZES; dstWi++) {
> +        int width = input_sizes[dstWi];
> +        for (int i = 0; i < width; i++) {
> +            uint8_t r = rnd();
> +            dstU0[i] = (int16_t) r << 7;
> +            dstV0[i] = (int16_t) r << 7;
> +            dstU1[i] = (int16_t) r << 7;
> +            dstV1[i] = (int16_t) r << 7;
> +        }
> +        ff_sws_init_scale(ctx);
> +        if (check_func(ctx->chrConvertRange, "%s_%d", func_str, width)) {
> +            call_ref(dstU0, dstV0, width);
> +            call_new(dstU1, dstV1, width);
> +            if (memcmp(dstU0, dstU1, width * sizeof(int16_t)) ||
> +                memcmp(dstV0, dstV1, width * sizeof(int16_t)))
> +                fail();
> +            bench_new(dstU1, dstV1, width);
> +        }
> +    }
> +
> +    sws_freeContext(ctx);
> +}
> +#undef LARGEST_INPUT_SIZE
> +#undef INPUT_SIZES
> +
> +void checkasm_check_sw_range_convert(void)
> +{
> +    check_lumConvertRange(1);
> +    report("lumRangeFromJpeg");
> +    check_chrConvertRange(1);
> +    report("chrRangeFromJpeg");
> +    check_lumConvertRange(0);
> +    report("lumRangeToJpeg");
> +    check_chrConvertRange(0);
> +    report("chrRangeToJpeg");
> +}
> --
> 2.30.2
>

I'll apply tomorrow if there are no comments.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
  2024-06-12 14:54       ` Ramiro Polla
@ 2024-06-14 15:46         ` Ramiro Polla
  0 siblings, 0 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-14 15:46 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, Jun 12, 2024 at 4:54 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> Hi,
>
> On Tue, Jun 11, 2024 at 8:42 PM James Almer <jamrial@gmail.com> wrote:
> >
> > On 6/11/2024 3:26 PM, Michael Niedermayer wrote:
> > > On Tue, Jun 11, 2024 at 02:28:56PM +0200, Ramiro Polla wrote:
> > >> chrRangeFromJpeg_8_c: 28.7
> > >> chrRangeFromJpeg_8_sse4: 16.2
> > >> chrRangeFromJpeg_24_c: 152.7
> > >> chrRangeFromJpeg_24_sse4: 29.7
> > >> chrRangeFromJpeg_128_c: 366.5
> > >> chrRangeFromJpeg_128_sse4: 233.0
> > >> chrRangeFromJpeg_144_c: 408.0
> > >> chrRangeFromJpeg_144_sse4: 182.5
> > >> chrRangeFromJpeg_256_c: 698.7
> > >> chrRangeFromJpeg_256_sse4: 325.5
> > >> chrRangeFromJpeg_512_c: 1348.7
> > >> chrRangeFromJpeg_512_sse4: 660.2
> > >> chrRangeToJpeg_8_c: 37.7
> > >> chrRangeToJpeg_8_sse4: 16.2
> > >> chrRangeToJpeg_24_c: 115.7
> > >> chrRangeToJpeg_24_sse4: 36.2
> > >> chrRangeToJpeg_128_c: 631.2
> > >> chrRangeToJpeg_128_sse4: 163.7
> > >> chrRangeToJpeg_144_c: 710.7
> > >> chrRangeToJpeg_144_sse4: 183.0
> > >> chrRangeToJpeg_256_c: 1253.0
> > >> chrRangeToJpeg_256_sse4: 343.5
> > >> chrRangeToJpeg_512_c: 2491.2
> > >> chrRangeToJpeg_512_sse4: 654.2
> > >> lumRangeFromJpeg_8_c: 11.7
> > >> lumRangeFromJpeg_8_sse4: 10.5
> > >> lumRangeFromJpeg_24_c: 38.5
> > >> lumRangeFromJpeg_24_sse4: 19.0
> > >> lumRangeFromJpeg_128_c: 237.5
> > >> lumRangeFromJpeg_128_sse4: 79.2
> > >> lumRangeFromJpeg_144_c: 255.7
> > >> lumRangeFromJpeg_144_sse4: 90.5
> > >> lumRangeFromJpeg_256_c: 441.5
> > >> lumRangeFromJpeg_256_sse4: 161.7
> > >> lumRangeFromJpeg_512_c: 879.0
> > >> lumRangeFromJpeg_512_sse4: 333.2
> > >> lumRangeToJpeg_8_c: 20.0
> > >> lumRangeToJpeg_8_sse4: 11.7
> > >> lumRangeToJpeg_24_c: 61.5
> > >> lumRangeToJpeg_24_sse4: 17.7
> > >> lumRangeToJpeg_128_c: 357.5
> > >> lumRangeToJpeg_128_sse4: 80.0
> > >> lumRangeToJpeg_144_c: 371.5
> > >> lumRangeToJpeg_144_sse4: 93.2
> > >> lumRangeToJpeg_256_c: 651.5
> > >> lumRangeToJpeg_256_sse4: 164.5
> > >> lumRangeToJpeg_512_c: 1279.0
> > >> lumRangeToJpeg_512_sse4: 333.7
> > >> ---
> > >>   libswscale/swscale_internal.h    |   1 +
> > >>   libswscale/utils.c               |   2 +
> > >>   libswscale/x86/Makefile          |   1 +
> > >>   libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
> > >>   libswscale/x86/swscale.c         |  36 +++++++++
> > >>   5 files changed, 170 insertions(+)
> > >>   create mode 100644 libswscale/x86/range_convert.asm
> > >
> > > breaks x86-32 build
> > >
> > > LD    ffmpeg_g
> > > /usr/lib/gcc-cross/i686-linux-gnu/7/../../../../i686-linux-gnu/bin/ld: libswscale/libswscale.a(utils.o): in function `sws_setColorspaceDetails':
> > > ffmpeg/linux32/src/libswscale/utils.c:1086: undefined reference to `ff_sws_init_range_convert_x86'
> > > collect2: error: ld returned 1 exit status
> > > make: *** [Makefile:139: ffmpeg_g] Error 1
> > >
> > > thx
> >
> > The functions are wrapped in ARCH_X86_64 checks for seemingly no reason,
> > so they should be removed in the next iteration.
>
> Fixed.
>
> James walked me through on IRC to optimize and improve the functions
> in a way that they work both with sse2 and avx2. New patch attached.

I'll apply tomorrow if there are no more comments.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
  2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon " Ramiro Polla
@ 2024-06-18 17:42   ` Ramiro Polla
  2024-06-18 21:15     ` Ramiro Polla
  0 siblings, 1 reply; 12+ messages in thread
From: Ramiro Polla @ 2024-06-18 17:42 UTC (permalink / raw)
  To: ffmpeg-devel

On Tue, Jun 11, 2024 at 2:29 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> chrRangeFromJpeg_8_c: 29.2
> chrRangeFromJpeg_8_neon: 19.5
> chrRangeFromJpeg_24_c: 80.5
> chrRangeFromJpeg_24_neon: 34.0
> chrRangeFromJpeg_128_c: 413.7
> chrRangeFromJpeg_128_neon: 156.0
> chrRangeFromJpeg_144_c: 471.0
> chrRangeFromJpeg_144_neon: 174.2
> chrRangeFromJpeg_256_c: 842.0
> chrRangeFromJpeg_256_neon: 305.5
> chrRangeFromJpeg_512_c: 1699.0
> chrRangeFromJpeg_512_neon: 608.0
> chrRangeToJpeg_8_c: 51.7
> chrRangeToJpeg_8_neon: 22.7
> chrRangeToJpeg_24_c: 149.7
> chrRangeToJpeg_24_neon: 38.0
> chrRangeToJpeg_128_c: 761.7
> chrRangeToJpeg_128_neon: 176.7
> chrRangeToJpeg_144_c: 866.2
> chrRangeToJpeg_144_neon: 198.7
> chrRangeToJpeg_256_c: 1516.5
> chrRangeToJpeg_256_neon: 348.7
> chrRangeToJpeg_512_c: 3067.2
> chrRangeToJpeg_512_neon: 692.7
> lumRangeFromJpeg_8_c: 24.0
> lumRangeFromJpeg_8_neon: 17.0
> lumRangeFromJpeg_24_c: 56.7
> lumRangeFromJpeg_24_neon: 21.0
> lumRangeFromJpeg_128_c: 294.5
> lumRangeFromJpeg_128_neon: 76.7
> lumRangeFromJpeg_144_c: 332.5
> lumRangeFromJpeg_144_neon: 86.7
> lumRangeFromJpeg_256_c: 586.0
> lumRangeFromJpeg_256_neon: 152.2
> lumRangeFromJpeg_512_c: 1190.0
> lumRangeFromJpeg_512_neon: 298.0
> lumRangeToJpeg_8_c: 31.7
> lumRangeToJpeg_8_neon: 19.5
> lumRangeToJpeg_24_c: 83.5
> lumRangeToJpeg_24_neon: 24.2
> lumRangeToJpeg_128_c: 440.5
> lumRangeToJpeg_128_neon: 91.0
> lumRangeToJpeg_144_c: 504.2
> lumRangeToJpeg_144_neon: 101.0
> lumRangeToJpeg_256_c: 879.7
> lumRangeToJpeg_256_neon: 177.2
> lumRangeToJpeg_512_c: 1794.2
> lumRangeToJpeg_512_neon: 354.0
> ---
>  libswscale/aarch64/Makefile             |  1 +
>  libswscale/aarch64/range_convert_neon.S | 99 +++++++++++++++++++++++++
>  libswscale/aarch64/swscale.c            | 21 ++++++
>  libswscale/swscale_internal.h           |  1 +
>  libswscale/utils.c                      |  4 +-
>  5 files changed, 125 insertions(+), 1 deletion(-)
>  create mode 100644 libswscale/aarch64/range_convert_neon.S
>
> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> index adfd90a1b6..37ad960619 100644
> --- a/libswscale/aarch64/Makefile
> +++ b/libswscale/aarch64/Makefile
> @@ -5,5 +5,6 @@ OBJS        += aarch64/rgb2rgb.o                \
>  NEON-OBJS   += aarch64/hscale.o                 \
>                 aarch64/input.o                  \
>                 aarch64/output.o                 \
> +               aarch64/range_convert_neon.o     \
>                 aarch64/rgb2rgb_neon.o           \
>                 aarch64/yuv2rgb_neon.o           \
> diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
> new file mode 100644
> index 0000000000..ea56dc2e32
> --- /dev/null
> +++ b/libswscale/aarch64/range_convert_neon.S
> @@ -0,0 +1,99 @@
> +/*
> + * Copyright (c) 2024 Ramiro Polla
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro lumConvertRange name, max, mult, offset, shift
> +function ff_\name, export=1
> +.if \max != 0
> +        mov             w3, #\max
> +        dup             v24.8h, w3
> +.endif
> +        mov             w3, #\mult
> +        dup             v25.4s, w3
> +        movz            w3, \offset & 0xffff
> +        movk            w3, (\offset >> 16) & 0xffff, lsl #16
> +        dup             v26.4s, w3
> +1:
> +        ld1             {v0.8h}, [x0]
> +.if \max != 0
> +        smin            v0.8h, v0.8h, v24.8h
> +.endif
> +        mov             v16.16b, v26.16b
> +        mov             v18.16b, v26.16b
> +        sxtl            v20.4s, v0.4h
> +        sxtl2           v22.4s, v0.8h
> +        mla             v16.4s, v20.4s, v25.4s
> +        mla             v18.4s, v22.4s, v25.4s
> +        shrn            v0.4h, v16.4s, #\shift
> +        shrn2           v0.8h, v18.4s, #\shift
> +        subs            w1, w1, #8
> +        st1             {v0.8h}, [x0], #16
> +        b.gt            1b
> +        ret
> +endfunc
> +.endm
> +
> +.macro chrConvertRange name, max, mult, offset, shift
> +function ff_\name, export=1
> +.if \max != 0
> +        mov             w3, #\max
> +        dup             v24.8h, w3
> +.endif
> +        mov             w3, #\mult
> +        dup             v25.4s, w3
> +        movz            w3, \offset & 0xffff
> +        movk            w3, (\offset >> 16) & 0xffff, lsl #16
> +        dup             v26.4s, w3
> +1:
> +        ld1             {v0.8h}, [x0]
> +        ld1             {v1.8h}, [x1]
> +.if \max != 0
> +        smin            v0.8h, v0.8h, v24.8h
> +        smin            v1.8h, v1.8h, v24.8h
> +.endif
> +        mov             v16.16b, v26.16b
> +        mov             v17.16b, v26.16b
> +        mov             v18.16b, v26.16b
> +        mov             v19.16b, v26.16b
> +        sxtl            v20.4s, v0.4h
> +        sxtl            v21.4s, v1.4h
> +        sxtl2           v22.4s, v0.8h
> +        sxtl2           v23.4s, v1.8h
> +        mla             v16.4s, v20.4s, v25.4s
> +        mla             v17.4s, v21.4s, v25.4s
> +        mla             v18.4s, v22.4s, v25.4s
> +        mla             v19.4s, v23.4s, v25.4s
> +        shrn            v0.4h, v16.4s, #\shift
> +        shrn            v1.4h, v17.4s, #\shift
> +        shrn2           v0.8h, v18.4s, #\shift
> +        shrn2           v1.8h, v19.4s, #\shift
> +        subs            w2, w2, #8
> +        st1             {v0.8h}, [x0], #16
> +        st1             {v1.8h}, [x1], #16
> +        b.gt            1b
> +        ret
> +endfunc
> +.endm
> +
> +lumConvertRange lumRangeToJpeg_neon,   30189, 19077, -39057361, 14
> +chrConvertRange chrRangeToJpeg_neon,   30775,  4663,  -9289992, 12
> +lumConvertRange lumRangeFromJpeg_neon,     0, 14071,  33561947, 14
> +chrConvertRange chrRangeFromJpeg_neon,     0,  1799,   4081085, 11
> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> index 4c4ea39dc1..e4ea3309ba 100644
> --- a/libswscale/aarch64/swscale.c
> +++ b/libswscale/aarch64/swscale.c
> @@ -215,6 +215,26 @@ void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unuse
>                         const uint8_t *src2, int width, uint32_t *rgb2yuv,
>                         void *opq);
>
> +void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
> +void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
> +void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
> +void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
> +
> +av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
> +{
> +    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> +        if (c->dstBpc <= 14) {
> +            if (c->srcRange) {
> +                c->lumConvertRange = ff_lumRangeFromJpeg_neon;
> +                c->chrConvertRange = ff_chrRangeFromJpeg_neon;
> +            } else {
> +                c->lumConvertRange = ff_lumRangeToJpeg_neon;
> +                c->chrConvertRange = ff_chrRangeToJpeg_neon;
> +            }
> +        }
> +    }
> +}
> +
>  av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>  {
>      int cpu_flags = av_get_cpu_flags();
> @@ -237,5 +257,6 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>          default:
>              break;
>          }
> +        ff_sws_init_range_convert_aarch64(c);
>      }
>  }
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index d5e7b5e71c..0818f50c7f 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
>  void ff_updateMMXDitherTables(SwsContext *c, int dstY);
>
>  av_cold void ff_sws_init_range_convert(SwsContext *c);
> +av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c);
>  av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
>  av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
>
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index 8dfa57b5ff..12dba712c1 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -1080,7 +1080,9 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
>
>      if (need_reinit) {
>          ff_sws_init_range_convert(c);
> -#if ARCH_LOONGARCH64
> +#if ARCH_AARCH64
> +        ff_sws_init_range_convert_aarch64(c);
> +#elif ARCH_LOONGARCH64
>          ff_sws_init_range_convert_loongarch(c);
>  #elif ARCH_X86
>          ff_sws_init_range_convert_x86(c);
> --
> 2.30.2
>

I finally tested this patch with movz/movk/dup instead of ld1 on apple
arm. I'll apply tomorrow if there are no more comments.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
  2024-06-18 17:42   ` Ramiro Polla
@ 2024-06-18 21:15     ` Ramiro Polla
  0 siblings, 0 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-18 21:15 UTC (permalink / raw)
  To: ffmpeg-devel

On Tue, Jun 18, 2024 at 7:42 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> On Tue, Jun 11, 2024 at 2:29 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
> >
> > chrRangeFromJpeg_8_c: 29.2
> > chrRangeFromJpeg_8_neon: 19.5
> > chrRangeFromJpeg_24_c: 80.5
> > chrRangeFromJpeg_24_neon: 34.0
> > chrRangeFromJpeg_128_c: 413.7
> > chrRangeFromJpeg_128_neon: 156.0
> > chrRangeFromJpeg_144_c: 471.0
> > chrRangeFromJpeg_144_neon: 174.2
> > chrRangeFromJpeg_256_c: 842.0
> > chrRangeFromJpeg_256_neon: 305.5
> > chrRangeFromJpeg_512_c: 1699.0
> > chrRangeFromJpeg_512_neon: 608.0
> > chrRangeToJpeg_8_c: 51.7
> > chrRangeToJpeg_8_neon: 22.7
> > chrRangeToJpeg_24_c: 149.7
> > chrRangeToJpeg_24_neon: 38.0
> > chrRangeToJpeg_128_c: 761.7
> > chrRangeToJpeg_128_neon: 176.7
> > chrRangeToJpeg_144_c: 866.2
> > chrRangeToJpeg_144_neon: 198.7
> > chrRangeToJpeg_256_c: 1516.5
> > chrRangeToJpeg_256_neon: 348.7
> > chrRangeToJpeg_512_c: 3067.2
> > chrRangeToJpeg_512_neon: 692.7
> > lumRangeFromJpeg_8_c: 24.0
> > lumRangeFromJpeg_8_neon: 17.0
> > lumRangeFromJpeg_24_c: 56.7
> > lumRangeFromJpeg_24_neon: 21.0
> > lumRangeFromJpeg_128_c: 294.5
> > lumRangeFromJpeg_128_neon: 76.7
> > lumRangeFromJpeg_144_c: 332.5
> > lumRangeFromJpeg_144_neon: 86.7
> > lumRangeFromJpeg_256_c: 586.0
> > lumRangeFromJpeg_256_neon: 152.2
> > lumRangeFromJpeg_512_c: 1190.0
> > lumRangeFromJpeg_512_neon: 298.0
> > lumRangeToJpeg_8_c: 31.7
> > lumRangeToJpeg_8_neon: 19.5
> > lumRangeToJpeg_24_c: 83.5
> > lumRangeToJpeg_24_neon: 24.2
> > lumRangeToJpeg_128_c: 440.5
> > lumRangeToJpeg_128_neon: 91.0
> > lumRangeToJpeg_144_c: 504.2
> > lumRangeToJpeg_144_neon: 101.0
> > lumRangeToJpeg_256_c: 879.7
> > lumRangeToJpeg_256_neon: 177.2
> > lumRangeToJpeg_512_c: 1794.2
> > lumRangeToJpeg_512_neon: 354.0
> > ---
> >  libswscale/aarch64/Makefile             |  1 +
> >  libswscale/aarch64/range_convert_neon.S | 99 +++++++++++++++++++++++++
> >  libswscale/aarch64/swscale.c            | 21 ++++++
> >  libswscale/swscale_internal.h           |  1 +
> >  libswscale/utils.c                      |  4 +-
> >  5 files changed, 125 insertions(+), 1 deletion(-)
> >  create mode 100644 libswscale/aarch64/range_convert_neon.S
> >
> > diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> > index adfd90a1b6..37ad960619 100644
> > --- a/libswscale/aarch64/Makefile
> > +++ b/libswscale/aarch64/Makefile
> > @@ -5,5 +5,6 @@ OBJS        += aarch64/rgb2rgb.o                \
> >  NEON-OBJS   += aarch64/hscale.o                 \
> >                 aarch64/input.o                  \
> >                 aarch64/output.o                 \
> > +               aarch64/range_convert_neon.o     \
> >                 aarch64/rgb2rgb_neon.o           \
> >                 aarch64/yuv2rgb_neon.o           \
> > diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
> > new file mode 100644
> > index 0000000000..ea56dc2e32
> > --- /dev/null
> > +++ b/libswscale/aarch64/range_convert_neon.S
> > @@ -0,0 +1,99 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +.macro lumConvertRange name, max, mult, offset, shift
> > +function ff_\name, export=1
> > +.if \max != 0
> > +        mov             w3, #\max
> > +        dup             v24.8h, w3
> > +.endif
> > +        mov             w3, #\mult
> > +        dup             v25.4s, w3
> > +        movz            w3, \offset & 0xffff
> > +        movk            w3, (\offset >> 16) & 0xffff, lsl #16
> > +        dup             v26.4s, w3
> > +1:
> > +        ld1             {v0.8h}, [x0]
> > +.if \max != 0
> > +        smin            v0.8h, v0.8h, v24.8h
> > +.endif
> > +        mov             v16.16b, v26.16b
> > +        mov             v18.16b, v26.16b
> > +        sxtl            v20.4s, v0.4h
> > +        sxtl2           v22.4s, v0.8h
> > +        mla             v16.4s, v20.4s, v25.4s
> > +        mla             v18.4s, v22.4s, v25.4s
> > +        shrn            v0.4h, v16.4s, #\shift
> > +        shrn2           v0.8h, v18.4s, #\shift
> > +        subs            w1, w1, #8
> > +        st1             {v0.8h}, [x0], #16
> > +        b.gt            1b
> > +        ret
> > +endfunc
> > +.endm
> > +
> > +.macro chrConvertRange name, max, mult, offset, shift
> > +function ff_\name, export=1
> > +.if \max != 0
> > +        mov             w3, #\max
> > +        dup             v24.8h, w3
> > +.endif
> > +        mov             w3, #\mult
> > +        dup             v25.4s, w3
> > +        movz            w3, \offset & 0xffff
> > +        movk            w3, (\offset >> 16) & 0xffff, lsl #16
> > +        dup             v26.4s, w3
> > +1:
> > +        ld1             {v0.8h}, [x0]
> > +        ld1             {v1.8h}, [x1]
> > +.if \max != 0
> > +        smin            v0.8h, v0.8h, v24.8h
> > +        smin            v1.8h, v1.8h, v24.8h
> > +.endif
> > +        mov             v16.16b, v26.16b
> > +        mov             v17.16b, v26.16b
> > +        mov             v18.16b, v26.16b
> > +        mov             v19.16b, v26.16b
> > +        sxtl            v20.4s, v0.4h
> > +        sxtl            v21.4s, v1.4h
> > +        sxtl2           v22.4s, v0.8h
> > +        sxtl2           v23.4s, v1.8h
> > +        mla             v16.4s, v20.4s, v25.4s
> > +        mla             v17.4s, v21.4s, v25.4s
> > +        mla             v18.4s, v22.4s, v25.4s
> > +        mla             v19.4s, v23.4s, v25.4s
> > +        shrn            v0.4h, v16.4s, #\shift
> > +        shrn            v1.4h, v17.4s, #\shift
> > +        shrn2           v0.8h, v18.4s, #\shift
> > +        shrn2           v1.8h, v19.4s, #\shift
> > +        subs            w2, w2, #8
> > +        st1             {v0.8h}, [x0], #16
> > +        st1             {v1.8h}, [x1], #16
> > +        b.gt            1b
> > +        ret
> > +endfunc
> > +.endm
> > +
> > +lumConvertRange lumRangeToJpeg_neon,   30189, 19077, -39057361, 14
> > +chrConvertRange chrRangeToJpeg_neon,   30775,  4663,  -9289992, 12
> > +lumConvertRange lumRangeFromJpeg_neon,     0, 14071,  33561947, 14
> > +chrConvertRange chrRangeFromJpeg_neon,     0,  1799,   4081085, 11
> > diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> > index 4c4ea39dc1..e4ea3309ba 100644
> > --- a/libswscale/aarch64/swscale.c
> > +++ b/libswscale/aarch64/swscale.c
> > @@ -215,6 +215,26 @@ void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unuse
> >                         const uint8_t *src2, int width, uint32_t *rgb2yuv,
> >                         void *opq);
> >
> > +void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
> > +void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
> > +void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
> > +void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
> > +
> > +av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
> > +{
> > +    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> > +        if (c->dstBpc <= 14) {
> > +            if (c->srcRange) {
> > +                c->lumConvertRange = ff_lumRangeFromJpeg_neon;
> > +                c->chrConvertRange = ff_chrRangeFromJpeg_neon;
> > +            } else {
> > +                c->lumConvertRange = ff_lumRangeToJpeg_neon;
> > +                c->chrConvertRange = ff_chrRangeToJpeg_neon;
> > +            }
> > +        }
> > +    }
> > +}
> > +
> >  av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> >  {
> >      int cpu_flags = av_get_cpu_flags();
> > @@ -237,5 +257,6 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> >          default:
> >              break;
> >          }
> > +        ff_sws_init_range_convert_aarch64(c);
> >      }
> >  }
> > diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> > index d5e7b5e71c..0818f50c7f 100644
> > --- a/libswscale/swscale_internal.h
> > +++ b/libswscale/swscale_internal.h
> > @@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
> >  void ff_updateMMXDitherTables(SwsContext *c, int dstY);
> >
> >  av_cold void ff_sws_init_range_convert(SwsContext *c);
> > +av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c);
> >  av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
> >  av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
> >
> > diff --git a/libswscale/utils.c b/libswscale/utils.c
> > index 8dfa57b5ff..12dba712c1 100644
> > --- a/libswscale/utils.c
> > +++ b/libswscale/utils.c
> > @@ -1080,7 +1080,9 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
> >
> >      if (need_reinit) {
> >          ff_sws_init_range_convert(c);
> > -#if ARCH_LOONGARCH64
> > +#if ARCH_AARCH64
> > +        ff_sws_init_range_convert_aarch64(c);
> > +#elif ARCH_LOONGARCH64
> >          ff_sws_init_range_convert_loongarch(c);
> >  #elif ARCH_X86
> >          ff_sws_init_range_convert_x86(c);
> > --
> > 2.30.2
> >
>
> I finally tested this patch with movz/movk/dup instead of ld1 on apple
> arm. I'll apply tomorrow if there are no more comments.

Reviewed by Martin on IRC and pushed.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-06-18 21:15 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
2024-06-11 12:32   ` James Almer
2024-06-11 18:26   ` Michael Niedermayer
2024-06-11 18:43     ` James Almer
2024-06-12 14:54       ` Ramiro Polla
2024-06-14 15:46         ` Ramiro Polla
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 3/4] swscale/x86: add avx2 " Ramiro Polla
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon " Ramiro Polla
2024-06-18 17:42   ` Ramiro Polla
2024-06-18 21:15     ` Ramiro Polla
2024-06-14 15:45 ` [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for " Ramiro Polla

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git