[FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile
@ 2024-06-07 14:05 Ramiro Polla
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 2/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Ramiro Polla @ 2024-06-07 14:05 UTC (permalink / raw)
  To: ffmpeg-devel

---
 tests/checkasm/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 6eb94d10d5..3ce152e818 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -63,7 +63,9 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
 # swscale tests
-SWSCALEOBJS                             += sw_gbrp.o sw_rgb.o sw_scale.o
+SWSCALEOBJS                             += sw_gbrp.o
+SWSCALEOBJS                             += sw_rgb.o
+SWSCALEOBJS                             += sw_scale.o
 
 CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
 
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH 2/4] checkasm: add tests for {lum, chr}ConvertRange
  2024-06-07 14:05 [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Ramiro Polla
@ 2024-06-07 14:05 ` Ramiro Polla
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 3/4] swscale/x86: add sse4 " Ramiro Polla
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 11+ messages in thread
From: Ramiro Polla @ 2024-06-07 14:05 UTC (permalink / raw)
  To: ffmpeg-devel

---
 tests/checkasm/Makefile           |   1 +
 tests/checkasm/checkasm.c         |   1 +
 tests/checkasm/checkasm.h         |   1 +
 tests/checkasm/sw_range_convert.c | 134 ++++++++++++++++++++++++++++++
 4 files changed, 137 insertions(+)
 create mode 100644 tests/checkasm/sw_range_convert.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3ce152e818..e4ec6a27ec 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -64,6 +64,7 @@ CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
 # swscale tests
 SWSCALEOBJS                             += sw_gbrp.o
+SWSCALEOBJS                             += sw_range_convert.o
 SWSCALEOBJS                             += sw_rgb.o
 SWSCALEOBJS                             += sw_scale.o
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index d7aa2a9c09..d2b50c023a 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -248,6 +248,7 @@ static const struct {
 #endif
 #if CONFIG_SWSCALE
     { "sw_gbrp", checkasm_check_sw_gbrp },
+    { "sw_range_convert", checkasm_check_sw_range_convert },
     { "sw_rgb", checkasm_check_sw_rgb },
     { "sw_scale", checkasm_check_sw_scale },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 211d7f52e6..e544007b67 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -119,6 +119,7 @@ void checkasm_check_rv40dsp(void);
 void checkasm_check_svq1enc(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_gbrp(void);
+void checkasm_check_sw_range_convert(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_sw_scale(void);
 void checkasm_check_takdsp(void);
diff --git a/tests/checkasm/sw_range_convert.c b/tests/checkasm/sw_range_convert.c
new file mode 100644
index 0000000000..6d7e22ad40
--- /dev/null
+++ b/tests/checkasm/sw_range_convert.c
@@ -0,0 +1,134 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#include "checkasm.h"
+
+static void check_lumConvertRange(int from)
+{
+    const char *func_str = from ? "lumRangeFromJpeg" : "lumRangeToJpeg";
+#define LARGEST_INPUT_SIZE 512
+#define INPUT_SIZES 6
+    static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
+    struct SwsContext *ctx;
+    /* Checks ctx->lumConvertRange in place: dst0 via call_ref, dst1 via call_new. */
+    LOCAL_ALIGNED_32(int16_t, dst0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, dst1, [LARGEST_INPUT_SIZE]);
+
+    declare_func(void, int16_t *dst, int width);
+
+    ctx = sws_alloc_context();
+    if (sws_init_context(ctx, NULL, NULL) < 0)
+        fail();
+
+    ctx->srcFormat = from ? AV_PIX_FMT_YUVJ444P : AV_PIX_FMT_YUV444P;
+    ctx->dstFormat = from ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+    ctx->srcRange = from;  /* from != 0: YUVJ444P source with srcRange set */
+    ctx->dstRange = !from; /* destination range is always the opposite one */
+
+    for (int dstWi = 0; dstWi < INPUT_SIZES; dstWi++) {
+        int width = input_sizes[dstWi];
+        for (int i = 0; i < width; i++) {
+            uint8_t r = rnd();           /* keep 8 random bits per sample */
+            dst0[i] = (int16_t) r << 7;  /* same input in both buffers, 0..32640 */
+            dst1[i] = (int16_t) r << 7;
+        }
+        ff_sws_init_scale(ctx); /* presumably (re)selects lumConvertRange - confirm */
+        if (check_func(ctx->lumConvertRange, "%s_%d", func_str, width)) {
+            call_ref(dst0, width);
+            call_new(dst1, width);
+            if (memcmp(dst0, dst1, width * sizeof(int16_t))) /* first 'width' only */
+                fail();
+            bench_new(dst1, width);
+        }
+    }
+
+    sws_freeContext(ctx);
+}
+#undef LARGEST_INPUT_SIZE
+#undef INPUT_SIZES
+
+/* Check ctx->chrConvertRange against its reference, in place, on U and V buffers. */
+static void check_chrConvertRange(int from)
+{
+    const char *func_str = from ? "chrRangeFromJpeg" : "chrRangeToJpeg";
+#define LARGEST_INPUT_SIZE 512
+#define INPUT_SIZES 6
+    static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
+    struct SwsContext *ctx;
+
+    LOCAL_ALIGNED_32(int16_t, dstU0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, dstV0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, dstU1, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, dstV1, [LARGEST_INPUT_SIZE]);
+
+    declare_func(void, int16_t *dstU, int16_t *dstV, int width);
+
+    ctx = sws_alloc_context();
+    if (sws_init_context(ctx, NULL, NULL) < 0)
+        fail();
+
+    ctx->srcFormat = from ? AV_PIX_FMT_YUVJ444P : AV_PIX_FMT_YUV444P;
+    ctx->dstFormat = from ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
+    ctx->srcRange = from;
+    ctx->dstRange = !from;
+
+    for (int dstWi = 0; dstWi < INPUT_SIZES; dstWi++) {
+        int width = input_sizes[dstWi];
+        for (int i = 0; i < width; i++) {
+            /* distinct U and V inputs, so mixing up the two planes is detected */
+            uint8_t u = rnd(), v = rnd();
+            dstU0[i] = dstU1[i] = (int16_t) u << 7;
+            dstV0[i] = dstV1[i] = (int16_t) v << 7;
+        }
+        ff_sws_init_scale(ctx);
+        if (check_func(ctx->chrConvertRange, "%s_%d", func_str, width)) {
+            call_ref(dstU0, dstV0, width);
+            call_new(dstU1, dstV1, width);
+            if (memcmp(dstU0, dstU1, width * sizeof(int16_t))
+             || memcmp(dstV0, dstV1, width * sizeof(int16_t)))
+                fail();
+            bench_new(dstU1, dstV1, width);
+        }
+    }
+
+    sws_freeContext(ctx);
+}
+#undef LARGEST_INPUT_SIZE
+#undef INPUT_SIZES
+/* checkasm entry point: lum then chr check for each direction, one report() each. */
+void checkasm_check_sw_range_convert(void)
+{
+    check_lumConvertRange(1);   /* from = 1: the *FromJpeg variants */
+    report("lumRangeFromJpeg");
+    check_chrConvertRange(1);
+    report("chrRangeFromJpeg");
+    check_lumConvertRange(0);   /* from = 0: the *ToJpeg variants */
+    report("lumRangeToJpeg");
+    check_chrConvertRange(0);
+    report("chrRangeToJpeg");
+}
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH 3/4] swscale/x86: add sse4 {lum, chr}ConvertRange
  2024-06-07 14:05 [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Ramiro Polla
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 2/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
@ 2024-06-07 14:05 ` Ramiro Polla
  2024-06-07 17:38   ` Ramiro Polla
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add neon " Ramiro Polla
  2024-06-07 18:45 ` [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Andreas Rheinhardt
  3 siblings, 1 reply; 11+ messages in thread
From: Ramiro Polla @ 2024-06-07 14:05 UTC (permalink / raw)
  To: ffmpeg-devel

chrRangeFromJpeg_8_c: 19.9
chrRangeFromJpeg_8_sse4: 16.2
chrRangeFromJpeg_24_c: 60.7
chrRangeFromJpeg_24_sse4: 28.9
chrRangeFromJpeg_128_c: 325.7
chrRangeFromJpeg_128_sse4: 160.2
chrRangeFromJpeg_144_c: 364.2
chrRangeFromJpeg_144_sse4: 194.9
chrRangeFromJpeg_256_c: 630.7
chrRangeFromJpeg_256_sse4: 337.4
chrRangeFromJpeg_512_c: 1240.4
chrRangeFromJpeg_512_sse4: 668.4
chrRangeToJpeg_8_c: 37.7
chrRangeToJpeg_8_sse4: 19.7
chrRangeToJpeg_24_c: 114.7
chrRangeToJpeg_24_sse4: 30.2
chrRangeToJpeg_128_c: 636.4
chrRangeToJpeg_128_sse4: 161.7
chrRangeToJpeg_144_c: 715.7
chrRangeToJpeg_144_sse4: 272.9
chrRangeToJpeg_256_c: 1256.7
chrRangeToJpeg_256_sse4: 341.9
chrRangeToJpeg_512_c: 2498.7
chrRangeToJpeg_512_sse4: 668.4
lumRangeFromJpeg_8_c: 11.7
lumRangeFromJpeg_8_sse4: 12.4
lumRangeFromJpeg_24_c: 36.9
lumRangeFromJpeg_24_sse4: 17.7
lumRangeFromJpeg_128_c: 228.4
lumRangeFromJpeg_128_sse4: 85.2
lumRangeFromJpeg_144_c: 272.9
lumRangeFromJpeg_144_sse4: 96.9
lumRangeFromJpeg_256_c: 463.4
lumRangeFromJpeg_256_sse4: 183.9
lumRangeFromJpeg_512_c: 879.9
lumRangeFromJpeg_512_sse4: 355.9
lumRangeToJpeg_8_c: 17.7
lumRangeToJpeg_8_sse4: 15.4
lumRangeToJpeg_24_c: 56.2
lumRangeToJpeg_24_sse4: 18.4
lumRangeToJpeg_128_c: 331.4
lumRangeToJpeg_128_sse4: 84.4
lumRangeToJpeg_144_c: 375.2
lumRangeToJpeg_144_sse4: 96.9
lumRangeToJpeg_256_c: 649.7
lumRangeToJpeg_256_sse4: 184.4
lumRangeToJpeg_512_c: 1281.9
lumRangeToJpeg_512_sse4: 355.9
---
 libswscale/swscale_internal.h    |   1 +
 libswscale/utils.c               |   2 +
 libswscale/x86/Makefile          |   1 +
 libswscale/x86/range_convert.asm | 100 +++++++++++++++++++++++++++++++
 libswscale/x86/swscale.c         |  36 +++++++++++
 5 files changed, 140 insertions(+)
 create mode 100644 libswscale/x86/range_convert.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index d4b0c3cee2..92f6105443 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
 av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 476a24fea5..8dfa57b5ff 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
         ff_sws_init_range_convert(c);
 #if ARCH_LOONGARCH64
         ff_sws_init_range_convert_loongarch(c);
+#elif ARCH_X86
+        ff_sws_init_range_convert_x86(c);
 #endif
     }
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 68391494be..f00154941d 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -12,6 +12,7 @@ X86ASM-OBJS                     += x86/input.o                          \
                                    x86/output.o                         \
                                    x86/scale.o                          \
                                    x86/scale_avx2.o                          \
+                                   x86/range_convert.o                  \
                                    x86/rgb_2_rgb.o                      \
                                    x86/yuv_2_rgb.o                      \
                                    x86/yuv2yuvX.o                       \
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
new file mode 100644
index 0000000000..333265fb65
--- /dev/null
+++ b/libswscale/x86/range_convert.asm
@@ -0,0 +1,100 @@
+;******************************************************************************
+;* Copyright (c) 2024 Ramiro Polla
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+; NOTE: there is no need to clamp the input when converting to jpeg range
+;       (like we do in the C code) because packssdw will saturate the output.
+
+;-----------------------------------------------------------------------------
+; lumConvertRange
+;
+; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
+; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro LUMCONVERTRANGE 4 ; %1 = function name, %2 = multiplier, %3 = offset, %4 = shift
+SECTION_RODATA
+mult_%1:        times 4 dd %2            ; multiplier replicated into 4 dwords
+offset_%1:      times 4 dd %3            ; offset replicated into 4 dwords
+SECTION .text
+cglobal %1, 2, 3, 3, dst, width, x
+    movsxdifnidn widthq, widthd
+    xor              xq, xq               ; x = 0 (sample index into dst)
+    mova             m1, [mult_%1]
+    mova             m2, [offset_%1]
+.loop:
+    pmovsxwd         m0, [dstq+xq*2]      ; load int16 samples, sign-extended to dwords
+    pmulld           m0, m1               ; sample * mult
+    paddd            m0, m2               ; + offset
+    psrad            m0, %4               ; arithmetic shift right by %4
+    packssdw         m0, m0               ; back to int16 with signed saturation
+    movh    [dstq+xq*2], m0               ; store the converted samples in place
+    add              xq, mmsize / 4       ; one dword lane per sample
+    cmp              xd, widthd
+    jl .loop                              ; test is at the bottom: at least one pass
+    RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; chrConvertRange
+;
+; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro CHRCONVERTRANGE 4 ; %1 = function name, %2 = multiplier, %3 = offset, %4 = shift
+SECTION_RODATA
+mult_%1:        times 4 dd %2            ; multiplier replicated into 4 dwords
+offset_%1:      times 4 dd %3            ; offset replicated into 4 dwords
+SECTION .text
+cglobal %1, 3, 4, 4, dstU, dstV, width, x ; NOTE(review): only m0-m2 are referenced
+    movsxdifnidn widthq, widthd
+    xor              xq, xq               ; x = 0 (sample index into both planes)
+    mova             m1, [mult_%1]
+    mova             m2, [offset_%1]
+.loop:
+    pmovsxwd         m0, [dstUq+xq*2]     ; U: load int16 samples as dwords
+    pmulld           m0, m1               ; sample * mult
+    paddd            m0, m2               ; + offset
+    psrad            m0, %4               ; arithmetic shift right by %4
+    packssdw         m0, m0               ; back to int16 with signed saturation
+    movh   [dstUq+xq*2], m0               ; store U in place
+    pmovsxwd         m0, [dstVq+xq*2]     ; V: same steps with the same constants
+    pmulld           m0, m1
+    paddd            m0, m2
+    psrad            m0, %4
+    packssdw         m0, m0
+    movh   [dstVq+xq*2], m0               ; store V in place
+    add              xq, mmsize / 4       ; one dword lane per sample
+    cmp              xd, widthd
+    jl .loop                              ; test is at the bottom: at least one pass
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+LUMCONVERTRANGE lumRangeToJpeg,   19077, -39057361, 14
+CHRCONVERTRANGE chrRangeToJpeg,    4663,  -9289992, 12
+LUMCONVERTRANGE lumRangeFromJpeg, 14071,  33561947, 14
+CHRCONVERTRANGE chrRangeFromJpeg,  1799,   4081085, 11
+%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index fff8bb4396..c5ddfb5605 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -447,6 +447,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
 INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
 #endif
 
+#if ARCH_X86_64
+#define RANGE_CONVERT_FUNCS(opt) do {                                       \
+    if (c->dstBpc <= 14) {                                                  \
+        if (c->srcRange) {                                                  \
+            c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt;                \
+            c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt;                \
+        } else {                                                            \
+            c->lumConvertRange = ff_lumRangeToJpeg_ ##opt;                  \
+            c->chrConvertRange = ff_chrRangeToJpeg_ ##opt;                  \
+        }                                                                   \
+    }                                                                       \
+} while (0)
+
+#define RANGE_CONVERT_FUNCS_DECL(opt)                                       \
+void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width);                   \
+void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);   \
+void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width);                     \
+void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width)
+
+RANGE_CONVERT_FUNCS_DECL(sse4);
+#endif
+
+/* Always defined: utils.c calls this under plain ARCH_X86, the asm is 64-bit only. */
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
+{
+#if ARCH_X86_64
+    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat) &&
+        EXTERNAL_SSE4(av_get_cpu_flags()))
+        RANGE_CONVERT_FUNCS(sse4);
+#endif
+}
+
 av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -805,4 +837,8 @@ switch(c->dstBpc){ \
     }
 
 #endif
+
+#if ARCH_X86_64
+    ff_sws_init_range_convert_x86(c);
+#endif
 }
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
  2024-06-07 14:05 [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Ramiro Polla
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 2/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 3/4] swscale/x86: add sse4 " Ramiro Polla
@ 2024-06-07 14:05 ` Ramiro Polla
  2024-06-10 11:56   ` Martin Storsjö
  2024-06-07 18:45 ` [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Andreas Rheinhardt
  3 siblings, 1 reply; 11+ messages in thread
From: Ramiro Polla @ 2024-06-07 14:05 UTC (permalink / raw)
  To: ffmpeg-devel

chrRangeFromJpeg_8_c: 28.5
chrRangeFromJpeg_8_neon: 21.2
chrRangeFromJpeg_24_c: 81.2
chrRangeFromJpeg_24_neon: 34.7
chrRangeFromJpeg_128_c: 425.2
chrRangeFromJpeg_128_neon: 162.0
chrRangeFromJpeg_144_c: 480.2
chrRangeFromJpeg_144_neon: 180.2
chrRangeFromJpeg_256_c: 838.2
chrRangeFromJpeg_256_neon: 318.0
chrRangeFromJpeg_512_c: 1698.2
chrRangeFromJpeg_512_neon: 630.0
chrRangeToJpeg_8_c: 56.0
chrRangeToJpeg_8_neon: 23.5
chrRangeToJpeg_24_c: 147.7
chrRangeToJpeg_24_neon: 38.2
chrRangeToJpeg_128_c: 760.2
chrRangeToJpeg_128_neon: 182.5
chrRangeToJpeg_144_c: 857.7
chrRangeToJpeg_144_neon: 204.5
chrRangeToJpeg_256_c: 1504.2
chrRangeToJpeg_256_neon: 358.5
chrRangeToJpeg_512_c: 3025.7
chrRangeToJpeg_512_neon: 710.5
lumRangeFromJpeg_8_c: 24.0
lumRangeFromJpeg_8_neon: 18.2
lumRangeFromJpeg_24_c: 64.0
lumRangeFromJpeg_24_neon: 22.2
lumRangeFromJpeg_128_c: 289.2
lumRangeFromJpeg_128_neon: 79.2
lumRangeFromJpeg_144_c: 334.7
lumRangeFromJpeg_144_neon: 87.7
lumRangeFromJpeg_256_c: 579.5
lumRangeFromJpeg_256_neon: 152.0
lumRangeFromJpeg_512_c: 1208.0
lumRangeFromJpeg_512_neon: 299.0
lumRangeToJpeg_8_c: 30.0
lumRangeToJpeg_8_neon: 19.0
lumRangeToJpeg_24_c: 82.2
lumRangeToJpeg_24_neon: 24.0
lumRangeToJpeg_128_c: 440.7
lumRangeToJpeg_128_neon: 90.5
lumRangeToJpeg_144_c: 502.0
lumRangeToJpeg_144_neon: 102.2
lumRangeToJpeg_256_c: 893.7
lumRangeToJpeg_256_neon: 178.0
lumRangeToJpeg_512_c: 1793.7
lumRangeToJpeg_512_neon: 355.0
---
 libswscale/aarch64/Makefile             |   1 +
 libswscale/aarch64/range_convert_neon.S | 103 ++++++++++++++++++++++++
 libswscale/aarch64/swscale.c            |  21 +++++
 libswscale/swscale_internal.h           |   1 +
 libswscale/utils.c                      |   4 +-
 5 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/aarch64/range_convert_neon.S

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index da1d909561..6923827f82 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -4,5 +4,6 @@ OBJS        += aarch64/rgb2rgb.o                \
 
 NEON-OBJS   += aarch64/hscale.o                 \
                aarch64/output.o                 \
+               aarch64/range_convert_neon.o     \
                aarch64/rgb2rgb_neon.o           \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
new file mode 100644
index 0000000000..5e104971f0
--- /dev/null
+++ b/libswscale/aarch64/range_convert_neon.S
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro lumConvertRange name max mult offset shift
+const offset_\name, align=4
+        .word \offset, \offset, \offset, \offset
+endconst
+function ff_\name, export=1                     // in place: x0 = dst, w1 = width
+.if \max != 0                                   // max == 0: no input clamp emitted
+        mov             w3, #\max
+        dup             v24.8h, w3              // v24 = max in every 16-bit lane
+.endif
+        mov             w3, #\mult
+        dup             v25.4s, w3              // v25 = mult in every 32-bit lane
+        movrel          x3, offset_\name
+        ld1             {v26.4s}, [x3]          // v26 = offset in every 32-bit lane
+1:
+        ld1             {v0.8h}, [x0]           // load 8 int16 samples
+.if \max != 0
+        smin            v0.8h, v0.8h, v24.8h    // clamp the input to max
+.endif
+        mov             v16.16b, v26.16b        // accumulators start at offset
+        mov             v18.16b, v26.16b
+        sxtl            v20.4s, v0.4h           // widen the low 4 samples to 32 bits
+        sxtl2           v22.4s, v0.8h           // widen the high 4 samples to 32 bits
+        mla             v16.4s, v20.4s, v25.4s  // offset + sample * mult
+        mla             v18.4s, v22.4s, v25.4s
+        shrn            v0.4h, v16.4s, #\shift  // >> shift, narrowed without saturation
+        shrn2           v0.8h, v18.4s, #\shift
+        subs            w1, w1, #8              // 8 samples consumed per pass
+        st1             {v0.8h}, [x0], #16      // store in place, advance dst 16 bytes
+        b.gt            1b                      // loop while the remaining width is > 0
+        ret
+endfunc
+.endm
+
+.macro chrConvertRange name max mult offset shift
+const offset_\name, align=4
+        .word \offset, \offset, \offset, \offset
+endconst
+function ff_\name, export=1                     // in place: x0 = dstU, x1 = dstV, w2 = width
+.if \max != 0                                   // max == 0: no input clamp emitted
+        mov             w3, #\max
+        dup             v24.8h, w3              // v24 = max in every 16-bit lane
+.endif
+        mov             w3, #\mult
+        dup             v25.4s, w3              // v25 = mult in every 32-bit lane
+        movrel          x3, offset_\name
+        ld1             {v26.4s}, [x3]          // v26 = offset in every 32-bit lane
+1:
+        ld1             {v0.8h}, [x0]           // load 8 U samples
+        ld1             {v1.8h}, [x1]           // load 8 V samples
+.if \max != 0
+        smin            v0.8h, v0.8h, v24.8h    // clamp both inputs to max
+        smin            v1.8h, v1.8h, v24.8h
+.endif
+        mov             v16.16b, v26.16b        // four accumulators start at offset
+        mov             v17.16b, v26.16b
+        mov             v18.16b, v26.16b
+        mov             v19.16b, v26.16b
+        sxtl            v20.4s, v0.4h           // widen low halves (U, V) to 32 bits
+        sxtl            v21.4s, v1.4h
+        sxtl2           v22.4s, v0.8h           // widen high halves (U, V) to 32 bits
+        sxtl2           v23.4s, v1.8h
+        mla             v16.4s, v20.4s, v25.4s  // offset + sample * mult
+        mla             v17.4s, v21.4s, v25.4s
+        mla             v18.4s, v22.4s, v25.4s
+        mla             v19.4s, v23.4s, v25.4s
+        shrn            v0.4h, v16.4s, #\shift  // >> shift, narrowed without saturation
+        shrn            v1.4h, v17.4s, #\shift
+        shrn2           v0.8h, v18.4s, #\shift
+        shrn2           v1.8h, v19.4s, #\shift
+        subs            w2, w2, #8              // 8 samples per plane consumed per pass
+        st1             {v0.8h}, [x0], #16      // store U in place, advance 16 bytes
+        st1             {v1.8h}, [x1], #16      // store V in place, advance 16 bytes
+        b.gt            1b                      // loop while the remaining width is > 0
+        ret
+endfunc
+.endm
+
+lumConvertRange lumRangeToJpeg_neon,   30189, 19077, -39057361, 14
+chrConvertRange chrRangeToJpeg_neon,   30775,  4663,  -9289992, 12
+lumConvertRange lumRangeFromJpeg_neon,     0, 14071,  33561947, 14
+chrConvertRange chrRangeFromJpeg_neon,     0,  1799,   4081085, 11
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index bbd9719a44..7344f75b2e 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -201,6 +201,26 @@ void ff_yuv2plane1_8_neon(
     default: break;                                                     \
     }
 
+void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
+void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
+void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+
+av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags(); /* also called from utils.c: check here */
+    if (!have_neon(cpu_flags) || c->dstBpc > 14 || /* leave the pointers untouched */
+        c->srcRange == c->dstRange || isAnyRGB(c->dstFormat))
+        return;
+    if (c->srcRange) {
+        c->lumConvertRange = ff_lumRangeFromJpeg_neon;
+        c->chrConvertRange = ff_chrRangeFromJpeg_neon;
+    } else {
+        c->lumConvertRange = ff_lumRangeToJpeg_neon;
+        c->chrConvertRange = ff_chrRangeToJpeg_neon;
+    }
+}
+
 av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -212,5 +232,6 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
         }
+        ff_sws_init_range_convert_aarch64(c);
     }
 }
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 92f6105443..1059f8a6de 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
 void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
+av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c);
 av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
 av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
 
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 8dfa57b5ff..12dba712c1 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1080,7 +1080,9 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
 
     if (need_reinit) {
         ff_sws_init_range_convert(c);
-#if ARCH_LOONGARCH64
+#if ARCH_AARCH64
+        ff_sws_init_range_convert_aarch64(c);
+#elif ARCH_LOONGARCH64
         ff_sws_init_range_convert_loongarch(c);
 #elif ARCH_X86
         ff_sws_init_range_convert_x86(c);
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 3/4] swscale/x86: add sse4 {lum, chr}ConvertRange
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 3/4] swscale/x86: add sse4 " Ramiro Polla
@ 2024-06-07 17:38   ` Ramiro Polla
  0 siblings, 0 replies; 11+ messages in thread
From: Ramiro Polla @ 2024-06-07 17:38 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 10195 bytes --]

On Fri, Jun 7, 2024 at 4:05 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> chrRangeFromJpeg_8_c: 19.9
> chrRangeFromJpeg_8_sse4: 16.2
> chrRangeFromJpeg_24_c: 60.7
> chrRangeFromJpeg_24_sse4: 28.9
> chrRangeFromJpeg_128_c: 325.7
> chrRangeFromJpeg_128_sse4: 160.2
> chrRangeFromJpeg_144_c: 364.2
> chrRangeFromJpeg_144_sse4: 194.9
> chrRangeFromJpeg_256_c: 630.7
> chrRangeFromJpeg_256_sse4: 337.4
> chrRangeFromJpeg_512_c: 1240.4
> chrRangeFromJpeg_512_sse4: 668.4
> chrRangeToJpeg_8_c: 37.7
> chrRangeToJpeg_8_sse4: 19.7
> chrRangeToJpeg_24_c: 114.7
> chrRangeToJpeg_24_sse4: 30.2
> chrRangeToJpeg_128_c: 636.4
> chrRangeToJpeg_128_sse4: 161.7
> chrRangeToJpeg_144_c: 715.7
> chrRangeToJpeg_144_sse4: 272.9
> chrRangeToJpeg_256_c: 1256.7
> chrRangeToJpeg_256_sse4: 341.9
> chrRangeToJpeg_512_c: 2498.7
> chrRangeToJpeg_512_sse4: 668.4
> lumRangeFromJpeg_8_c: 11.7
> lumRangeFromJpeg_8_sse4: 12.4
> lumRangeFromJpeg_24_c: 36.9
> lumRangeFromJpeg_24_sse4: 17.7
> lumRangeFromJpeg_128_c: 228.4
> lumRangeFromJpeg_128_sse4: 85.2
> lumRangeFromJpeg_144_c: 272.9
> lumRangeFromJpeg_144_sse4: 96.9
> lumRangeFromJpeg_256_c: 463.4
> lumRangeFromJpeg_256_sse4: 183.9
> lumRangeFromJpeg_512_c: 879.9
> lumRangeFromJpeg_512_sse4: 355.9
> lumRangeToJpeg_8_c: 17.7
> lumRangeToJpeg_8_sse4: 15.4
> lumRangeToJpeg_24_c: 56.2
> lumRangeToJpeg_24_sse4: 18.4
> lumRangeToJpeg_128_c: 331.4
> lumRangeToJpeg_128_sse4: 84.4
> lumRangeToJpeg_144_c: 375.2
> lumRangeToJpeg_144_sse4: 96.9
> lumRangeToJpeg_256_c: 649.7
> lumRangeToJpeg_256_sse4: 184.4
> lumRangeToJpeg_512_c: 1281.9
> lumRangeToJpeg_512_sse4: 355.9
> ---
>  libswscale/swscale_internal.h    |   1 +
>  libswscale/utils.c               |   2 +
>  libswscale/x86/Makefile          |   1 +
>  libswscale/x86/range_convert.asm | 100 +++++++++++++++++++++++++++++++
>  libswscale/x86/swscale.c         |  36 +++++++++++
>  5 files changed, 140 insertions(+)
>  create mode 100644 libswscale/x86/range_convert.asm
>
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index d4b0c3cee2..92f6105443 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
>
>  av_cold void ff_sws_init_range_convert(SwsContext *c);
>  av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
> +av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
>
>  SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
>  SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index 476a24fea5..8dfa57b5ff 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
>          ff_sws_init_range_convert(c);
>  #if ARCH_LOONGARCH64
>          ff_sws_init_range_convert_loongarch(c);
> +#elif ARCH_X86
> +        ff_sws_init_range_convert_x86(c);
>  #endif
>      }
>
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index 68391494be..f00154941d 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -12,6 +12,7 @@ X86ASM-OBJS                     += x86/input.o                          \
>                                     x86/output.o                         \
>                                     x86/scale.o                          \
>                                     x86/scale_avx2.o                          \
> +                                   x86/range_convert.o                  \
>                                     x86/rgb_2_rgb.o                      \
>                                     x86/yuv_2_rgb.o                      \
>                                     x86/yuv2yuvX.o                       \
> diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
> new file mode 100644
> index 0000000000..333265fb65
> --- /dev/null
> +++ b/libswscale/x86/range_convert.asm
> @@ -0,0 +1,100 @@
> +;******************************************************************************
> +;* Copyright (c) 2024 Ramiro Polla
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +; NOTE: there is no need to clamp the input when converting to jpeg range
> +;       (like we do in the C code) because packssdw will saturate the output.
> +
> +;-----------------------------------------------------------------------------
> +; lumConvertRange
> +;
> +; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
> +; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
> +;
> +;-----------------------------------------------------------------------------
> +
> +%macro LUMCONVERTRANGE 4
> +SECTION_RODATA
> +mult_%1:        times 4 dd %2
> +offset_%1:      times 4 dd %3
> +SECTION .text
> +cglobal %1, 2, 3, 3, dst, width, x
> +    movsxdifnidn widthq, widthd
> +    xor              xq, xq
> +    mova             m1, [mult_%1]
> +    mova             m2, [offset_%1]
> +.loop:
> +    pmovsxwd         m0, [dstq+xq*2]
> +    pmulld           m0, m1
> +    paddd            m0, m2
> +    psrad            m0, %4
> +    packssdw         m0, m0
> +    movh    [dstq+xq*2], m0
> +    add              xq, mmsize / 4
> +    cmp              xd, widthd
> +    jl .loop
> +    RET
> +%endmacro
> +
> +;-----------------------------------------------------------------------------
> +; chrConvertRange
> +;
> +; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
> +; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
> +;
> +;-----------------------------------------------------------------------------
> +
> +%macro CHRCONVERTRANGE 4
> +SECTION_RODATA
> +mult_%1:        times 4 dd %2
> +offset_%1:      times 4 dd %3
> +SECTION .text
> +cglobal %1, 3, 4, 4, dstU, dstV, width, x
> +    movsxdifnidn widthq, widthd
> +    xor              xq, xq
> +    mova             m1, [mult_%1]
> +    mova             m2, [offset_%1]
> +.loop:
> +    pmovsxwd         m0, [dstUq+xq*2]
> +    pmulld           m0, m1
> +    paddd            m0, m2
> +    psrad            m0, %4
> +    packssdw         m0, m0
> +    movh   [dstUq+xq*2], m0
> +    pmovsxwd         m0, [dstVq+xq*2]
> +    pmulld           m0, m1
> +    paddd            m0, m2
> +    psrad            m0, %4
> +    packssdw         m0, m0
> +    movh   [dstVq+xq*2], m0
> +    add              xq, mmsize / 4
> +    cmp              xd, widthd
> +    jl .loop
> +    RET
> +%endmacro
> +
> +%if ARCH_X86_64
> +INIT_XMM sse4
> +LUMCONVERTRANGE lumRangeToJpeg,   19077, -39057361, 14
> +CHRCONVERTRANGE chrRangeToJpeg,    4663,  -9289992, 12
> +LUMCONVERTRANGE lumRangeFromJpeg, 14071,  33561947, 14
> +CHRCONVERTRANGE chrRangeFromJpeg,  1799,   4081085, 11
> +%endif
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index fff8bb4396..c5ddfb5605 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -447,6 +447,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
>  INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
>  #endif
>
> +#if ARCH_X86_64
> +#define RANGE_CONVERT_FUNCS(opt) do {                                       \
> +    if (c->dstBpc <= 14) {                                                  \
> +        if (c->srcRange) {                                                  \
> +            c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt;                \
> +            c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt;                \
> +        } else {                                                            \
> +            c->lumConvertRange = ff_lumRangeToJpeg_ ##opt;                  \
> +            c->chrConvertRange = ff_chrRangeToJpeg_ ##opt;                  \
> +        }                                                                   \
> +    }                                                                       \
> +} while (0)
> +
> +#define RANGE_CONVERT_FUNCS_DECL(opt)                                       \
> +void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width);                   \
> +void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);   \
> +void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width);                     \
> +void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);     \
> +
> +RANGE_CONVERT_FUNCS_DECL(sse4);
> +
> +av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
> +{
> +    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> +        int cpu_flags = av_get_cpu_flags();
> +        if (EXTERNAL_SSE4(cpu_flags)) {
> +            RANGE_CONVERT_FUNCS(sse4);
> +        }
> +    }
> +}
> +#endif
> +
>  av_cold void ff_sws_init_swscale_x86(SwsContext *c)
>  {
>      int cpu_flags = av_get_cpu_flags();
> @@ -805,4 +837,8 @@ switch(c->dstBpc){ \
>      }
>
>  #endif
> +
> +#if ARCH_X86_64
> +    ff_sws_init_range_convert_x86(c);
> +#endif
>  }
> --
> 2.30.2
>

The attached version is slightly different: it moves the constants out
of the macro (so they can be reused by avx2) and processes twice as
much data per loop iteration.

[-- Attachment #2: 0001-swscale-x86-add-sse4-lum-chr-ConvertRange.patch --]
[-- Type: text/x-patch, Size: 10363 bytes --]

From b8f72b1c4c8393becea9962378af6d7dffabbce2 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Thu, 6 Jun 2024 18:33:34 +0200
Subject: [PATCH] swscale/x86: add sse4 {lum,chr}ConvertRange

chrRangeFromJpeg_8_c: 19.9
chrRangeFromJpeg_8_sse4: 16.2
chrRangeFromJpeg_24_c: 60.7
chrRangeFromJpeg_24_sse4: 28.9
chrRangeFromJpeg_128_c: 325.7
chrRangeFromJpeg_128_sse4: 160.2
chrRangeFromJpeg_144_c: 364.2
chrRangeFromJpeg_144_sse4: 194.9
chrRangeFromJpeg_256_c: 630.7
chrRangeFromJpeg_256_sse4: 337.4
chrRangeFromJpeg_512_c: 1240.4
chrRangeFromJpeg_512_sse4: 668.4
chrRangeToJpeg_8_c: 37.7
chrRangeToJpeg_8_sse4: 19.7
chrRangeToJpeg_24_c: 114.7
chrRangeToJpeg_24_sse4: 30.2
chrRangeToJpeg_128_c: 636.4
chrRangeToJpeg_128_sse4: 161.7
chrRangeToJpeg_144_c: 715.7
chrRangeToJpeg_144_sse4: 272.9
chrRangeToJpeg_256_c: 1256.7
chrRangeToJpeg_256_sse4: 341.9
chrRangeToJpeg_512_c: 2498.7
chrRangeToJpeg_512_sse4: 668.4
lumRangeFromJpeg_8_c: 11.7
lumRangeFromJpeg_8_sse4: 12.4
lumRangeFromJpeg_24_c: 36.9
lumRangeFromJpeg_24_sse4: 17.7
lumRangeFromJpeg_128_c: 228.4
lumRangeFromJpeg_128_sse4: 85.2
lumRangeFromJpeg_144_c: 272.9
lumRangeFromJpeg_144_sse4: 96.9
lumRangeFromJpeg_256_c: 463.4
lumRangeFromJpeg_256_sse4: 183.9
lumRangeFromJpeg_512_c: 879.9
lumRangeFromJpeg_512_sse4: 355.9
lumRangeToJpeg_8_c: 17.7
lumRangeToJpeg_8_sse4: 15.4
lumRangeToJpeg_24_c: 56.2
lumRangeToJpeg_24_sse4: 18.4
lumRangeToJpeg_128_c: 331.4
lumRangeToJpeg_128_sse4: 84.4
lumRangeToJpeg_144_c: 375.2
lumRangeToJpeg_144_sse4: 96.9
lumRangeToJpeg_256_c: 649.7
lumRangeToJpeg_256_sse4: 184.4
lumRangeToJpeg_512_c: 1281.9
lumRangeToJpeg_512_sse4: 355.9
---
 libswscale/swscale_internal.h    |   1 +
 libswscale/utils.c               |   2 +
 libswscale/x86/Makefile          |   1 +
 libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
 libswscale/x86/swscale.c         |  36 +++++++++
 5 files changed, 170 insertions(+)
 create mode 100644 libswscale/x86/range_convert.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index d4b0c3cee2..92f6105443 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
 av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 476a24fea5..8dfa57b5ff 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
         ff_sws_init_range_convert(c);
 #if ARCH_LOONGARCH64
         ff_sws_init_range_convert_loongarch(c);
+#elif ARCH_X86
+        ff_sws_init_range_convert_x86(c);
 #endif
     }
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 68391494be..f00154941d 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -12,6 +12,7 @@ X86ASM-OBJS                     += x86/input.o                          \
                                    x86/output.o                         \
                                    x86/scale.o                          \
                                    x86/scale_avx2.o                          \
+                                   x86/range_convert.o                  \
                                    x86/rgb_2_rgb.o                      \
                                    x86/yuv_2_rgb.o                      \
                                    x86/yuv2yuvX.o                       \
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
new file mode 100644
index 0000000000..13983a386b
--- /dev/null
+++ b/libswscale/x86/range_convert.asm
@@ -0,0 +1,130 @@
+;******************************************************************************
+;* Copyright (c) 2024 Ramiro Polla
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+chr_to_mult:        times 4 dd 4663
+chr_to_offset:      times 4 dd -9289992
+%define chr_to_shift 12
+
+chr_from_mult:      times 4 dd 1799
+chr_from_offset:    times 4 dd 4081085
+%define chr_from_shift 11
+
+lum_to_mult:        times 4 dd 19077
+lum_to_offset:      times 4 dd -39057361
+%define lum_to_shift 14
+
+lum_from_mult:      times 4 dd 14071
+lum_from_offset:    times 4 dd 33561947
+%define lum_from_shift 14
+
+SECTION .text
+
+; NOTE: there is no need to clamp the input when converting to jpeg range
+;       (like we do in the C code) because packssdw will saturate the output.
+
+;-----------------------------------------------------------------------------
+; lumConvertRange
+;
+; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
+; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro LUMCONVERTRANGE 4
+cglobal %1, 2, 3, 3, dst, width, x
+    movsxdifnidn widthq, widthd
+    xor              xq, xq
+    mova             m4, [%2]
+    mova             m5, [%3]
+.loop:
+    pmovsxwd         m0, [dstq+xq*2]
+    pmovsxwd         m1, [dstq+xq*2+mmsize/2]
+    pmulld           m0, m4
+    pmulld           m1, m4
+    paddd            m0, m5
+    paddd            m1, m5
+    psrad            m0, %4
+    psrad            m1, %4
+    packssdw         m0, m0
+    packssdw         m1, m1
+    movq    [dstq+xq*2], m0
+    movq    [dstq+xq*2+mmsize/2], m1
+    add              xq, mmsize / 2
+    cmp              xd, widthd
+    jl .loop
+    RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; chrConvertRange
+;
+; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro CHRCONVERTRANGE 4
+cglobal %1, 3, 4, 4, dstU, dstV, width, x
+    movsxdifnidn widthq, widthd
+    xor              xq, xq
+    mova             m4, [%2]
+    mova             m5, [%3]
+.loop:
+    pmovsxwd         m0, [dstUq+xq*2]
+    pmovsxwd         m1, [dstUq+xq*2+mmsize/2]
+    pmovsxwd         m2, [dstVq+xq*2]
+    pmovsxwd         m3, [dstVq+xq*2+mmsize/2]
+    pmulld           m0, m4
+    pmulld           m1, m4
+    pmulld           m2, m4
+    pmulld           m3, m4
+    paddd            m0, m5
+    paddd            m1, m5
+    paddd            m2, m5
+    paddd            m3, m5
+    psrad            m0, %4
+    psrad            m1, %4
+    psrad            m2, %4
+    psrad            m3, %4
+    packssdw         m0, m0
+    packssdw         m1, m1
+    packssdw         m2, m2
+    packssdw         m3, m3
+    movq   [dstUq+xq*2], m0
+    movq   [dstUq+xq*2+mmsize/2], m1
+    movq   [dstVq+xq*2], m2
+    movq   [dstVq+xq*2+mmsize/2], m3
+    add              xq, mmsize / 2
+    cmp              xd, widthd
+    jl .loop
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+LUMCONVERTRANGE lumRangeToJpeg,   lum_to_mult,   lum_to_offset,   lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg,   chr_to_mult,   chr_to_offset,   chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index fff8bb4396..c5ddfb5605 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -447,6 +447,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
 INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
 #endif
 
+#if ARCH_X86_64
+#define RANGE_CONVERT_FUNCS(opt) do {                                       \
+    if (c->dstBpc <= 14) {                                                  \
+        if (c->srcRange) {                                                  \
+            c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt;                \
+            c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt;                \
+        } else {                                                            \
+            c->lumConvertRange = ff_lumRangeToJpeg_ ##opt;                  \
+            c->chrConvertRange = ff_chrRangeToJpeg_ ##opt;                  \
+        }                                                                   \
+    }                                                                       \
+} while (0)
+
+#define RANGE_CONVERT_FUNCS_DECL(opt)                                       \
+void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width);                   \
+void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);   \
+void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width);                     \
+void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width);     \
+
+RANGE_CONVERT_FUNCS_DECL(sse4);
+
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
+{
+    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+        int cpu_flags = av_get_cpu_flags();
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            RANGE_CONVERT_FUNCS(sse4);
+        }
+    }
+}
+#endif
+
 av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -805,4 +837,8 @@ switch(c->dstBpc){ \
     }
 
 #endif
+
+#if ARCH_X86_64
+    ff_sws_init_range_convert_x86(c);
+#endif
 }
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile
  2024-06-07 14:05 [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Ramiro Polla
                   ` (2 preceding siblings ...)
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add neon " Ramiro Polla
@ 2024-06-07 18:45 ` Andreas Rheinhardt
  2024-06-07 19:09   ` Ramiro Polla
  3 siblings, 1 reply; 11+ messages in thread
From: Andreas Rheinhardt @ 2024-06-07 18:45 UTC (permalink / raw)
  To: ffmpeg-devel

Ramiro Polla:
> ---
>  tests/checkasm/Makefile | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index 6eb94d10d5..3ce152e818 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -63,7 +63,9 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
>  CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
>  
>  # swscale tests
> -SWSCALEOBJS                             += sw_gbrp.o sw_rgb.o sw_scale.o
> +SWSCALEOBJS                             += sw_gbrp.o
> +SWSCALEOBJS                             += sw_rgb.o
> +SWSCALEOBJS                             += sw_scale.o
>  
>  CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
>  

We use the multiple-objects-per-line style in all Makefiles.

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile
  2024-06-07 18:45 ` [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Andreas Rheinhardt
@ 2024-06-07 19:09   ` Ramiro Polla
  2024-06-07 19:12     ` Andreas Rheinhardt
  0 siblings, 1 reply; 11+ messages in thread
From: Ramiro Polla @ 2024-06-07 19:09 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 1146 bytes --]

On Fri, Jun 7, 2024 at 8:46 PM Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> wrote:
>
> Ramiro Polla:
> > ---
> >  tests/checkasm/Makefile | 4 +++-
> >  1 file changed, 3 insertions(+), 1 deletion(-)
> >
> > diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> > index 6eb94d10d5..3ce152e818 100644
> > --- a/tests/checkasm/Makefile
> > +++ b/tests/checkasm/Makefile
> > @@ -63,7 +63,9 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
> >  CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
> >
> >  # swscale tests
> > -SWSCALEOBJS                             += sw_gbrp.o sw_rgb.o sw_scale.o
> > +SWSCALEOBJS                             += sw_gbrp.o
> > +SWSCALEOBJS                             += sw_rgb.o
> > +SWSCALEOBJS                             += sw_scale.o
> >
> >  CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
> >
>
> We use the multiple-objects-per-line style in all Makefiles.

Then we should change the following:
libswscale/arm/Makefile (NEON_OBJS)
tests/checkasm/Makefile (AVUTILOBJS)
libavfilter/dnn/Makefile (OBJS-$(CONFIG_DNN))

New patch attached.

[-- Attachment #2: 0001-tests-checkasm-cosmetics-one-object-per-line-in-Make.patch --]
[-- Type: text/x-patch, Size: 938 bytes --]

From 4965ece9648be5da6e93b6bfa319b6a5fe92aee6 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Thu, 6 Jun 2024 15:40:03 +0200
Subject: [PATCH] tests/checkasm: cosmetics, one object per line in Makefile

---
 tests/checkasm/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 6eb94d10d5..c2a41d7f7b 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -63,7 +63,9 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
 # swscale tests
-SWSCALEOBJS                             += sw_gbrp.o sw_rgb.o sw_scale.o
+SWSCALEOBJS                             += sw_gbrp.o \
+                                           sw_rgb.o \
+                                           sw_scale.o \
 
 CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
 
-- 
2.30.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile
  2024-06-07 19:09   ` Ramiro Polla
@ 2024-06-07 19:12     ` Andreas Rheinhardt
  2024-06-07 19:47       ` Ramiro Polla
  0 siblings, 1 reply; 11+ messages in thread
From: Andreas Rheinhardt @ 2024-06-07 19:12 UTC (permalink / raw)
  To: ffmpeg-devel

Ramiro Polla:
>  # swscale tests
> -SWSCALEOBJS                             += sw_gbrp.o sw_rgb.o sw_scale.o
> +SWSCALEOBJS                             += sw_gbrp.o \
> +                                           sw_rgb.o \
> +                                           sw_scale.o \
>  
>  CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)

We typically only use a new line if the old line is full.

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile
  2024-06-07 19:12     ` Andreas Rheinhardt
@ 2024-06-07 19:47       ` Ramiro Polla
  0 siblings, 0 replies; 11+ messages in thread
From: Ramiro Polla @ 2024-06-07 19:47 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Fri, Jun 7, 2024 at 9:27 PM Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> wrote:
>
> Ramiro Polla:
> >  # swscale tests
> > -SWSCALEOBJS                             += sw_gbrp.o sw_rgb.o sw_scale.o
> > +SWSCALEOBJS                             += sw_gbrp.o \
> > +                                           sw_rgb.o \
> > +                                           sw_scale.o \
> >
> >  CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
>
> We typically only use a new line if the old line is full.

There's currently a mix of everything in the Makefiles. One object per
line, multiple objects per line, mix of one or multiple objects per
line in the same statement, aligned and unaligned += between lines,
aligned and unaligned \ at the end of the lines, some have \ at the
last line, some don't...

I personally prefer += one object per line and no \ at the end of the
line everywhere. It makes the code look consistent and the patches are
cleaner and easier to understand. But I don't maintain this, so I have
no strong opinion in this case.

This patch was meant to simplify the next commit (checkasm: add tests
for {lum,chr}ConvertRange), but I can drop it if you prefer.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
  2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add neon " Ramiro Polla
@ 2024-06-10 11:56   ` Martin Storsjö
  2024-06-11 12:33     ` Ramiro Polla
  0 siblings, 1 reply; 11+ messages in thread
From: Martin Storsjö @ 2024-06-10 11:56 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Fri, 7 Jun 2024, Ramiro Polla wrote:

> chrRangeFromJpeg_8_c: 28.5
> chrRangeFromJpeg_8_neon: 21.2
> chrRangeFromJpeg_24_c: 81.2
> chrRangeFromJpeg_24_neon: 34.7
> chrRangeFromJpeg_128_c: 425.2
> chrRangeFromJpeg_128_neon: 162.0
> chrRangeFromJpeg_144_c: 480.2
> chrRangeFromJpeg_144_neon: 180.2
> chrRangeFromJpeg_256_c: 838.2
> chrRangeFromJpeg_256_neon: 318.0
> chrRangeFromJpeg_512_c: 1698.2
> chrRangeFromJpeg_512_neon: 630.0
> chrRangeToJpeg_8_c: 56.0
> chrRangeToJpeg_8_neon: 23.5
> chrRangeToJpeg_24_c: 147.7
> chrRangeToJpeg_24_neon: 38.2
> chrRangeToJpeg_128_c: 760.2
> chrRangeToJpeg_128_neon: 182.5
> chrRangeToJpeg_144_c: 857.7
> chrRangeToJpeg_144_neon: 204.5
> chrRangeToJpeg_256_c: 1504.2
> chrRangeToJpeg_256_neon: 358.5
> chrRangeToJpeg_512_c: 3025.7
> chrRangeToJpeg_512_neon: 710.5
> lumRangeFromJpeg_8_c: 24.0
> lumRangeFromJpeg_8_neon: 18.2
> lumRangeFromJpeg_24_c: 64.0
> lumRangeFromJpeg_24_neon: 22.2
> lumRangeFromJpeg_128_c: 289.2
> lumRangeFromJpeg_128_neon: 79.2
> lumRangeFromJpeg_144_c: 334.7
> lumRangeFromJpeg_144_neon: 87.7
> lumRangeFromJpeg_256_c: 579.5
> lumRangeFromJpeg_256_neon: 152.0
> lumRangeFromJpeg_512_c: 1208.0
> lumRangeFromJpeg_512_neon: 299.0
> lumRangeToJpeg_8_c: 30.0
> lumRangeToJpeg_8_neon: 19.0
> lumRangeToJpeg_24_c: 82.2
> lumRangeToJpeg_24_neon: 24.0
> lumRangeToJpeg_128_c: 440.7
> lumRangeToJpeg_128_neon: 90.5
> lumRangeToJpeg_144_c: 502.0
> lumRangeToJpeg_144_neon: 102.2
> lumRangeToJpeg_256_c: 893.7
> lumRangeToJpeg_256_neon: 178.0
> lumRangeToJpeg_512_c: 1793.7
> lumRangeToJpeg_512_neon: 355.0
> ---
> libswscale/aarch64/Makefile             |   1 +
> libswscale/aarch64/range_convert_neon.S | 103 ++++++++++++++++++++++++
> libswscale/aarch64/swscale.c            |  21 +++++
> libswscale/swscale_internal.h           |   1 +
> libswscale/utils.c                      |   4 +-
> 5 files changed, 129 insertions(+), 1 deletion(-)
> create mode 100644 libswscale/aarch64/range_convert_neon.S
>
> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> index da1d909561..6923827f82 100644
> --- a/libswscale/aarch64/Makefile
> +++ b/libswscale/aarch64/Makefile
> @@ -4,5 +4,6 @@ OBJS        += aarch64/rgb2rgb.o                \
>
> NEON-OBJS   += aarch64/hscale.o                 \
>                aarch64/output.o                 \
> +               aarch64/range_convert_neon.o     \
>                aarch64/rgb2rgb_neon.o           \
>                aarch64/yuv2rgb_neon.o           \
> diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
> new file mode 100644
> index 0000000000..5e104971f0
> --- /dev/null
> +++ b/libswscale/aarch64/range_convert_neon.S
> @@ -0,0 +1,103 @@
> +/*
> + * Copyright (c) 2024 Ramiro Polla
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro lumConvertRange name max mult offset shift

We usually use commas between the macro arguments here. Apparently it 
doesn't make any difference for any of the tools we support, but it would 
be nice for consistency. (When invoking macros, commas between arguments 
are optional for most platforms, but not when targeting Apple platforms, 
so being strict with consistent use of commas is generally good.)

> +const offset_\name, align=4
> +        .word \offset, \offset, \offset, \offset
> +endconst
> +function ff_\name, export=1
> +.if \max != 0
> +        mov             w3, #\max
> +        dup             v24.8h, w3
> +.endif
> +        mov             w3, #\mult
> +        dup             v25.4s, w3
> +        movrel          x3, offset_\name
> +        ld1             {v26.4s}, [x3]

FWIW, I did see that you were recommended this form, over ld1r, based on 
some microarchitectural performance numbers. However in our preexisting 
assembly, manually pre-splatting vectors like this is unusual I would say. 
I don't have a strong opinion on the matter though.

Anyway, the assembly looks reasonable to me.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
  2024-06-10 11:56   ` Martin Storsjö
@ 2024-06-11 12:33     ` Ramiro Polla
  0 siblings, 0 replies; 11+ messages in thread
From: Ramiro Polla @ 2024-06-11 12:33 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Mon, Jun 10, 2024 at 1:56 PM Martin Storsjö <martin@martin.st> wrote:
> On Fri, 7 Jun 2024, Ramiro Polla wrote:
>
> > chrRangeFromJpeg_8_c: 28.5
> > chrRangeFromJpeg_8_neon: 21.2
> > chrRangeFromJpeg_24_c: 81.2
> > chrRangeFromJpeg_24_neon: 34.7
> > chrRangeFromJpeg_128_c: 425.2
> > chrRangeFromJpeg_128_neon: 162.0
> > chrRangeFromJpeg_144_c: 480.2
> > chrRangeFromJpeg_144_neon: 180.2
> > chrRangeFromJpeg_256_c: 838.2
> > chrRangeFromJpeg_256_neon: 318.0
> > chrRangeFromJpeg_512_c: 1698.2
> > chrRangeFromJpeg_512_neon: 630.0
> > chrRangeToJpeg_8_c: 56.0
> > chrRangeToJpeg_8_neon: 23.5
> > chrRangeToJpeg_24_c: 147.7
> > chrRangeToJpeg_24_neon: 38.2
> > chrRangeToJpeg_128_c: 760.2
> > chrRangeToJpeg_128_neon: 182.5
> > chrRangeToJpeg_144_c: 857.7
> > chrRangeToJpeg_144_neon: 204.5
> > chrRangeToJpeg_256_c: 1504.2
> > chrRangeToJpeg_256_neon: 358.5
> > chrRangeToJpeg_512_c: 3025.7
> > chrRangeToJpeg_512_neon: 710.5
> > lumRangeFromJpeg_8_c: 24.0
> > lumRangeFromJpeg_8_neon: 18.2
> > lumRangeFromJpeg_24_c: 64.0
> > lumRangeFromJpeg_24_neon: 22.2
> > lumRangeFromJpeg_128_c: 289.2
> > lumRangeFromJpeg_128_neon: 79.2
> > lumRangeFromJpeg_144_c: 334.7
> > lumRangeFromJpeg_144_neon: 87.7
> > lumRangeFromJpeg_256_c: 579.5
> > lumRangeFromJpeg_256_neon: 152.0
> > lumRangeFromJpeg_512_c: 1208.0
> > lumRangeFromJpeg_512_neon: 299.0
> > lumRangeToJpeg_8_c: 30.0
> > lumRangeToJpeg_8_neon: 19.0
> > lumRangeToJpeg_24_c: 82.2
> > lumRangeToJpeg_24_neon: 24.0
> > lumRangeToJpeg_128_c: 440.7
> > lumRangeToJpeg_128_neon: 90.5
> > lumRangeToJpeg_144_c: 502.0
> > lumRangeToJpeg_144_neon: 102.2
> > lumRangeToJpeg_256_c: 893.7
> > lumRangeToJpeg_256_neon: 178.0
> > lumRangeToJpeg_512_c: 1793.7
> > lumRangeToJpeg_512_neon: 355.0
> > ---
> > libswscale/aarch64/Makefile             |   1 +
> > libswscale/aarch64/range_convert_neon.S | 103 ++++++++++++++++++++++++
> > libswscale/aarch64/swscale.c            |  21 +++++
> > libswscale/swscale_internal.h           |   1 +
> > libswscale/utils.c                      |   4 +-
> > 5 files changed, 129 insertions(+), 1 deletion(-)
> > create mode 100644 libswscale/aarch64/range_convert_neon.S
> >
> > diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> > index da1d909561..6923827f82 100644
> > --- a/libswscale/aarch64/Makefile
> > +++ b/libswscale/aarch64/Makefile
> > @@ -4,5 +4,6 @@ OBJS        += aarch64/rgb2rgb.o                \
> >
> > NEON-OBJS   += aarch64/hscale.o                 \
> >                aarch64/output.o                 \
> > +               aarch64/range_convert_neon.o     \
> >                aarch64/rgb2rgb_neon.o           \
> >                aarch64/yuv2rgb_neon.o           \
> > diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
> > new file mode 100644
> > index 0000000000..5e104971f0
> > --- /dev/null
> > +++ b/libswscale/aarch64/range_convert_neon.S
> > @@ -0,0 +1,103 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +.macro lumConvertRange name max mult offset shift
>
> We usually use commas between the macro arguments here. Apparently it
> doesn't make any difference for any of the tools we support, but it would
> be nice for consistency. (When invoking macros, commas between arguments
> are optional for most platforms, but not when targeting Apple platforms,
> so being strict with consistent use of commas is generally good.)

Fixed in the new patchset.

> > +const offset_\name, align=4
> > +        .word \offset, \offset, \offset, \offset
> > +endconst
> > +function ff_\name, export=1
> > +.if \max != 0
> > +        mov             w3, #\max
> > +        dup             v24.8h, w3
> > +.endif
> > +        mov             w3, #\mult
> > +        dup             v25.4s, w3
> > +        movrel          x3, offset_\name
> > +        ld1             {v26.4s}, [x3]
>
> FWIW, I did see that you were recommended this form, over ld1r, based on
> some microarchitectural performance numbers. However in our preexisting
> assembly, manually pre-splatting vectors like this is unusual I would say.
> I don't have a strong opinion on the matter though.
>
> Anyway, the assembly looks reasonable to me.

I changed it to movz/movk/dup in the new patchset (tested on a Raspberry
Pi 5, but not on macOS).

Thanks,
Ramiro
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2024-06-11 12:33 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-07 14:05 [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Ramiro Polla
2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 2/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 3/4] swscale/x86: add sse4 " Ramiro Polla
2024-06-07 17:38   ` Ramiro Polla
2024-06-07 14:05 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add neon " Ramiro Polla
2024-06-10 11:56   ` Martin Storsjö
2024-06-11 12:33     ` Ramiro Polla
2024-06-07 18:45 ` [FFmpeg-devel] [PATCH 1/4] tests/checkasm: cosmetics, one object per line in Makefile Andreas Rheinhardt
2024-06-07 19:09   ` Ramiro Polla
2024-06-07 19:12     ` Andreas Rheinhardt
2024-06-07 19:47       ` Ramiro Polla

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git