* [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
@ 2024-06-11 12:28 ` Ramiro Polla
2024-06-11 12:32 ` James Almer
2024-06-11 18:26 ` Michael Niedermayer
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 3/4] swscale/x86: add avx2 " Ramiro Polla
` (2 subsequent siblings)
3 siblings, 2 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-11 12:28 UTC (permalink / raw)
To: ffmpeg-devel
chrRangeFromJpeg_8_c: 28.7
chrRangeFromJpeg_8_sse4: 16.2
chrRangeFromJpeg_24_c: 152.7
chrRangeFromJpeg_24_sse4: 29.7
chrRangeFromJpeg_128_c: 366.5
chrRangeFromJpeg_128_sse4: 233.0
chrRangeFromJpeg_144_c: 408.0
chrRangeFromJpeg_144_sse4: 182.5
chrRangeFromJpeg_256_c: 698.7
chrRangeFromJpeg_256_sse4: 325.5
chrRangeFromJpeg_512_c: 1348.7
chrRangeFromJpeg_512_sse4: 660.2
chrRangeToJpeg_8_c: 37.7
chrRangeToJpeg_8_sse4: 16.2
chrRangeToJpeg_24_c: 115.7
chrRangeToJpeg_24_sse4: 36.2
chrRangeToJpeg_128_c: 631.2
chrRangeToJpeg_128_sse4: 163.7
chrRangeToJpeg_144_c: 710.7
chrRangeToJpeg_144_sse4: 183.0
chrRangeToJpeg_256_c: 1253.0
chrRangeToJpeg_256_sse4: 343.5
chrRangeToJpeg_512_c: 2491.2
chrRangeToJpeg_512_sse4: 654.2
lumRangeFromJpeg_8_c: 11.7
lumRangeFromJpeg_8_sse4: 10.5
lumRangeFromJpeg_24_c: 38.5
lumRangeFromJpeg_24_sse4: 19.0
lumRangeFromJpeg_128_c: 237.5
lumRangeFromJpeg_128_sse4: 79.2
lumRangeFromJpeg_144_c: 255.7
lumRangeFromJpeg_144_sse4: 90.5
lumRangeFromJpeg_256_c: 441.5
lumRangeFromJpeg_256_sse4: 161.7
lumRangeFromJpeg_512_c: 879.0
lumRangeFromJpeg_512_sse4: 333.2
lumRangeToJpeg_8_c: 20.0
lumRangeToJpeg_8_sse4: 11.7
lumRangeToJpeg_24_c: 61.5
lumRangeToJpeg_24_sse4: 17.7
lumRangeToJpeg_128_c: 357.5
lumRangeToJpeg_128_sse4: 80.0
lumRangeToJpeg_144_c: 371.5
lumRangeToJpeg_144_sse4: 93.2
lumRangeToJpeg_256_c: 651.5
lumRangeToJpeg_256_sse4: 164.5
lumRangeToJpeg_512_c: 1279.0
lumRangeToJpeg_512_sse4: 333.7
---
libswscale/swscale_internal.h | 1 +
libswscale/utils.c | 2 +
libswscale/x86/Makefile | 1 +
libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
libswscale/x86/swscale.c | 36 +++++++++
5 files changed, 170 insertions(+)
create mode 100644 libswscale/x86/range_convert.asm
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 5007dd422f..d5e7b5e71c 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
av_cold void ff_sws_init_range_convert(SwsContext *c);
av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 476a24fea5..8dfa57b5ff 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
ff_sws_init_range_convert(c);
#if ARCH_LOONGARCH64
ff_sws_init_range_convert_loongarch(c);
+#elif ARCH_X86
+ ff_sws_init_range_convert_x86(c);
#endif
}
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 68391494be..f00154941d 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -12,6 +12,7 @@ X86ASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \
x86/scale_avx2.o \
+ x86/range_convert.o \
x86/rgb_2_rgb.o \
x86/yuv_2_rgb.o \
x86/yuv2yuvX.o \
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
new file mode 100644
index 0000000000..13983a386b
--- /dev/null
+++ b/libswscale/x86/range_convert.asm
@@ -0,0 +1,130 @@
+;******************************************************************************
+;* Copyright (c) 2024 Ramiro Polla
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+chr_to_mult: times 4 dd 4663
+chr_to_offset: times 4 dd -9289992
+%define chr_to_shift 12
+
+chr_from_mult: times 4 dd 1799
+chr_from_offset: times 4 dd 4081085
+%define chr_from_shift 11
+
+lum_to_mult: times 4 dd 19077
+lum_to_offset: times 4 dd -39057361
+%define lum_to_shift 14
+
+lum_from_mult: times 4 dd 14071
+lum_from_offset: times 4 dd 33561947
+%define lum_from_shift 14
+
+SECTION .text
+
+; NOTE: there is no need to clamp the input when converting to jpeg range
+; (like we do in the C code) because packssdw will saturate the output.
+
+;-----------------------------------------------------------------------------
+; lumConvertRange
+;
+; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
+; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro LUMCONVERTRANGE 4
+cglobal %1, 2, 3, 3, dst, width, x
+ movsxdifnidn widthq, widthd
+ xor xq, xq
+ mova m4, [%2]
+ mova m5, [%3]
+.loop:
+ pmovsxwd m0, [dstq+xq*2]
+ pmovsxwd m1, [dstq+xq*2+mmsize/2]
+ pmulld m0, m4
+ pmulld m1, m4
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, %4
+ psrad m1, %4
+ packssdw m0, m0
+ packssdw m1, m1
+ movq [dstq+xq*2], m0
+ movq [dstq+xq*2+mmsize/2], m1
+ add xq, mmsize / 2
+ cmp xd, widthd
+ jl .loop
+ RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; chrConvertRange
+;
+; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+;
+;-----------------------------------------------------------------------------
+
+%macro CHRCONVERTRANGE 4
+cglobal %1, 3, 4, 4, dstU, dstV, width, x
+ movsxdifnidn widthq, widthd
+ xor xq, xq
+ mova m4, [%2]
+ mova m5, [%3]
+.loop:
+ pmovsxwd m0, [dstUq+xq*2]
+ pmovsxwd m1, [dstUq+xq*2+mmsize/2]
+ pmovsxwd m2, [dstVq+xq*2]
+ pmovsxwd m3, [dstVq+xq*2+mmsize/2]
+ pmulld m0, m4
+ pmulld m1, m4
+ pmulld m2, m4
+ pmulld m3, m4
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ paddd m3, m5
+ psrad m0, %4
+ psrad m1, %4
+ psrad m2, %4
+ psrad m3, %4
+ packssdw m0, m0
+ packssdw m1, m1
+ packssdw m2, m2
+ packssdw m3, m3
+ movq [dstUq+xq*2], m0
+ movq [dstUq+xq*2+mmsize/2], m1
+ movq [dstVq+xq*2], m2
+ movq [dstVq+xq*2+mmsize/2], m3
+ add xq, mmsize / 2
+ cmp xd, widthd
+ jl .loop
+ RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 5a9da23265..8f477b7b72 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -453,6 +453,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
#endif
+#if ARCH_X86_64
+#define RANGE_CONVERT_FUNCS(opt) do { \
+ if (c->dstBpc <= 14) { \
+ if (c->srcRange) { \
+ c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \
+ c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \
+ } else { \
+ c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \
+ c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \
+ } \
+ } \
+} while (0)
+
+#define RANGE_CONVERT_FUNCS_DECL(opt) \
+void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \
+void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
+void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \
+void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
+
+RANGE_CONVERT_FUNCS_DECL(sse4);
+
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
+{
+ if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+ int cpu_flags = av_get_cpu_flags();
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ RANGE_CONVERT_FUNCS(sse4);
+ }
+ }
+}
+#endif
+
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -820,4 +852,8 @@ switch(c->dstBpc){ \
}
#endif
+
+#if ARCH_X86_64
+ ff_sws_init_range_convert_x86(c);
+#endif
}
--
2.30.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
@ 2024-06-11 12:32 ` James Almer
2024-06-11 18:26 ` Michael Niedermayer
1 sibling, 0 replies; 12+ messages in thread
From: James Almer @ 2024-06-11 12:32 UTC (permalink / raw)
To: ffmpeg-devel
On 6/11/2024 9:28 AM, Ramiro Polla wrote:
> chrRangeFromJpeg_8_c: 28.7
> chrRangeFromJpeg_8_sse4: 16.2
> chrRangeFromJpeg_24_c: 152.7
> chrRangeFromJpeg_24_sse4: 29.7
> chrRangeFromJpeg_128_c: 366.5
> chrRangeFromJpeg_128_sse4: 233.0
> chrRangeFromJpeg_144_c: 408.0
> chrRangeFromJpeg_144_sse4: 182.5
> chrRangeFromJpeg_256_c: 698.7
> chrRangeFromJpeg_256_sse4: 325.5
> chrRangeFromJpeg_512_c: 1348.7
> chrRangeFromJpeg_512_sse4: 660.2
> chrRangeToJpeg_8_c: 37.7
> chrRangeToJpeg_8_sse4: 16.2
> chrRangeToJpeg_24_c: 115.7
> chrRangeToJpeg_24_sse4: 36.2
> chrRangeToJpeg_128_c: 631.2
> chrRangeToJpeg_128_sse4: 163.7
> chrRangeToJpeg_144_c: 710.7
> chrRangeToJpeg_144_sse4: 183.0
> chrRangeToJpeg_256_c: 1253.0
> chrRangeToJpeg_256_sse4: 343.5
> chrRangeToJpeg_512_c: 2491.2
> chrRangeToJpeg_512_sse4: 654.2
> lumRangeFromJpeg_8_c: 11.7
> lumRangeFromJpeg_8_sse4: 10.5
> lumRangeFromJpeg_24_c: 38.5
> lumRangeFromJpeg_24_sse4: 19.0
> lumRangeFromJpeg_128_c: 237.5
> lumRangeFromJpeg_128_sse4: 79.2
> lumRangeFromJpeg_144_c: 255.7
> lumRangeFromJpeg_144_sse4: 90.5
> lumRangeFromJpeg_256_c: 441.5
> lumRangeFromJpeg_256_sse4: 161.7
> lumRangeFromJpeg_512_c: 879.0
> lumRangeFromJpeg_512_sse4: 333.2
> lumRangeToJpeg_8_c: 20.0
> lumRangeToJpeg_8_sse4: 11.7
> lumRangeToJpeg_24_c: 61.5
> lumRangeToJpeg_24_sse4: 17.7
> lumRangeToJpeg_128_c: 357.5
> lumRangeToJpeg_128_sse4: 80.0
> lumRangeToJpeg_144_c: 371.5
> lumRangeToJpeg_144_sse4: 93.2
> lumRangeToJpeg_256_c: 651.5
> lumRangeToJpeg_256_sse4: 164.5
> lumRangeToJpeg_512_c: 1279.0
> lumRangeToJpeg_512_sse4: 333.7
> ---
> libswscale/swscale_internal.h | 1 +
> libswscale/utils.c | 2 +
> libswscale/x86/Makefile | 1 +
> libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
> libswscale/x86/swscale.c | 36 +++++++++
> 5 files changed, 170 insertions(+)
> create mode 100644 libswscale/x86/range_convert.asm
>
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index 5007dd422f..d5e7b5e71c 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
>
> av_cold void ff_sws_init_range_convert(SwsContext *c);
> av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
> +av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
>
> SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
> SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index 476a24fea5..8dfa57b5ff 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
> ff_sws_init_range_convert(c);
> #if ARCH_LOONGARCH64
> ff_sws_init_range_convert_loongarch(c);
> +#elif ARCH_X86
> + ff_sws_init_range_convert_x86(c);
> #endif
> }
>
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index 68391494be..f00154941d 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -12,6 +12,7 @@ X86ASM-OBJS += x86/input.o \
> x86/output.o \
> x86/scale.o \
> x86/scale_avx2.o \
> + x86/range_convert.o \
> x86/rgb_2_rgb.o \
> x86/yuv_2_rgb.o \
> x86/yuv2yuvX.o \
> diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
> new file mode 100644
> index 0000000000..13983a386b
> --- /dev/null
> +++ b/libswscale/x86/range_convert.asm
> @@ -0,0 +1,130 @@
> +;******************************************************************************
> +;* Copyright (c) 2024 Ramiro Polla
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +chr_to_mult: times 4 dd 4663
> +chr_to_offset: times 4 dd -9289992
> +%define chr_to_shift 12
> +
> +chr_from_mult: times 4 dd 1799
> +chr_from_offset: times 4 dd 4081085
> +%define chr_from_shift 11
> +
> +lum_to_mult: times 4 dd 19077
> +lum_to_offset: times 4 dd -39057361
> +%define lum_to_shift 14
> +
> +lum_from_mult: times 4 dd 14071
> +lum_from_offset: times 4 dd 33561947
> +%define lum_from_shift 14
> +
> +SECTION .text
> +
> +; NOTE: there is no need to clamp the input when converting to jpeg range
> +; (like we do in the C code) because packssdw will saturate the output.
> +
> +;-----------------------------------------------------------------------------
> +; lumConvertRange
> +;
> +; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
> +; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
> +;
> +;-----------------------------------------------------------------------------
> +
> +%macro LUMCONVERTRANGE 4
> +cglobal %1, 2, 3, 3, dst, width, x
> + movsxdifnidn widthq, widthd
> + xor xq, xq
> + mova m4, [%2]
> + mova m5, [%3]
> +.loop:
> + pmovsxwd m0, [dstq+xq*2]
> + pmovsxwd m1, [dstq+xq*2+mmsize/2]
> + pmulld m0, m4
> + pmulld m1, m4
Can't you use pmaddwd without sign extending to dword instead? pmulld is
pretty slow.
> + paddd m0, m5
> + paddd m1, m5
> + psrad m0, %4
> + psrad m1, %4
> + packssdw m0, m0
> + packssdw m1, m1
> + movq [dstq+xq*2], m0
> + movq [dstq+xq*2+mmsize/2], m1
> + add xq, mmsize / 2
> + cmp xd, widthd
> + jl .loop
> + RET
> +%endmacro
> +
> +;-----------------------------------------------------------------------------
> +; chrConvertRange
> +;
> +; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
> +; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
> +;
> +;-----------------------------------------------------------------------------
> +
> +%macro CHRCONVERTRANGE 4
> +cglobal %1, 3, 4, 4, dstU, dstV, width, x
> + movsxdifnidn widthq, widthd
> + xor xq, xq
> + mova m4, [%2]
> + mova m5, [%3]
> +.loop:
> + pmovsxwd m0, [dstUq+xq*2]
> + pmovsxwd m1, [dstUq+xq*2+mmsize/2]
> + pmovsxwd m2, [dstVq+xq*2]
> + pmovsxwd m3, [dstVq+xq*2+mmsize/2]
> + pmulld m0, m4
> + pmulld m1, m4
> + pmulld m2, m4
> + pmulld m3, m4
> + paddd m0, m5
> + paddd m1, m5
> + paddd m2, m5
> + paddd m3, m5
> + psrad m0, %4
> + psrad m1, %4
> + psrad m2, %4
> + psrad m3, %4
> + packssdw m0, m0
> + packssdw m1, m1
> + packssdw m2, m2
> + packssdw m3, m3
> + movq [dstUq+xq*2], m0
> + movq [dstUq+xq*2+mmsize/2], m1
> + movq [dstVq+xq*2], m2
> + movq [dstVq+xq*2+mmsize/2], m3
> + add xq, mmsize / 2
> + cmp xd, widthd
> + jl .loop
> + RET
> +%endmacro
> +
> +%if ARCH_X86_64
> +INIT_XMM sse4
> +LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
> +CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
> +LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
> +CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
> +%endif
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 5a9da23265..8f477b7b72 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -453,6 +453,38 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
> INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
> #endif
>
> +#if ARCH_X86_64
> +#define RANGE_CONVERT_FUNCS(opt) do { \
> + if (c->dstBpc <= 14) { \
> + if (c->srcRange) { \
> + c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \
> + c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \
> + } else { \
> + c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \
> + c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \
> + } \
> + } \
> +} while (0)
> +
> +#define RANGE_CONVERT_FUNCS_DECL(opt) \
> +void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \
> +void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
> +void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \
> +void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
> +
> +RANGE_CONVERT_FUNCS_DECL(sse4);
> +
> +av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
> +{
> + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> + int cpu_flags = av_get_cpu_flags();
> + if (EXTERNAL_SSE4(cpu_flags)) {
> + RANGE_CONVERT_FUNCS(sse4);
> + }
> + }
> +}
> +#endif
> +
> av_cold void ff_sws_init_swscale_x86(SwsContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -820,4 +852,8 @@ switch(c->dstBpc){ \
> }
>
> #endif
> +
> +#if ARCH_X86_64
> + ff_sws_init_range_convert_x86(c);
> +#endif
> }
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
2024-06-11 12:32 ` James Almer
@ 2024-06-11 18:26 ` Michael Niedermayer
2024-06-11 18:43 ` James Almer
1 sibling, 1 reply; 12+ messages in thread
From: Michael Niedermayer @ 2024-06-11 18:26 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 2648 bytes --]
On Tue, Jun 11, 2024 at 02:28:56PM +0200, Ramiro Polla wrote:
> chrRangeFromJpeg_8_c: 28.7
> chrRangeFromJpeg_8_sse4: 16.2
> chrRangeFromJpeg_24_c: 152.7
> chrRangeFromJpeg_24_sse4: 29.7
> chrRangeFromJpeg_128_c: 366.5
> chrRangeFromJpeg_128_sse4: 233.0
> chrRangeFromJpeg_144_c: 408.0
> chrRangeFromJpeg_144_sse4: 182.5
> chrRangeFromJpeg_256_c: 698.7
> chrRangeFromJpeg_256_sse4: 325.5
> chrRangeFromJpeg_512_c: 1348.7
> chrRangeFromJpeg_512_sse4: 660.2
> chrRangeToJpeg_8_c: 37.7
> chrRangeToJpeg_8_sse4: 16.2
> chrRangeToJpeg_24_c: 115.7
> chrRangeToJpeg_24_sse4: 36.2
> chrRangeToJpeg_128_c: 631.2
> chrRangeToJpeg_128_sse4: 163.7
> chrRangeToJpeg_144_c: 710.7
> chrRangeToJpeg_144_sse4: 183.0
> chrRangeToJpeg_256_c: 1253.0
> chrRangeToJpeg_256_sse4: 343.5
> chrRangeToJpeg_512_c: 2491.2
> chrRangeToJpeg_512_sse4: 654.2
> lumRangeFromJpeg_8_c: 11.7
> lumRangeFromJpeg_8_sse4: 10.5
> lumRangeFromJpeg_24_c: 38.5
> lumRangeFromJpeg_24_sse4: 19.0
> lumRangeFromJpeg_128_c: 237.5
> lumRangeFromJpeg_128_sse4: 79.2
> lumRangeFromJpeg_144_c: 255.7
> lumRangeFromJpeg_144_sse4: 90.5
> lumRangeFromJpeg_256_c: 441.5
> lumRangeFromJpeg_256_sse4: 161.7
> lumRangeFromJpeg_512_c: 879.0
> lumRangeFromJpeg_512_sse4: 333.2
> lumRangeToJpeg_8_c: 20.0
> lumRangeToJpeg_8_sse4: 11.7
> lumRangeToJpeg_24_c: 61.5
> lumRangeToJpeg_24_sse4: 17.7
> lumRangeToJpeg_128_c: 357.5
> lumRangeToJpeg_128_sse4: 80.0
> lumRangeToJpeg_144_c: 371.5
> lumRangeToJpeg_144_sse4: 93.2
> lumRangeToJpeg_256_c: 651.5
> lumRangeToJpeg_256_sse4: 164.5
> lumRangeToJpeg_512_c: 1279.0
> lumRangeToJpeg_512_sse4: 333.7
> ---
> libswscale/swscale_internal.h | 1 +
> libswscale/utils.c | 2 +
> libswscale/x86/Makefile | 1 +
> libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
> libswscale/x86/swscale.c | 36 +++++++++
> 5 files changed, 170 insertions(+)
> create mode 100644 libswscale/x86/range_convert.asm
breaks x86-32 build
LD ffmpeg_g
/usr/lib/gcc-cross/i686-linux-gnu/7/../../../../i686-linux-gnu/bin/ld: libswscale/libswscale.a(utils.o): in function `sws_setColorspaceDetails':
ffmpeg/linux32/src/libswscale/utils.c:1086: undefined reference to `ff_sws_init_range_convert_x86'
collect2: error: ld returned 1 exit status
make: *** [Makefile:139: ffmpeg_g] Error 1
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
What's the most stupid thing your enemy could do? Blow himself up.
What's the most stupid thing you could do? Give up your rights and
freedom because your enemy blew himself up.
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
2024-06-11 18:26 ` Michael Niedermayer
@ 2024-06-11 18:43 ` James Almer
2024-06-12 14:54 ` Ramiro Polla
0 siblings, 1 reply; 12+ messages in thread
From: James Almer @ 2024-06-11 18:43 UTC (permalink / raw)
To: ffmpeg-devel
On 6/11/2024 3:26 PM, Michael Niedermayer wrote:
> On Tue, Jun 11, 2024 at 02:28:56PM +0200, Ramiro Polla wrote:
>> chrRangeFromJpeg_8_c: 28.7
>> chrRangeFromJpeg_8_sse4: 16.2
>> chrRangeFromJpeg_24_c: 152.7
>> chrRangeFromJpeg_24_sse4: 29.7
>> chrRangeFromJpeg_128_c: 366.5
>> chrRangeFromJpeg_128_sse4: 233.0
>> chrRangeFromJpeg_144_c: 408.0
>> chrRangeFromJpeg_144_sse4: 182.5
>> chrRangeFromJpeg_256_c: 698.7
>> chrRangeFromJpeg_256_sse4: 325.5
>> chrRangeFromJpeg_512_c: 1348.7
>> chrRangeFromJpeg_512_sse4: 660.2
>> chrRangeToJpeg_8_c: 37.7
>> chrRangeToJpeg_8_sse4: 16.2
>> chrRangeToJpeg_24_c: 115.7
>> chrRangeToJpeg_24_sse4: 36.2
>> chrRangeToJpeg_128_c: 631.2
>> chrRangeToJpeg_128_sse4: 163.7
>> chrRangeToJpeg_144_c: 710.7
>> chrRangeToJpeg_144_sse4: 183.0
>> chrRangeToJpeg_256_c: 1253.0
>> chrRangeToJpeg_256_sse4: 343.5
>> chrRangeToJpeg_512_c: 2491.2
>> chrRangeToJpeg_512_sse4: 654.2
>> lumRangeFromJpeg_8_c: 11.7
>> lumRangeFromJpeg_8_sse4: 10.5
>> lumRangeFromJpeg_24_c: 38.5
>> lumRangeFromJpeg_24_sse4: 19.0
>> lumRangeFromJpeg_128_c: 237.5
>> lumRangeFromJpeg_128_sse4: 79.2
>> lumRangeFromJpeg_144_c: 255.7
>> lumRangeFromJpeg_144_sse4: 90.5
>> lumRangeFromJpeg_256_c: 441.5
>> lumRangeFromJpeg_256_sse4: 161.7
>> lumRangeFromJpeg_512_c: 879.0
>> lumRangeFromJpeg_512_sse4: 333.2
>> lumRangeToJpeg_8_c: 20.0
>> lumRangeToJpeg_8_sse4: 11.7
>> lumRangeToJpeg_24_c: 61.5
>> lumRangeToJpeg_24_sse4: 17.7
>> lumRangeToJpeg_128_c: 357.5
>> lumRangeToJpeg_128_sse4: 80.0
>> lumRangeToJpeg_144_c: 371.5
>> lumRangeToJpeg_144_sse4: 93.2
>> lumRangeToJpeg_256_c: 651.5
>> lumRangeToJpeg_256_sse4: 164.5
>> lumRangeToJpeg_512_c: 1279.0
>> lumRangeToJpeg_512_sse4: 333.7
>> ---
>> libswscale/swscale_internal.h | 1 +
>> libswscale/utils.c | 2 +
>> libswscale/x86/Makefile | 1 +
>> libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
>> libswscale/x86/swscale.c | 36 +++++++++
>> 5 files changed, 170 insertions(+)
>> create mode 100644 libswscale/x86/range_convert.asm
>
> breaks x86-32 build
>
> LD ffmpeg_g
> /usr/lib/gcc-cross/i686-linux-gnu/7/../../../../i686-linux-gnu/bin/ld: libswscale/libswscale.a(utils.o): in function `sws_setColorspaceDetails':
> ffmpeg/linux32/src/libswscale/utils.c:1086: undefined reference to `ff_sws_init_range_convert_x86'
> collect2: error: ld returned 1 exit status
> make: *** [Makefile:139: ffmpeg_g] Error 1
>
> thx
The functions are wrapped in ARCH_X86_64 checks for seemingly no reason,
so those checks should be removed in the next iteration.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
2024-06-11 18:43 ` James Almer
@ 2024-06-12 14:54 ` Ramiro Polla
2024-06-14 15:46 ` Ramiro Polla
0 siblings, 1 reply; 12+ messages in thread
From: Ramiro Polla @ 2024-06-12 14:54 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 3015 bytes --]
Hi,
On Tue, Jun 11, 2024 at 8:42 PM James Almer <jamrial@gmail.com> wrote:
>
> On 6/11/2024 3:26 PM, Michael Niedermayer wrote:
> > On Tue, Jun 11, 2024 at 02:28:56PM +0200, Ramiro Polla wrote:
> >> chrRangeFromJpeg_8_c: 28.7
> >> chrRangeFromJpeg_8_sse4: 16.2
> >> chrRangeFromJpeg_24_c: 152.7
> >> chrRangeFromJpeg_24_sse4: 29.7
> >> chrRangeFromJpeg_128_c: 366.5
> >> chrRangeFromJpeg_128_sse4: 233.0
> >> chrRangeFromJpeg_144_c: 408.0
> >> chrRangeFromJpeg_144_sse4: 182.5
> >> chrRangeFromJpeg_256_c: 698.7
> >> chrRangeFromJpeg_256_sse4: 325.5
> >> chrRangeFromJpeg_512_c: 1348.7
> >> chrRangeFromJpeg_512_sse4: 660.2
> >> chrRangeToJpeg_8_c: 37.7
> >> chrRangeToJpeg_8_sse4: 16.2
> >> chrRangeToJpeg_24_c: 115.7
> >> chrRangeToJpeg_24_sse4: 36.2
> >> chrRangeToJpeg_128_c: 631.2
> >> chrRangeToJpeg_128_sse4: 163.7
> >> chrRangeToJpeg_144_c: 710.7
> >> chrRangeToJpeg_144_sse4: 183.0
> >> chrRangeToJpeg_256_c: 1253.0
> >> chrRangeToJpeg_256_sse4: 343.5
> >> chrRangeToJpeg_512_c: 2491.2
> >> chrRangeToJpeg_512_sse4: 654.2
> >> lumRangeFromJpeg_8_c: 11.7
> >> lumRangeFromJpeg_8_sse4: 10.5
> >> lumRangeFromJpeg_24_c: 38.5
> >> lumRangeFromJpeg_24_sse4: 19.0
> >> lumRangeFromJpeg_128_c: 237.5
> >> lumRangeFromJpeg_128_sse4: 79.2
> >> lumRangeFromJpeg_144_c: 255.7
> >> lumRangeFromJpeg_144_sse4: 90.5
> >> lumRangeFromJpeg_256_c: 441.5
> >> lumRangeFromJpeg_256_sse4: 161.7
> >> lumRangeFromJpeg_512_c: 879.0
> >> lumRangeFromJpeg_512_sse4: 333.2
> >> lumRangeToJpeg_8_c: 20.0
> >> lumRangeToJpeg_8_sse4: 11.7
> >> lumRangeToJpeg_24_c: 61.5
> >> lumRangeToJpeg_24_sse4: 17.7
> >> lumRangeToJpeg_128_c: 357.5
> >> lumRangeToJpeg_128_sse4: 80.0
> >> lumRangeToJpeg_144_c: 371.5
> >> lumRangeToJpeg_144_sse4: 93.2
> >> lumRangeToJpeg_256_c: 651.5
> >> lumRangeToJpeg_256_sse4: 164.5
> >> lumRangeToJpeg_512_c: 1279.0
> >> lumRangeToJpeg_512_sse4: 333.7
> >> ---
> >> libswscale/swscale_internal.h | 1 +
> >> libswscale/utils.c | 2 +
> >> libswscale/x86/Makefile | 1 +
> >> libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
> >> libswscale/x86/swscale.c | 36 +++++++++
> >> 5 files changed, 170 insertions(+)
> >> create mode 100644 libswscale/x86/range_convert.asm
> >
> > breaks x86-32 build
> >
> > LD ffmpeg_g
> > /usr/lib/gcc-cross/i686-linux-gnu/7/../../../../i686-linux-gnu/bin/ld: libswscale/libswscale.a(utils.o): in function `sws_setColorspaceDetails':
> > ffmpeg/linux32/src/libswscale/utils.c:1086: undefined reference to `ff_sws_init_range_convert_x86'
> > collect2: error: ld returned 1 exit status
> > make: *** [Makefile:139: ffmpeg_g] Error 1
> >
> > thx
>
> The functions are wrapped in ARCH_X86_64 checks for seemingly no reason,
> so those checks should be removed in the next iteration.
Fixed.
James walked me through optimizing and improving the functions on IRC
so that they work with both sse2 and avx2. New patch attached.
[-- Attachment #2: 0001-swscale-x86-add-sse2-and-avx2-lum-chr-ConvertRange.patch --]
[-- Type: text/x-patch, Size: 11420 bytes --]
From 9e49e72f6766e96cc06bec869fb776fff4c477bf Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Thu, 6 Jun 2024 18:33:34 +0200
Subject: [PATCH] swscale/x86: add sse2 and avx2 {lum,chr}ConvertRange
chrRangeFromJpeg_8_c: 22.3
chrRangeFromJpeg_8_sse2: 13.3
chrRangeFromJpeg_8_avx2: 13.3
chrRangeFromJpeg_24_c: 72.8
chrRangeFromJpeg_24_sse2: 22.3
chrRangeFromJpeg_24_avx2: 17.5
chrRangeFromJpeg_128_c: 345.5
chrRangeFromJpeg_128_sse2: 106.0
chrRangeFromJpeg_128_avx2: 57.8
chrRangeFromJpeg_144_c: 380.5
chrRangeFromJpeg_144_sse2: 118.5
chrRangeFromJpeg_144_avx2: 62.3
chrRangeFromJpeg_256_c: 646.3
chrRangeFromJpeg_256_sse2: 218.8
chrRangeFromJpeg_256_avx2: 109.0
chrRangeFromJpeg_512_c: 1461.5
chrRangeFromJpeg_512_sse2: 426.5
chrRangeFromJpeg_512_avx2: 211.5
chrRangeToJpeg_8_c: 37.8
chrRangeToJpeg_8_sse2: 10.5
chrRangeToJpeg_8_avx2: 14.0
chrRangeToJpeg_24_c: 114.3
chrRangeToJpeg_24_sse2: 23.5
chrRangeToJpeg_24_avx2: 16.3
chrRangeToJpeg_128_c: 633.5
chrRangeToJpeg_128_sse2: 107.5
chrRangeToJpeg_128_avx2: 55.0
chrRangeToJpeg_144_c: 758.3
chrRangeToJpeg_144_sse2: 132.0
chrRangeToJpeg_144_avx2: 64.5
chrRangeToJpeg_256_c: 1345.0
chrRangeToJpeg_256_sse2: 218.0
chrRangeToJpeg_256_avx2: 105.3
chrRangeToJpeg_512_c: 2524.0
chrRangeToJpeg_512_sse2: 417.0
chrRangeToJpeg_512_avx2: 218.8
lumRangeFromJpeg_8_c: 11.8
lumRangeFromJpeg_8_sse2: 11.0
lumRangeFromJpeg_8_avx2: 10.3
lumRangeFromJpeg_24_c: 38.5
lumRangeFromJpeg_24_sse2: 15.5
lumRangeFromJpeg_24_avx2: 12.5
lumRangeFromJpeg_128_c: 232.3
lumRangeFromJpeg_128_sse2: 60.0
lumRangeFromJpeg_128_avx2: 26.8
lumRangeFromJpeg_144_c: 259.5
lumRangeFromJpeg_144_sse2: 65.3
lumRangeFromJpeg_144_avx2: 29.0
lumRangeFromJpeg_256_c: 464.5
lumRangeFromJpeg_256_sse2: 107.5
lumRangeFromJpeg_256_avx2: 54.0
lumRangeFromJpeg_512_c: 897.5
lumRangeFromJpeg_512_sse2: 224.5
lumRangeFromJpeg_512_avx2: 109.8
lumRangeToJpeg_8_c: 17.8
lumRangeToJpeg_8_sse2: 11.0
lumRangeToJpeg_8_avx2: 11.8
lumRangeToJpeg_24_c: 56.3
lumRangeToJpeg_24_sse2: 11.0
lumRangeToJpeg_24_avx2: 12.5
lumRangeToJpeg_128_c: 333.8
lumRangeToJpeg_128_sse2: 53.3
lumRangeToJpeg_128_avx2: 26.5
lumRangeToJpeg_144_c: 375.5
lumRangeToJpeg_144_sse2: 60.8
lumRangeToJpeg_144_avx2: 29.0
lumRangeToJpeg_256_c: 652.0
lumRangeToJpeg_256_sse2: 109.5
lumRangeToJpeg_256_avx2: 53.5
lumRangeToJpeg_512_c: 1284.3
lumRangeToJpeg_512_sse2: 218.0
lumRangeToJpeg_512_avx2: 108.3
---
libswscale/swscale_internal.h | 1 +
libswscale/utils.c | 2 +
libswscale/x86/Makefile | 1 +
libswscale/x86/range_convert.asm | 134 +++++++++++++++++++++++++++++++
libswscale/x86/swscale.c | 35 ++++++++
5 files changed, 173 insertions(+)
create mode 100644 libswscale/x86/range_convert.asm
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 5007dd422f..d5e7b5e71c 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -698,6 +698,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY);
av_cold void ff_sws_init_range_convert(SwsContext *c);
av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 476a24fea5..8dfa57b5ff 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1082,6 +1082,8 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
ff_sws_init_range_convert(c);
#if ARCH_LOONGARCH64
ff_sws_init_range_convert_loongarch(c);
+#elif ARCH_X86
+ ff_sws_init_range_convert_x86(c);
#endif
}
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 68391494be..f00154941d 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -12,6 +12,7 @@ X86ASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \
x86/scale_avx2.o \
+ x86/range_convert.o \
x86/rgb_2_rgb.o \
x86/yuv_2_rgb.o \
x86/yuv2yuvX.o \
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
new file mode 100644
index 0000000000..ae51e9d573
--- /dev/null
+++ b/libswscale/x86/range_convert.asm
@@ -0,0 +1,134 @@
+;******************************************************************************
+;* Copyright (c) 2024 Ramiro Polla
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+chr_to_mult: times 4 dw 4663, 0 ; word pairs (mult, 0); pmaddwd against (sample, 0) pairs gives sample * mult per dword
+chr_to_offset: times 4 dd -9289992 ; dword offset added to every product before the arithmetic right shift
+%define chr_to_shift 12
+
+chr_from_mult: times 4 dw 1799, 0 ; same (mult, 0) layout as chr_to_mult
+chr_from_offset: times 4 dd 4081085
+%define chr_from_shift 11
+
+lum_to_mult: times 4 dw 19077, 0 ; same (mult, 0) layout as chr_to_mult
+lum_to_offset: times 4 dd -39057361
+%define lum_to_shift 14
+
+lum_from_mult: times 4 dw 14071, 0 ; same (mult, 0) layout as chr_to_mult
+lum_from_offset: times 4 dd 33561947
+%define lum_from_shift 14
+
+SECTION .text
+
+; NOTE: there is no need to clamp the input when converting to jpeg range
+; (like we do in the C code) because packssdw will saturate the output.
+
+;-----------------------------------------------------------------------------
+; lumConvertRange
+; In place: dst[i] = signed_saturate16((dst[i] * mult + offset) >> shift)
+; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
+; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
+; Macro arguments: 1 = function name, 2 = multiplier table, 3 = offset table, 4 = shift count
+;-----------------------------------------------------------------------------
+
+%macro LUMCONVERTRANGE 4
+cglobal %1, 2, 2, 7, dst, width
+ shl widthd, 1 ; sample count -> byte count (int16_t samples)
+ VBROADCASTI128 m4, [%2] ; m4 = (mult, 0) word pairs
+ VBROADCASTI128 m5, [%3] ; m5 = dword offsets
+ pxor m6, m6 ; m6 = 0, interleaved with the samples below
+ add dstq, widthq ; dstq = end of the row
+ neg widthq ; widthq = negative byte offset that counts up to 0
+.loop:
+ movu m0, [dstq+widthq]
+ punpckhwd m1, m0, m6 ; (sample, 0) pairs from the upper four words of each 128-bit lane
+ punpcklwd m0, m6 ; (sample, 0) pairs from the lower four words of each 128-bit lane
+ pmaddwd m0, m4 ; sample * mult + 0 * 0 in every dword
+ pmaddwd m1, m4
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, %4
+ psrad m1, %4
+ packssdw m0, m1 ; back to int16 with signed saturation, original sample order
+ movu [dstq+widthq], m0 ; the result overwrites the input
+ add widthq, mmsize ; whole vectors only: a partial tail is read and written past width (presumably padded by the caller, confirm)
+ jl .loop ; continue while the offset is still negative
+ RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; chrConvertRange
+; In place, on both planes: dst[i] = signed_saturate16((dst[i] * mult + offset) >> shift)
+; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
+; Macro arguments: 1 = function name, 2 = multiplier table, 3 = offset table, 4 = shift count
+;-----------------------------------------------------------------------------
+
+%macro CHRCONVERTRANGE 4
+cglobal %1, 3, 3, 7, dstU, dstV, width
+ shl widthd, 1 ; sample count -> byte count (int16_t samples)
+ VBROADCASTI128 m4, [%2] ; m4 = (mult, 0) word pairs
+ VBROADCASTI128 m5, [%3] ; m5 = dword offsets
+ pxor m6, m6 ; m6 = 0, interleaved with the samples below
+ add dstUq, widthq ; both pointers = end of their row
+ add dstVq, widthq
+ neg widthq ; widthq = negative byte offset shared by both planes
+.loop:
+ movu m0, [dstUq+widthq] ; m0/m1 carry U, m2/m3 carry V
+ movu m2, [dstVq+widthq]
+ punpckhwd m1, m0, m6 ; (sample, 0) pairs from the upper four words of each 128-bit lane
+ punpckhwd m3, m2, m6
+ punpcklwd m0, m6 ; (sample, 0) pairs from the lower four words of each 128-bit lane
+ punpcklwd m2, m6
+ pmaddwd m0, m4 ; sample * mult + 0 * 0 in every dword
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ pmaddwd m3, m4
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ paddd m3, m5
+ psrad m0, %4
+ psrad m1, %4
+ psrad m2, %4
+ psrad m3, %4
+ packssdw m0, m1 ; back to int16 with signed saturation, original sample order
+ packssdw m2, m3
+ movu [dstUq+widthq], m0 ; the results overwrite the inputs
+ movu [dstVq+widthq], m2
+ add widthq, mmsize ; whole vectors only: a partial tail is read and written past width (presumably padded by the caller, confirm)
+ jl .loop ; continue while the offset is still negative
+ RET
+%endmacro
+
+INIT_XMM sse2
+LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+; The same four functions are instantiated again below under INIT_YMM avx2; swscale.c declares both the _sse2 and the _avx2 variants.
+INIT_YMM avx2
+LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 5a9da23265..ad7f67f90e 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -453,6 +453,39 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
#endif
+#define RANGE_CONVERT_FUNCS(opt) do { /* uses the variable c from the expansion site */ \
+ if (c->dstBpc <= 14) { /* otherwise both pointers keep their current values */ \
+ if (c->srcRange) { /* non-zero srcRange selects the FromJpeg pair */ \
+ c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \
+ c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \
+ } else { /* zero srcRange selects the ToJpeg pair */ \
+ c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \
+ c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \
+ } \
+ } \
+} while (0)
+
+#define RANGE_CONVERT_FUNCS_DECL(opt) /* prototypes of the four range_convert.asm functions for one ISA suffix */ \
+void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \
+void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
+void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \
+void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
+
+RANGE_CONVERT_FUNCS_DECL(sse2);
+RANGE_CONVERT_FUNCS_DECL(avx2);
+
+av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
+{ /* Point c->lumConvertRange / c->chrConvertRange at the x86 SIMD versions allowed by the CPU flags. */
+ if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) { /* only when the ranges differ and the destination is not RGB */
+ int cpu_flags = av_get_cpu_flags();
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) { /* AVX2 is tried first */
+ RANGE_CONVERT_FUNCS(avx2);
+ } else if (EXTERNAL_SSE2(cpu_flags)) { /* SSE2 otherwise */
+ RANGE_CONVERT_FUNCS(sse2);
+ } /* neither: the pointers already stored in c are kept */
+ }
+}
+
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -820,4 +853,6 @@ switch(c->dstBpc){ \
}
#endif
+
+ ff_sws_init_range_convert_x86(c);
}
--
2.30.2
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 {lum, chr}ConvertRange
2024-06-12 14:54 ` Ramiro Polla
@ 2024-06-14 15:46 ` Ramiro Polla
0 siblings, 0 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-14 15:46 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Wed, Jun 12, 2024 at 4:54 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> Hi,
>
> On Tue, Jun 11, 2024 at 8:42 PM James Almer <jamrial@gmail.com> wrote:
> >
> > On 6/11/2024 3:26 PM, Michael Niedermayer wrote:
> > > On Tue, Jun 11, 2024 at 02:28:56PM +0200, Ramiro Polla wrote:
> > >> chrRangeFromJpeg_8_c: 28.7
> > >> chrRangeFromJpeg_8_sse4: 16.2
> > >> chrRangeFromJpeg_24_c: 152.7
> > >> chrRangeFromJpeg_24_sse4: 29.7
> > >> chrRangeFromJpeg_128_c: 366.5
> > >> chrRangeFromJpeg_128_sse4: 233.0
> > >> chrRangeFromJpeg_144_c: 408.0
> > >> chrRangeFromJpeg_144_sse4: 182.5
> > >> chrRangeFromJpeg_256_c: 698.7
> > >> chrRangeFromJpeg_256_sse4: 325.5
> > >> chrRangeFromJpeg_512_c: 1348.7
> > >> chrRangeFromJpeg_512_sse4: 660.2
> > >> chrRangeToJpeg_8_c: 37.7
> > >> chrRangeToJpeg_8_sse4: 16.2
> > >> chrRangeToJpeg_24_c: 115.7
> > >> chrRangeToJpeg_24_sse4: 36.2
> > >> chrRangeToJpeg_128_c: 631.2
> > >> chrRangeToJpeg_128_sse4: 163.7
> > >> chrRangeToJpeg_144_c: 710.7
> > >> chrRangeToJpeg_144_sse4: 183.0
> > >> chrRangeToJpeg_256_c: 1253.0
> > >> chrRangeToJpeg_256_sse4: 343.5
> > >> chrRangeToJpeg_512_c: 2491.2
> > >> chrRangeToJpeg_512_sse4: 654.2
> > >> lumRangeFromJpeg_8_c: 11.7
> > >> lumRangeFromJpeg_8_sse4: 10.5
> > >> lumRangeFromJpeg_24_c: 38.5
> > >> lumRangeFromJpeg_24_sse4: 19.0
> > >> lumRangeFromJpeg_128_c: 237.5
> > >> lumRangeFromJpeg_128_sse4: 79.2
> > >> lumRangeFromJpeg_144_c: 255.7
> > >> lumRangeFromJpeg_144_sse4: 90.5
> > >> lumRangeFromJpeg_256_c: 441.5
> > >> lumRangeFromJpeg_256_sse4: 161.7
> > >> lumRangeFromJpeg_512_c: 879.0
> > >> lumRangeFromJpeg_512_sse4: 333.2
> > >> lumRangeToJpeg_8_c: 20.0
> > >> lumRangeToJpeg_8_sse4: 11.7
> > >> lumRangeToJpeg_24_c: 61.5
> > >> lumRangeToJpeg_24_sse4: 17.7
> > >> lumRangeToJpeg_128_c: 357.5
> > >> lumRangeToJpeg_128_sse4: 80.0
> > >> lumRangeToJpeg_144_c: 371.5
> > >> lumRangeToJpeg_144_sse4: 93.2
> > >> lumRangeToJpeg_256_c: 651.5
> > >> lumRangeToJpeg_256_sse4: 164.5
> > >> lumRangeToJpeg_512_c: 1279.0
> > >> lumRangeToJpeg_512_sse4: 333.7
> > >> ---
> > >> libswscale/swscale_internal.h | 1 +
> > >> libswscale/utils.c | 2 +
> > >> libswscale/x86/Makefile | 1 +
> > >> libswscale/x86/range_convert.asm | 130 +++++++++++++++++++++++++++++++
> > >> libswscale/x86/swscale.c | 36 +++++++++
> > >> 5 files changed, 170 insertions(+)
> > >> create mode 100644 libswscale/x86/range_convert.asm
> > >
> > > breaks x86-32 build
> > >
> > > LD ffmpeg_g
> > > /usr/lib/gcc-cross/i686-linux-gnu/7/../../../../i686-linux-gnu/bin/ld: libswscale/libswscale.a(utils.o): in function `sws_setColorspaceDetails':
> > > ffmpeg/linux32/src/libswscale/utils.c:1086: undefined reference to `ff_sws_init_range_convert_x86'
> > > collect2: error: ld returned 1 exit status
> > > make: *** [Makefile:139: ffmpeg_g] Error 1
> > >
> > > thx
> >
> > The functions are wrapped in ARCH_X86_64 checks for seemingly no reason,
> > so they should be removed in the next iteration.
>
> Fixed.
>
> James walked me through on IRC to optimize and improve the functions
> in a way that they work both with sse2 and avx2. New patch attached.
I'll apply tomorrow if there are no more comments.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH v2 3/4] swscale/x86: add avx2 {lum, chr}ConvertRange
2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
@ 2024-06-11 12:28 ` Ramiro Polla
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon " Ramiro Polla
2024-06-14 15:45 ` [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for " Ramiro Polla
3 siblings, 0 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-11 12:28 UTC (permalink / raw)
To: ffmpeg-devel
chrRangeFromJpeg_8_c: 24.1
chrRangeFromJpeg_8_sse4: 16.1
chrRangeFromJpeg_8_avx2: 19.9
chrRangeFromJpeg_24_c: 72.6
chrRangeFromJpeg_24_sse4: 34.6
chrRangeFromJpeg_24_avx2: 30.9
chrRangeFromJpeg_128_c: 341.1
chrRangeFromJpeg_128_sse4: 160.9
chrRangeFromJpeg_128_avx2: 94.1
chrRangeFromJpeg_144_c: 381.9
chrRangeFromJpeg_144_sse4: 183.6
chrRangeFromJpeg_144_avx2: 108.9
chrRangeFromJpeg_256_c: 646.1
chrRangeFromJpeg_256_sse4: 320.4
chrRangeFromJpeg_256_avx2: 190.6
chrRangeFromJpeg_512_c: 1255.9
chrRangeFromJpeg_512_sse4: 654.1
chrRangeFromJpeg_512_avx2: 392.4
chrRangeToJpeg_8_c: 36.9
chrRangeToJpeg_8_sse4: 13.9
chrRangeToJpeg_8_avx2: 20.6
chrRangeToJpeg_24_c: 113.4
chrRangeToJpeg_24_sse4: 29.6
chrRangeToJpeg_24_avx2: 28.9
chrRangeToJpeg_128_c: 632.1
chrRangeToJpeg_128_sse4: 162.4
chrRangeToJpeg_128_avx2: 94.6
chrRangeToJpeg_144_c: 709.9
chrRangeToJpeg_144_sse4: 183.9
chrRangeToJpeg_144_avx2: 108.1
chrRangeToJpeg_256_c: 2672.9
chrRangeToJpeg_256_sse4: 334.4
chrRangeToJpeg_256_avx2: 190.6
chrRangeToJpeg_512_c: 2500.9
chrRangeToJpeg_512_sse4: 654.1
chrRangeToJpeg_512_avx2: 379.6
lumRangeFromJpeg_8_c: 10.9
lumRangeFromJpeg_8_sse4: 12.4
lumRangeFromJpeg_8_avx2: 17.6
lumRangeFromJpeg_24_c: 38.4
lumRangeFromJpeg_24_sse4: 16.9
lumRangeFromJpeg_24_avx2: 20.6
lumRangeFromJpeg_128_c: 233.6
lumRangeFromJpeg_128_sse4: 79.9
lumRangeFromJpeg_128_avx2: 51.6
lumRangeFromJpeg_144_c: 263.9
lumRangeFromJpeg_144_sse4: 90.1
lumRangeFromJpeg_144_avx2: 57.6
lumRangeFromJpeg_256_c: 436.9
lumRangeFromJpeg_256_sse4: 162.1
lumRangeFromJpeg_256_avx2: 100.6
lumRangeFromJpeg_512_c: 878.4
lumRangeFromJpeg_512_sse4: 335.1
lumRangeFromJpeg_512_avx2: 199.4
lumRangeToJpeg_8_c: 19.1
lumRangeToJpeg_8_sse4: 11.6
lumRangeToJpeg_8_avx2: 17.6
lumRangeToJpeg_24_c: 56.9
lumRangeToJpeg_24_sse4: 17.6
lumRangeToJpeg_24_avx2: 21.4
lumRangeToJpeg_128_c: 335.9
lumRangeToJpeg_128_sse4: 79.1
lumRangeToJpeg_128_avx2: 48.9
lumRangeToJpeg_144_c: 372.9
lumRangeToJpeg_144_sse4: 91.6
lumRangeToJpeg_144_avx2: 55.4
lumRangeToJpeg_256_c: 651.9
lumRangeToJpeg_256_sse4: 163.6
lumRangeToJpeg_256_avx2: 99.1
lumRangeToJpeg_512_c: 1289.9
lumRangeToJpeg_512_sse4: 333.6
lumRangeToJpeg_512_avx2: 211.1
---
libswscale/x86/range_convert.asm | 46 ++++++++++++++++++++++++++------
libswscale/x86/swscale.c | 5 +++-
2 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
index 13983a386b..54c2f64769 100644
--- a/libswscale/x86/range_convert.asm
+++ b/libswscale/x86/range_convert.asm
@@ -22,20 +22,20 @@
SECTION_RODATA
-chr_to_mult: times 4 dd 4663
-chr_to_offset: times 4 dd -9289992
+chr_to_mult: times 8 dd 4663
+chr_to_offset: times 8 dd -9289992
%define chr_to_shift 12
-chr_from_mult: times 4 dd 1799
-chr_from_offset: times 4 dd 4081085
+chr_from_mult: times 8 dd 1799
+chr_from_offset: times 8 dd 4081085
%define chr_from_shift 11
-lum_to_mult: times 4 dd 19077
-lum_to_offset: times 4 dd -39057361
+lum_to_mult: times 8 dd 19077
+lum_to_offset: times 8 dd -39057361
%define lum_to_shift 14
-lum_from_mult: times 4 dd 14071
-lum_from_offset: times 4 dd 33561947
+lum_from_mult: times 8 dd 14071
+lum_from_offset: times 8 dd 33561947
%define lum_from_shift 14
SECTION .text
@@ -66,10 +66,19 @@ cglobal %1, 2, 3, 3, dst, width, x
paddd m1, m5
psrad m0, %4
psrad m1, %4
+%if mmsize == 16
packssdw m0, m0
packssdw m1, m1
movq [dstq+xq*2], m0
movq [dstq+xq*2+mmsize/2], m1
+%else
+ vextracti128 xm7, ym0, 1
+ packssdw xm0, xm7
+ vextracti128 xm7, ym1, 1
+ packssdw xm1, xm7
+ movdqu [dstq+xq*2], xm0
+ movdqu [dstq+xq*2+mmsize/2], xm1
+%endif
add xq, mmsize / 2
cmp xd, widthd
jl .loop
@@ -107,6 +116,7 @@ cglobal %1, 3, 4, 4, dstU, dstV, width, x
psrad m1, %4
psrad m2, %4
psrad m3, %4
+%if mmsize == 16
packssdw m0, m0
packssdw m1, m1
packssdw m2, m2
@@ -115,6 +125,20 @@ cglobal %1, 3, 4, 4, dstU, dstV, width, x
movq [dstUq+xq*2+mmsize/2], m1
movq [dstVq+xq*2], m2
movq [dstVq+xq*2+mmsize/2], m3
+%else
+ vextracti128 xm7, ym0, 1
+ packssdw xm0, xm7
+ vextracti128 xm7, ym1, 1
+ packssdw xm1, xm7
+ vextracti128 xm7, ym2, 1
+ packssdw xm2, xm7
+ vextracti128 xm7, ym3, 1
+ packssdw xm3, xm7
+ movdqu [dstUq+xq*2], xm0
+ movdqu [dstUq+xq*2+mmsize/2], xm1
+ movdqu [dstVq+xq*2], xm2
+ movdqu [dstVq+xq*2+mmsize/2], xm3
+%endif
add xq, mmsize / 2
cmp xd, widthd
jl .loop
@@ -127,4 +151,10 @@ LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
+
+INIT_YMM avx2
+LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
+CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
+LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
+CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 8f477b7b72..704e5f9c85 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -473,12 +473,15 @@ void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \
void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
RANGE_CONVERT_FUNCS_DECL(sse4);
+RANGE_CONVERT_FUNCS_DECL(avx2);
av_cold void ff_sws_init_range_convert_x86(SwsContext *c)
{
if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_SSE4(cpu_flags)) {
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ RANGE_CONVERT_FUNCS(avx2);
+ } else if (EXTERNAL_SSE4(cpu_flags)) {
RANGE_CONVERT_FUNCS(sse4);
}
}
--
2.30.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 2/4] swscale/x86: add sse4 " Ramiro Polla
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 3/4] swscale/x86: add avx2 " Ramiro Polla
@ 2024-06-11 12:28 ` Ramiro Polla
2024-06-18 17:42 ` Ramiro Polla
2024-06-14 15:45 ` [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for " Ramiro Polla
3 siblings, 1 reply; 12+ messages in thread
From: Ramiro Polla @ 2024-06-11 12:28 UTC (permalink / raw)
To: ffmpeg-devel
chrRangeFromJpeg_8_c: 29.2
chrRangeFromJpeg_8_neon: 19.5
chrRangeFromJpeg_24_c: 80.5
chrRangeFromJpeg_24_neon: 34.0
chrRangeFromJpeg_128_c: 413.7
chrRangeFromJpeg_128_neon: 156.0
chrRangeFromJpeg_144_c: 471.0
chrRangeFromJpeg_144_neon: 174.2
chrRangeFromJpeg_256_c: 842.0
chrRangeFromJpeg_256_neon: 305.5
chrRangeFromJpeg_512_c: 1699.0
chrRangeFromJpeg_512_neon: 608.0
chrRangeToJpeg_8_c: 51.7
chrRangeToJpeg_8_neon: 22.7
chrRangeToJpeg_24_c: 149.7
chrRangeToJpeg_24_neon: 38.0
chrRangeToJpeg_128_c: 761.7
chrRangeToJpeg_128_neon: 176.7
chrRangeToJpeg_144_c: 866.2
chrRangeToJpeg_144_neon: 198.7
chrRangeToJpeg_256_c: 1516.5
chrRangeToJpeg_256_neon: 348.7
chrRangeToJpeg_512_c: 3067.2
chrRangeToJpeg_512_neon: 692.7
lumRangeFromJpeg_8_c: 24.0
lumRangeFromJpeg_8_neon: 17.0
lumRangeFromJpeg_24_c: 56.7
lumRangeFromJpeg_24_neon: 21.0
lumRangeFromJpeg_128_c: 294.5
lumRangeFromJpeg_128_neon: 76.7
lumRangeFromJpeg_144_c: 332.5
lumRangeFromJpeg_144_neon: 86.7
lumRangeFromJpeg_256_c: 586.0
lumRangeFromJpeg_256_neon: 152.2
lumRangeFromJpeg_512_c: 1190.0
lumRangeFromJpeg_512_neon: 298.0
lumRangeToJpeg_8_c: 31.7
lumRangeToJpeg_8_neon: 19.5
lumRangeToJpeg_24_c: 83.5
lumRangeToJpeg_24_neon: 24.2
lumRangeToJpeg_128_c: 440.5
lumRangeToJpeg_128_neon: 91.0
lumRangeToJpeg_144_c: 504.2
lumRangeToJpeg_144_neon: 101.0
lumRangeToJpeg_256_c: 879.7
lumRangeToJpeg_256_neon: 177.2
lumRangeToJpeg_512_c: 1794.2
lumRangeToJpeg_512_neon: 354.0
---
libswscale/aarch64/Makefile | 1 +
libswscale/aarch64/range_convert_neon.S | 99 +++++++++++++++++++++++++
libswscale/aarch64/swscale.c | 21 ++++++
libswscale/swscale_internal.h | 1 +
libswscale/utils.c | 4 +-
5 files changed, 125 insertions(+), 1 deletion(-)
create mode 100644 libswscale/aarch64/range_convert_neon.S
diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index adfd90a1b6..37ad960619 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -5,5 +5,6 @@ OBJS += aarch64/rgb2rgb.o \
NEON-OBJS += aarch64/hscale.o \
aarch64/input.o \
aarch64/output.o \
+ aarch64/range_convert_neon.o \
aarch64/rgb2rgb_neon.o \
aarch64/yuv2rgb_neon.o \
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
new file mode 100644
index 0000000000..ea56dc2e32
--- /dev/null
+++ b/libswscale/aarch64/range_convert_neon.S
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro lumConvertRange name, max, mult, offset, shift
+function ff_\name, export=1
+.if \max != 0
+ mov w3, #\max
+ dup v24.8h, w3 // v24 = input clamp, only built when max is non-zero
+.endif
+ mov w3, #\mult
+ dup v25.4s, w3 // v25 = multiplier in every 32-bit lane
+ movz w3, \offset & 0xffff // assemble the 32-bit offset from its two 16-bit halves
+ movk w3, (\offset >> 16) & 0xffff, lsl #16
+ dup v26.4s, w3 // v26 = offset in every 32-bit lane
+1:
+ ld1 {v0.8h}, [x0] // 8 int16 samples from dst (x0), w1 = remaining width
+.if \max != 0
+ smin v0.8h, v0.8h, v24.8h // clamp the input to max
+.endif
+ mov v16.16b, v26.16b // accumulators start at offset
+ mov v18.16b, v26.16b
+ sxtl v20.4s, v0.4h // low 4 samples sign-extended to 32 bits
+ sxtl2 v22.4s, v0.8h // high 4 samples sign-extended to 32 bits
+ mla v16.4s, v20.4s, v25.4s // offset + sample * mult
+ mla v18.4s, v22.4s, v25.4s
+ shrn v0.4h, v16.4s, #\shift // shift right and narrow to 16 bits, without saturation
+ shrn2 v0.8h, v18.4s, #\shift
+ subs w1, w1, #8 // 8 samples consumed per iteration
+ st1 {v0.8h}, [x0], #16 // store in place and advance dst by 16 bytes
+ b.gt 1b // loop while samples remain; a partial tail still gets a full 8-sample pass
+ ret
+endfunc
+.endm
+
+.macro chrConvertRange name, max, mult, offset, shift
+function ff_\name, export=1
+.if \max != 0
+ mov w3, #\max
+ dup v24.8h, w3 // v24 = input clamp, only built when max is non-zero
+.endif
+ mov w3, #\mult
+ dup v25.4s, w3 // v25 = multiplier in every 32-bit lane
+ movz w3, \offset & 0xffff // assemble the 32-bit offset from its two 16-bit halves
+ movk w3, (\offset >> 16) & 0xffff, lsl #16
+ dup v26.4s, w3 // v26 = offset in every 32-bit lane
+1:
+ ld1 {v0.8h}, [x0] // 8 samples from dstU (x0)
+ ld1 {v1.8h}, [x1] // 8 samples from dstV (x1), w2 = remaining width
+.if \max != 0
+ smin v0.8h, v0.8h, v24.8h // clamp both inputs to max
+ smin v1.8h, v1.8h, v24.8h
+.endif
+ mov v16.16b, v26.16b // four accumulators start at offset
+ mov v17.16b, v26.16b
+ mov v18.16b, v26.16b
+ mov v19.16b, v26.16b
+ sxtl v20.4s, v0.4h // low halves of U and V sign-extended to 32 bits
+ sxtl v21.4s, v1.4h
+ sxtl2 v22.4s, v0.8h // high halves of U and V sign-extended to 32 bits
+ sxtl2 v23.4s, v1.8h
+ mla v16.4s, v20.4s, v25.4s // offset + sample * mult
+ mla v17.4s, v21.4s, v25.4s
+ mla v18.4s, v22.4s, v25.4s
+ mla v19.4s, v23.4s, v25.4s
+ shrn v0.4h, v16.4s, #\shift // shift right and narrow to 16 bits, without saturation
+ shrn v1.4h, v17.4s, #\shift
+ shrn2 v0.8h, v18.4s, #\shift
+ shrn2 v1.8h, v19.4s, #\shift
+ subs w2, w2, #8 // 8 samples per plane consumed per iteration
+ st1 {v0.8h}, [x0], #16 // store in place and advance each pointer by 16 bytes
+ st1 {v1.8h}, [x1], #16
+ b.gt 1b // loop while samples remain; a partial tail still gets a full 8-sample pass
+ ret
+endfunc
+.endm
+
+lumConvertRange lumRangeToJpeg_neon, 30189, 19077, -39057361, 14 // arguments: name, max, mult, offset, shift
+chrConvertRange chrRangeToJpeg_neon, 30775, 4663, -9289992, 12
+lumConvertRange lumRangeFromJpeg_neon, 0, 14071, 33561947, 14 // max = 0 leaves out the input clamp
+chrConvertRange chrRangeFromJpeg_neon, 0, 1799, 4081085, 11
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 4c4ea39dc1..e4ea3309ba 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -215,6 +215,26 @@ void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unuse
const uint8_t *src2, int width, uint32_t *rgb2yuv,
void *opq);
+void ff_lumRangeFromJpeg_neon(int16_t *dst, int width); /* the four functions are implemented in range_convert_neon.S */
+void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
+void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+
+av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
+{ /* Point c->lumConvertRange / c->chrConvertRange at the NEON versions; c is left untouched when NEON is not usable. */
+    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+        if (have_neon(av_get_cpu_flags()) && c->dstBpc <= 14) { /* NEON is checked here because sws_setColorspaceDetails() calls this directly */
+            if (c->srcRange) { /* non-zero srcRange selects the FromJpeg pair */
+                c->lumConvertRange = ff_lumRangeFromJpeg_neon;
+                c->chrConvertRange = ff_chrRangeFromJpeg_neon;
+            } else { /* zero srcRange selects the ToJpeg pair */
+                c->lumConvertRange = ff_lumRangeToJpeg_neon;
+                c->chrConvertRange = ff_chrRangeToJpeg_neon;
+            }
+        }
+    }
+}
+
av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -237,5 +257,6 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
default:
break;
}
+ ff_sws_init_range_convert_aarch64(c);
}
}
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index d5e7b5e71c..0818f50c7f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
void ff_updateMMXDitherTables(SwsContext *c, int dstY);
av_cold void ff_sws_init_range_convert(SwsContext *c);
+av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c);
av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 8dfa57b5ff..12dba712c1 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1080,7 +1080,9 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
if (need_reinit) {
ff_sws_init_range_convert(c);
-#if ARCH_LOONGARCH64
+#if ARCH_AARCH64
+ ff_sws_init_range_convert_aarch64(c);
+#elif ARCH_LOONGARCH64
ff_sws_init_range_convert_loongarch(c);
#elif ARCH_X86
ff_sws_init_range_convert_x86(c);
--
2.30.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon " Ramiro Polla
@ 2024-06-18 17:42 ` Ramiro Polla
2024-06-18 21:15 ` Ramiro Polla
0 siblings, 1 reply; 12+ messages in thread
From: Ramiro Polla @ 2024-06-18 17:42 UTC (permalink / raw)
To: ffmpeg-devel
On Tue, Jun 11, 2024 at 2:29 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> chrRangeFromJpeg_8_c: 29.2
> chrRangeFromJpeg_8_neon: 19.5
> chrRangeFromJpeg_24_c: 80.5
> chrRangeFromJpeg_24_neon: 34.0
> chrRangeFromJpeg_128_c: 413.7
> chrRangeFromJpeg_128_neon: 156.0
> chrRangeFromJpeg_144_c: 471.0
> chrRangeFromJpeg_144_neon: 174.2
> chrRangeFromJpeg_256_c: 842.0
> chrRangeFromJpeg_256_neon: 305.5
> chrRangeFromJpeg_512_c: 1699.0
> chrRangeFromJpeg_512_neon: 608.0
> chrRangeToJpeg_8_c: 51.7
> chrRangeToJpeg_8_neon: 22.7
> chrRangeToJpeg_24_c: 149.7
> chrRangeToJpeg_24_neon: 38.0
> chrRangeToJpeg_128_c: 761.7
> chrRangeToJpeg_128_neon: 176.7
> chrRangeToJpeg_144_c: 866.2
> chrRangeToJpeg_144_neon: 198.7
> chrRangeToJpeg_256_c: 1516.5
> chrRangeToJpeg_256_neon: 348.7
> chrRangeToJpeg_512_c: 3067.2
> chrRangeToJpeg_512_neon: 692.7
> lumRangeFromJpeg_8_c: 24.0
> lumRangeFromJpeg_8_neon: 17.0
> lumRangeFromJpeg_24_c: 56.7
> lumRangeFromJpeg_24_neon: 21.0
> lumRangeFromJpeg_128_c: 294.5
> lumRangeFromJpeg_128_neon: 76.7
> lumRangeFromJpeg_144_c: 332.5
> lumRangeFromJpeg_144_neon: 86.7
> lumRangeFromJpeg_256_c: 586.0
> lumRangeFromJpeg_256_neon: 152.2
> lumRangeFromJpeg_512_c: 1190.0
> lumRangeFromJpeg_512_neon: 298.0
> lumRangeToJpeg_8_c: 31.7
> lumRangeToJpeg_8_neon: 19.5
> lumRangeToJpeg_24_c: 83.5
> lumRangeToJpeg_24_neon: 24.2
> lumRangeToJpeg_128_c: 440.5
> lumRangeToJpeg_128_neon: 91.0
> lumRangeToJpeg_144_c: 504.2
> lumRangeToJpeg_144_neon: 101.0
> lumRangeToJpeg_256_c: 879.7
> lumRangeToJpeg_256_neon: 177.2
> lumRangeToJpeg_512_c: 1794.2
> lumRangeToJpeg_512_neon: 354.0
> ---
> libswscale/aarch64/Makefile | 1 +
> libswscale/aarch64/range_convert_neon.S | 99 +++++++++++++++++++++++++
> libswscale/aarch64/swscale.c | 21 ++++++
> libswscale/swscale_internal.h | 1 +
> libswscale/utils.c | 4 +-
> 5 files changed, 125 insertions(+), 1 deletion(-)
> create mode 100644 libswscale/aarch64/range_convert_neon.S
>
> diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> index adfd90a1b6..37ad960619 100644
> --- a/libswscale/aarch64/Makefile
> +++ b/libswscale/aarch64/Makefile
> @@ -5,5 +5,6 @@ OBJS += aarch64/rgb2rgb.o \
> NEON-OBJS += aarch64/hscale.o \
> aarch64/input.o \
> aarch64/output.o \
> + aarch64/range_convert_neon.o \
> aarch64/rgb2rgb_neon.o \
> aarch64/yuv2rgb_neon.o \
> diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
> new file mode 100644
> index 0000000000..ea56dc2e32
> --- /dev/null
> +++ b/libswscale/aarch64/range_convert_neon.S
> @@ -0,0 +1,99 @@
> +/*
> + * Copyright (c) 2024 Ramiro Polla
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro lumConvertRange name, max, mult, offset, shift
> +function ff_\name, export=1
> +.if \max != 0
> + mov w3, #\max
> + dup v24.8h, w3
> +.endif
> + mov w3, #\mult
> + dup v25.4s, w3
> + movz w3, \offset & 0xffff
> + movk w3, (\offset >> 16) & 0xffff, lsl #16
> + dup v26.4s, w3
> +1:
> + ld1 {v0.8h}, [x0]
> +.if \max != 0
> + smin v0.8h, v0.8h, v24.8h
> +.endif
> + mov v16.16b, v26.16b
> + mov v18.16b, v26.16b
> + sxtl v20.4s, v0.4h
> + sxtl2 v22.4s, v0.8h
> + mla v16.4s, v20.4s, v25.4s
> + mla v18.4s, v22.4s, v25.4s
> + shrn v0.4h, v16.4s, #\shift
> + shrn2 v0.8h, v18.4s, #\shift
> + subs w1, w1, #8
> + st1 {v0.8h}, [x0], #16
> + b.gt 1b
> + ret
> +endfunc
> +.endm
> +
> +.macro chrConvertRange name, max, mult, offset, shift
> +function ff_\name, export=1
> +.if \max != 0
> + mov w3, #\max
> + dup v24.8h, w3
> +.endif
> + mov w3, #\mult
> + dup v25.4s, w3
> + movz w3, \offset & 0xffff
> + movk w3, (\offset >> 16) & 0xffff, lsl #16
> + dup v26.4s, w3
> +1:
> + ld1 {v0.8h}, [x0]
> + ld1 {v1.8h}, [x1]
> +.if \max != 0
> + smin v0.8h, v0.8h, v24.8h
> + smin v1.8h, v1.8h, v24.8h
> +.endif
> + mov v16.16b, v26.16b
> + mov v17.16b, v26.16b
> + mov v18.16b, v26.16b
> + mov v19.16b, v26.16b
> + sxtl v20.4s, v0.4h
> + sxtl v21.4s, v1.4h
> + sxtl2 v22.4s, v0.8h
> + sxtl2 v23.4s, v1.8h
> + mla v16.4s, v20.4s, v25.4s
> + mla v17.4s, v21.4s, v25.4s
> + mla v18.4s, v22.4s, v25.4s
> + mla v19.4s, v23.4s, v25.4s
> + shrn v0.4h, v16.4s, #\shift
> + shrn v1.4h, v17.4s, #\shift
> + shrn2 v0.8h, v18.4s, #\shift
> + shrn2 v1.8h, v19.4s, #\shift
> + subs w2, w2, #8
> + st1 {v0.8h}, [x0], #16
> + st1 {v1.8h}, [x1], #16
> + b.gt 1b
> + ret
> +endfunc
> +.endm
> +
> +lumConvertRange lumRangeToJpeg_neon, 30189, 19077, -39057361, 14
> +chrConvertRange chrRangeToJpeg_neon, 30775, 4663, -9289992, 12
> +lumConvertRange lumRangeFromJpeg_neon, 0, 14071, 33561947, 14
> +chrConvertRange chrRangeFromJpeg_neon, 0, 1799, 4081085, 11
> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> index 4c4ea39dc1..e4ea3309ba 100644
> --- a/libswscale/aarch64/swscale.c
> +++ b/libswscale/aarch64/swscale.c
> @@ -215,6 +215,26 @@ void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unuse
> const uint8_t *src2, int width, uint32_t *rgb2yuv,
> void *opq);
>
> +void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
> +void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
> +void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
> +void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
> +
> +av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
> +{
> + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> + if (c->dstBpc <= 14) {
> + if (c->srcRange) {
> + c->lumConvertRange = ff_lumRangeFromJpeg_neon;
> + c->chrConvertRange = ff_chrRangeFromJpeg_neon;
> + } else {
> + c->lumConvertRange = ff_lumRangeToJpeg_neon;
> + c->chrConvertRange = ff_chrRangeToJpeg_neon;
> + }
> + }
> + }
> +}
> +
> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -237,5 +257,6 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> default:
> break;
> }
> + ff_sws_init_range_convert_aarch64(c);
> }
> }
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index d5e7b5e71c..0818f50c7f 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
> void ff_updateMMXDitherTables(SwsContext *c, int dstY);
>
> av_cold void ff_sws_init_range_convert(SwsContext *c);
> +av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c);
> av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
> av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
>
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index 8dfa57b5ff..12dba712c1 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -1080,7 +1080,9 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
>
> if (need_reinit) {
> ff_sws_init_range_convert(c);
> -#if ARCH_LOONGARCH64
> +#if ARCH_AARCH64
> + ff_sws_init_range_convert_aarch64(c);
> +#elif ARCH_LOONGARCH64
> ff_sws_init_range_convert_loongarch(c);
> #elif ARCH_X86
> ff_sws_init_range_convert_x86(c);
> --
> 2.30.2
>
I finally tested this patch with movz/movk/dup instead of ld1 on apple
arm. I'll apply tomorrow if there are no more comments.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon {lum, chr}ConvertRange
2024-06-18 17:42 ` Ramiro Polla
@ 2024-06-18 21:15 ` Ramiro Polla
0 siblings, 0 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-18 21:15 UTC (permalink / raw)
To: ffmpeg-devel
On Tue, Jun 18, 2024 at 7:42 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> On Tue, Jun 11, 2024 at 2:29 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
> >
> > chrRangeFromJpeg_8_c: 29.2
> > chrRangeFromJpeg_8_neon: 19.5
> > chrRangeFromJpeg_24_c: 80.5
> > chrRangeFromJpeg_24_neon: 34.0
> > chrRangeFromJpeg_128_c: 413.7
> > chrRangeFromJpeg_128_neon: 156.0
> > chrRangeFromJpeg_144_c: 471.0
> > chrRangeFromJpeg_144_neon: 174.2
> > chrRangeFromJpeg_256_c: 842.0
> > chrRangeFromJpeg_256_neon: 305.5
> > chrRangeFromJpeg_512_c: 1699.0
> > chrRangeFromJpeg_512_neon: 608.0
> > chrRangeToJpeg_8_c: 51.7
> > chrRangeToJpeg_8_neon: 22.7
> > chrRangeToJpeg_24_c: 149.7
> > chrRangeToJpeg_24_neon: 38.0
> > chrRangeToJpeg_128_c: 761.7
> > chrRangeToJpeg_128_neon: 176.7
> > chrRangeToJpeg_144_c: 866.2
> > chrRangeToJpeg_144_neon: 198.7
> > chrRangeToJpeg_256_c: 1516.5
> > chrRangeToJpeg_256_neon: 348.7
> > chrRangeToJpeg_512_c: 3067.2
> > chrRangeToJpeg_512_neon: 692.7
> > lumRangeFromJpeg_8_c: 24.0
> > lumRangeFromJpeg_8_neon: 17.0
> > lumRangeFromJpeg_24_c: 56.7
> > lumRangeFromJpeg_24_neon: 21.0
> > lumRangeFromJpeg_128_c: 294.5
> > lumRangeFromJpeg_128_neon: 76.7
> > lumRangeFromJpeg_144_c: 332.5
> > lumRangeFromJpeg_144_neon: 86.7
> > lumRangeFromJpeg_256_c: 586.0
> > lumRangeFromJpeg_256_neon: 152.2
> > lumRangeFromJpeg_512_c: 1190.0
> > lumRangeFromJpeg_512_neon: 298.0
> > lumRangeToJpeg_8_c: 31.7
> > lumRangeToJpeg_8_neon: 19.5
> > lumRangeToJpeg_24_c: 83.5
> > lumRangeToJpeg_24_neon: 24.2
> > lumRangeToJpeg_128_c: 440.5
> > lumRangeToJpeg_128_neon: 91.0
> > lumRangeToJpeg_144_c: 504.2
> > lumRangeToJpeg_144_neon: 101.0
> > lumRangeToJpeg_256_c: 879.7
> > lumRangeToJpeg_256_neon: 177.2
> > lumRangeToJpeg_512_c: 1794.2
> > lumRangeToJpeg_512_neon: 354.0
> > ---
> > libswscale/aarch64/Makefile | 1 +
> > libswscale/aarch64/range_convert_neon.S | 99 +++++++++++++++++++++++++
> > libswscale/aarch64/swscale.c | 21 ++++++
> > libswscale/swscale_internal.h | 1 +
> > libswscale/utils.c | 4 +-
> > 5 files changed, 125 insertions(+), 1 deletion(-)
> > create mode 100644 libswscale/aarch64/range_convert_neon.S
> >
> > diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
> > index adfd90a1b6..37ad960619 100644
> > --- a/libswscale/aarch64/Makefile
> > +++ b/libswscale/aarch64/Makefile
> > @@ -5,5 +5,6 @@ OBJS += aarch64/rgb2rgb.o \
> > NEON-OBJS += aarch64/hscale.o \
> > aarch64/input.o \
> > aarch64/output.o \
> > + aarch64/range_convert_neon.o \
> > aarch64/rgb2rgb_neon.o \
> > aarch64/yuv2rgb_neon.o \
> > diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
> > new file mode 100644
> > index 0000000000..ea56dc2e32
> > --- /dev/null
> > +++ b/libswscale/aarch64/range_convert_neon.S
> > @@ -0,0 +1,99 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +.macro lumConvertRange name, max, mult, offset, shift
> > +function ff_\name, export=1
> > +.if \max != 0
> > + mov w3, #\max
> > + dup v24.8h, w3
> > +.endif
> > + mov w3, #\mult
> > + dup v25.4s, w3
> > + movz w3, \offset & 0xffff
> > + movk w3, (\offset >> 16) & 0xffff, lsl #16
> > + dup v26.4s, w3
> > +1:
> > + ld1 {v0.8h}, [x0]
> > +.if \max != 0
> > + smin v0.8h, v0.8h, v24.8h
> > +.endif
> > + mov v16.16b, v26.16b
> > + mov v18.16b, v26.16b
> > + sxtl v20.4s, v0.4h
> > + sxtl2 v22.4s, v0.8h
> > + mla v16.4s, v20.4s, v25.4s
> > + mla v18.4s, v22.4s, v25.4s
> > + shrn v0.4h, v16.4s, #\shift
> > + shrn2 v0.8h, v18.4s, #\shift
> > + subs w1, w1, #8
> > + st1 {v0.8h}, [x0], #16
> > + b.gt 1b
> > + ret
> > +endfunc
> > +.endm
> > +
> > +.macro chrConvertRange name, max, mult, offset, shift
> > +function ff_\name, export=1
> > +.if \max != 0
> > + mov w3, #\max
> > + dup v24.8h, w3
> > +.endif
> > + mov w3, #\mult
> > + dup v25.4s, w3
> > + movz w3, \offset & 0xffff
> > + movk w3, (\offset >> 16) & 0xffff, lsl #16
> > + dup v26.4s, w3
> > +1:
> > + ld1 {v0.8h}, [x0]
> > + ld1 {v1.8h}, [x1]
> > +.if \max != 0
> > + smin v0.8h, v0.8h, v24.8h
> > + smin v1.8h, v1.8h, v24.8h
> > +.endif
> > + mov v16.16b, v26.16b
> > + mov v17.16b, v26.16b
> > + mov v18.16b, v26.16b
> > + mov v19.16b, v26.16b
> > + sxtl v20.4s, v0.4h
> > + sxtl v21.4s, v1.4h
> > + sxtl2 v22.4s, v0.8h
> > + sxtl2 v23.4s, v1.8h
> > + mla v16.4s, v20.4s, v25.4s
> > + mla v17.4s, v21.4s, v25.4s
> > + mla v18.4s, v22.4s, v25.4s
> > + mla v19.4s, v23.4s, v25.4s
> > + shrn v0.4h, v16.4s, #\shift
> > + shrn v1.4h, v17.4s, #\shift
> > + shrn2 v0.8h, v18.4s, #\shift
> > + shrn2 v1.8h, v19.4s, #\shift
> > + subs w2, w2, #8
> > + st1 {v0.8h}, [x0], #16
> > + st1 {v1.8h}, [x1], #16
> > + b.gt 1b
> > + ret
> > +endfunc
> > +.endm
> > +
> > +lumConvertRange lumRangeToJpeg_neon, 30189, 19077, -39057361, 14
> > +chrConvertRange chrRangeToJpeg_neon, 30775, 4663, -9289992, 12
> > +lumConvertRange lumRangeFromJpeg_neon, 0, 14071, 33561947, 14
> > +chrConvertRange chrRangeFromJpeg_neon, 0, 1799, 4081085, 11
> > diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
> > index 4c4ea39dc1..e4ea3309ba 100644
> > --- a/libswscale/aarch64/swscale.c
> > +++ b/libswscale/aarch64/swscale.c
> > @@ -215,6 +215,26 @@ void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unuse
> > const uint8_t *src2, int width, uint32_t *rgb2yuv,
> > void *opq);
> >
> > +void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
> > +void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
> > +void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
> > +void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
> > +
> > +av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
> > +{
> > + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> > + if (c->dstBpc <= 14) {
> > + if (c->srcRange) {
> > + c->lumConvertRange = ff_lumRangeFromJpeg_neon;
> > + c->chrConvertRange = ff_chrRangeFromJpeg_neon;
> > + } else {
> > + c->lumConvertRange = ff_lumRangeToJpeg_neon;
> > + c->chrConvertRange = ff_chrRangeToJpeg_neon;
> > + }
> > + }
> > + }
> > +}
> > +
> > av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> > {
> > int cpu_flags = av_get_cpu_flags();
> > @@ -237,5 +257,6 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> > default:
> > break;
> > }
> > + ff_sws_init_range_convert_aarch64(c);
> > }
> > }
> > diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> > index d5e7b5e71c..0818f50c7f 100644
> > --- a/libswscale/swscale_internal.h
> > +++ b/libswscale/swscale_internal.h
> > @@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
> > void ff_updateMMXDitherTables(SwsContext *c, int dstY);
> >
> > av_cold void ff_sws_init_range_convert(SwsContext *c);
> > +av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c);
> > av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
> > av_cold void ff_sws_init_range_convert_x86(SwsContext *c);
> >
> > diff --git a/libswscale/utils.c b/libswscale/utils.c
> > index 8dfa57b5ff..12dba712c1 100644
> > --- a/libswscale/utils.c
> > +++ b/libswscale/utils.c
> > @@ -1080,7 +1080,9 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
> >
> > if (need_reinit) {
> > ff_sws_init_range_convert(c);
> > -#if ARCH_LOONGARCH64
> > +#if ARCH_AARCH64
> > + ff_sws_init_range_convert_aarch64(c);
> > +#elif ARCH_LOONGARCH64
> > ff_sws_init_range_convert_loongarch(c);
> > #elif ARCH_X86
> > ff_sws_init_range_convert_x86(c);
> > --
> > 2.30.2
> >
>
> I finally tested this patch with movz/movk/dup instead of ld1 on Apple
> ARM. I'll apply it tomorrow if there are no more comments.
Reviewed by Martin on IRC and pushed.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange
2024-06-11 12:28 [FFmpeg-devel] [PATCH v2 1/4] checkasm: add tests for {lum, chr}ConvertRange Ramiro Polla
` (2 preceding siblings ...)
2024-06-11 12:28 ` [FFmpeg-devel] [PATCH v2 4/4] swscale/aarch64: add neon " Ramiro Polla
@ 2024-06-14 15:45 ` Ramiro Polla
3 siblings, 0 replies; 12+ messages in thread
From: Ramiro Polla @ 2024-06-14 15:45 UTC (permalink / raw)
To: ffmpeg-devel
On Tue, Jun 11, 2024 at 2:29 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> ---
> tests/checkasm/Makefile | 2 +-
> tests/checkasm/checkasm.c | 1 +
> tests/checkasm/checkasm.h | 1 +
> tests/checkasm/sw_range_convert.c | 134 ++++++++++++++++++++++++++++++
> 4 files changed, 137 insertions(+), 1 deletion(-)
> create mode 100644 tests/checkasm/sw_range_convert.c
>
> diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
> index 6eb94d10d5..f20732b37a 100644
> --- a/tests/checkasm/Makefile
> +++ b/tests/checkasm/Makefile
> @@ -63,7 +63,7 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o
> CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
>
> # swscale tests
> -SWSCALEOBJS += sw_gbrp.o sw_rgb.o sw_scale.o
> +SWSCALEOBJS += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o
>
> CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS)
>
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index 2329e2e1bc..56232ab1e0 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -251,6 +251,7 @@ static const struct {
> #endif
> #if CONFIG_SWSCALE
> { "sw_gbrp", checkasm_check_sw_gbrp },
> + { "sw_range_convert", checkasm_check_sw_range_convert },
> { "sw_rgb", checkasm_check_sw_rgb },
> { "sw_scale", checkasm_check_sw_scale },
> #endif
> diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
> index 211d7f52e6..e544007b67 100644
> --- a/tests/checkasm/checkasm.h
> +++ b/tests/checkasm/checkasm.h
> @@ -119,6 +119,7 @@ void checkasm_check_rv40dsp(void);
> void checkasm_check_svq1enc(void);
> void checkasm_check_synth_filter(void);
> void checkasm_check_sw_gbrp(void);
> +void checkasm_check_sw_range_convert(void);
> void checkasm_check_sw_rgb(void);
> void checkasm_check_sw_scale(void);
> void checkasm_check_takdsp(void);
> diff --git a/tests/checkasm/sw_range_convert.c b/tests/checkasm/sw_range_convert.c
> new file mode 100644
> index 0000000000..08029103d1
> --- /dev/null
> +++ b/tests/checkasm/sw_range_convert.c
> @@ -0,0 +1,134 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <string.h>
> +
> +#include "libavutil/common.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/mem_internal.h"
> +
> +#include "libswscale/swscale.h"
> +#include "libswscale/swscale_internal.h"
> +
> +#include "checkasm.h"
> +
> +static void check_lumConvertRange(int from)
> +{
> + const char *func_str = from ? "lumRangeFromJpeg" : "lumRangeToJpeg";
> +#define LARGEST_INPUT_SIZE 512
> +#define INPUT_SIZES 6
> + static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
> + struct SwsContext *ctx;
> +
> + LOCAL_ALIGNED_32(int16_t, dst0, [LARGEST_INPUT_SIZE]);
> + LOCAL_ALIGNED_32(int16_t, dst1, [LARGEST_INPUT_SIZE]);
> +
> + declare_func(void, int16_t *dst, int width);
> +
> + ctx = sws_alloc_context();
> + if (sws_init_context(ctx, NULL, NULL) < 0)
> + fail();
> +
> + ctx->srcFormat = from ? AV_PIX_FMT_YUVJ444P : AV_PIX_FMT_YUV444P;
> + ctx->dstFormat = from ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
> + ctx->srcRange = from;
> + ctx->dstRange = !from;
> +
> + for (int dstWi = 0; dstWi < INPUT_SIZES; dstWi++) {
> + int width = input_sizes[dstWi];
> + for (int i = 0; i < width; i++) {
> + uint8_t r = rnd();
> + dst0[i] = (int16_t) r << 7;
> + dst1[i] = (int16_t) r << 7;
> + }
> + ff_sws_init_scale(ctx);
> + if (check_func(ctx->lumConvertRange, "%s_%d", func_str, width)) {
> + call_ref(dst0, width);
> + call_new(dst1, width);
> + if (memcmp(dst0, dst1, width * sizeof(int16_t)))
> + fail();
> + bench_new(dst1, width);
> + }
> + }
> +
> + sws_freeContext(ctx);
> +}
> +#undef LARGEST_INPUT_SIZE
> +#undef INPUT_SIZES
> +
> +static void check_chrConvertRange(int from)
> +{
> + const char *func_str = from ? "chrRangeFromJpeg" : "chrRangeToJpeg";
> +#define LARGEST_INPUT_SIZE 512
> +#define INPUT_SIZES 6
> + static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
> + struct SwsContext *ctx;
> +
> + LOCAL_ALIGNED_32(int16_t, dstU0, [LARGEST_INPUT_SIZE]);
> + LOCAL_ALIGNED_32(int16_t, dstV0, [LARGEST_INPUT_SIZE]);
> + LOCAL_ALIGNED_32(int16_t, dstU1, [LARGEST_INPUT_SIZE]);
> + LOCAL_ALIGNED_32(int16_t, dstV1, [LARGEST_INPUT_SIZE]);
> +
> + declare_func(void, int16_t *dstU, int16_t *dstV, int width);
> +
> + ctx = sws_alloc_context();
> + if (sws_init_context(ctx, NULL, NULL) < 0)
> + fail();
> +
> + ctx->srcFormat = from ? AV_PIX_FMT_YUVJ444P : AV_PIX_FMT_YUV444P;
> + ctx->dstFormat = from ? AV_PIX_FMT_YUV444P : AV_PIX_FMT_YUVJ444P;
> + ctx->srcRange = from;
> + ctx->dstRange = !from;
> +
> + for (int dstWi = 0; dstWi < INPUT_SIZES; dstWi++) {
> + int width = input_sizes[dstWi];
> + for (int i = 0; i < width; i++) {
> + uint8_t r = rnd();
> + dstU0[i] = (int16_t) r << 7;
> + dstV0[i] = (int16_t) r << 7;
> + dstU1[i] = (int16_t) r << 7;
> + dstV1[i] = (int16_t) r << 7;
> + }
> + ff_sws_init_scale(ctx);
> + if (check_func(ctx->chrConvertRange, "%s_%d", func_str, width)) {
> + call_ref(dstU0, dstV0, width);
> + call_new(dstU1, dstV1, width);
> + if (memcmp(dstU0, dstU1, width * sizeof(int16_t)) ||
> + memcmp(dstV0, dstV1, width * sizeof(int16_t)))
> + fail();
> + bench_new(dstU1, dstV1, width);
> + }
> + }
> +
> + sws_freeContext(ctx);
> +}
> +#undef LARGEST_INPUT_SIZE
> +#undef INPUT_SIZES
> +
> +void checkasm_check_sw_range_convert(void)
> +{
> + check_lumConvertRange(1);
> + report("lumRangeFromJpeg");
> + check_chrConvertRange(1);
> + report("chrRangeFromJpeg");
> + check_lumConvertRange(0);
> + report("lumRangeToJpeg");
> + check_chrConvertRange(0);
> + report("chrRangeToJpeg");
> +}
> --
> 2.30.2
>
I'll apply this tomorrow if there are no comments.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 12+ messages in thread