* [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays
@ 2022-11-21 12:44 James Darnley
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register James Darnley
` (6 more replies)
0 siblings, 7 replies; 10+ messages in thread
From: James Darnley @ 2022-11-21 12:44 UTC (permalink / raw)
To: ffmpeg-devel
---
tests/checkasm/v210enc.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tests/checkasm/v210enc.c b/tests/checkasm/v210enc.c
index 9942e08137..9fb8321c25 100644
--- a/tests/checkasm/v210enc.c
+++ b/tests/checkasm/v210enc.c
@@ -72,8 +72,10 @@
randomize_buffers(mask); \
call_ref(y0 + y_offset, u0 + uv_offset, v0 + uv_offset, dst0, width); \
call_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, width); \
- if (memcmp(y0, y1, BUF_SIZE) || memcmp(u0, u1, BUF_SIZE / 2) || \
- memcmp(v0, v1, BUF_SIZE / 2) || memcmp(dst0, dst1, width * 8 / 3)) \
+ if (memcmp(y0, y1, BUF_SIZE * sizeof(type)) \
+ || memcmp(u0, u1, BUF_SIZE * sizeof(type) / 2) \
+ || memcmp(v0, v1, BUF_SIZE * sizeof(type) / 2) \
+ || memcmp(dst0, dst1, width * 8 / 3)) \
fail(); \
bench_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, width); \
} \
--
2.38.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
@ 2022-11-21 12:44 ` James Darnley
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley
` (5 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: James Darnley @ 2022-11-21 12:44 UTC (permalink / raw)
To: ffmpeg-devel
---
libavcodec/x86/v210enc.asm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index afac238ede..c2ad3d72c0 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -62,7 +62,7 @@ SECTION .text
; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
- lea r0, [yq+2*widthq]
+ lea yq, [yq+2*widthq]
add uq, widthq
add vq, widthq
neg widthq
--
2.38.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register James Darnley
@ 2022-11-21 12:44 ` James Darnley
2022-11-21 14:12 ` Andreas Rheinhardt
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 1/5] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
` (4 subsequent siblings)
6 siblings, 1 reply; 10+ messages in thread
From: James Darnley @ 2022-11-21 12:44 UTC (permalink / raw)
To: ffmpeg-devel
avx512 on Skylake-X (Xeon D-2123IT):
1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2
avx512icl on Ice Lake (Xeon Silver 4316):
2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2
---
libavcodec/x86/v210enc.asm | 99 +++++++++++++++++++++++++++++++++++
libavcodec/x86/v210enc_init.c | 12 +++++
2 files changed, 111 insertions(+)
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index c2ad3d72c0..9cee954619 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6
v210enc_8_mult: db 4, 0, 64, 0
v210enc_8_mask: dd 255<<12
+icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
+%assign i 0
+%rep 8
+ db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5
+ %assign i i+6
+%endrep
+
+icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
+%assign i 0
+%rep 4
+ db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1
+ %assign i i+6
+%endrep
+
+icl_perm_y_kmask: times 8 db 0b1111_0110
+icl_perm_uv_kmask: times 8 db 0b0110_1111
+
+icl_shift_y: times 10 dw 2,0,4
+ times 4 db 0 ; padding to 64 bytes
+icl_shift_uv: times 5 dw 0,2,4
+ times 2 db 0 ; padding to 32 bytes
+ times 5 dw 4,0,2
+ times 2 db 0 ; padding to 32 bytes
+
+v210enc_10_permd_y: dd 0,1,2,-1 , 3,4,5,-1
+v210enc_10_shufb_y: db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11
+v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6
+v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1
+ db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1
+
SECTION .text
%macro v210_planar_pack_10 0
@@ -113,6 +143,75 @@ INIT_YMM avx2
v210_planar_pack_10
%endif
+%macro v210_planar_pack_10_new 0
+
+cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, width
+ lea yq, [yq+2*widthq]
+ add uq, widthq
+ add vq, widthq
+ neg widthq
+
+ %if cpuflag(avx512icl)
+ movu m6, [icl_perm_y]
+ movu m7, [icl_perm_uv]
+ kmovq k1, [icl_perm_y_kmask]
+ kmovq k2, [icl_perm_uv_kmask]
+ %else
+ movu m6, [v210enc_10_permd_y]
+ VBROADCASTI128 m7, [v210enc_10_shufb_y]
+ movu m8, [v210enc_10_permd_uv]
+ movu m9, [v210enc_10_shufb_uv]
+ %endif
+ movu m2, [icl_shift_y]
+ movu m3, [icl_shift_uv]
+ VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized
+ VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized
+
+ .loop:
+ movu m0, [yq + widthq*2]
+ %if cpuflag(avx512icl)
+ movu ym1, [uq + widthq*1]
+ vinserti32x8 zm1, [vq + widthq*1], 1
+ %else
+ movu xm1, [uq + widthq*1]
+ vinserti128 ym1, [vq + widthq*1], 1
+ %endif
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+
+ vpsllvw m0, m2
+ vpsllvw m1, m3
+ %if cpuflag(avx512icl)
+ vpermb m0{k1}{z}, m6, m0
+ vpermb m1{k2}{z}, m7, m1
+ %else
+ vpermd m0, m6, m0
+ pshufb m0, m7
+ vpermd m1, m8, m1
+ pshufb m1, m9
+ %endif
+ por m0, m1
+
+ movu [dstq], m0
+ add dstq, mmsize
+ add widthq, (mmsize*3)/8
+ jl .loop
+RET
+
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX512_EXTERNAL
+INIT_YMM avx512
+v210_planar_pack_10_new
+%endif
+%endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+v210_planar_pack_10_new
+%endif
+
%macro v210_planar_pack_8 0
; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width)
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 6e9f8c6e61..5d1ebcb893 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
const uint16_t *v, uint8_t *dst,
ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u,
+ const uint16_t *v, uint8_t *dst,
+ ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u,
+ const uint16_t *v, uint8_t *dst,
+ ptrdiff_t width);
av_cold void ff_v210enc_init_x86(V210EncContext *s)
{
@@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
if (EXTERNAL_AVX512(cpu_flags)) {
s->sample_factor_8 = 2;
s->pack_line_8 = ff_v210_planar_pack_8_avx512;
+#ifdef ARCH_X86_64
+ s->sample_factor_10 = 2;
+ s->pack_line_10 = ff_v210_planar_pack_10_avx512;
+#endif
}
if (EXTERNAL_AVX512ICL(cpu_flags)) {
s->sample_factor_8 = 4;
s->pack_line_8 = ff_v210_planar_pack_8_avx512icl;
+ s->sample_factor_10 = 4;
+ s->pack_line_10 = ff_v210_planar_pack_10_avx512icl;
}
}
--
2.38.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley
@ 2022-11-21 14:12 ` Andreas Rheinhardt
2022-11-21 16:50 ` James Darnley
0 siblings, 1 reply; 10+ messages in thread
From: Andreas Rheinhardt @ 2022-11-21 14:12 UTC (permalink / raw)
To: ffmpeg-devel
James Darnley:
> avx512 on Skylake-X (Xeon D-2123IT):
> 1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2
>
> avx512icl on Ice Lake (Xeon Silver 4316):
> 2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2
> ---
> libavcodec/x86/v210enc.asm | 99 +++++++++++++++++++++++++++++++++++
> libavcodec/x86/v210enc_init.c | 12 +++++
> 2 files changed, 111 insertions(+)
>
> diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
> index c2ad3d72c0..9cee954619 100644
> --- a/libavcodec/x86/v210enc.asm
> +++ b/libavcodec/x86/v210enc.asm
> @@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6
> v210enc_8_mult: db 4, 0, 64, 0
> v210enc_8_mask: dd 255<<12
>
> +icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
> +%assign i 0
> +%rep 8
> + db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5
> + %assign i i+6
> +%endrep
> +
> +icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
> +%assign i 0
> +%rep 4
> + db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1
> + %assign i i+6
> +%endrep
> +
> +icl_perm_y_kmask: times 8 db 0b1111_0110
> +icl_perm_uv_kmask: times 8 db 0b0110_1111
> +
> +icl_shift_y: times 10 dw 2,0,4
> + times 4 db 0 ; padding to 64 bytes
> +icl_shift_uv: times 5 dw 0,2,4
> + times 2 db 0 ; padding to 32 bytes
> + times 5 dw 4,0,2
> + times 2 db 0 ; padding to 32 bytes
> +
> +v210enc_10_permd_y: dd 0,1,2,-1 , 3,4,5,-1
> +v210enc_10_shufb_y: db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11
> +v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6
> +v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1
> + db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1
> +
> SECTION .text
>
> %macro v210_planar_pack_10 0
> @@ -113,6 +143,75 @@ INIT_YMM avx2
> v210_planar_pack_10
> %endif
>
> +%macro v210_planar_pack_10_new 0
> +
> +cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, width
> + lea yq, [yq+2*widthq]
> + add uq, widthq
> + add vq, widthq
> + neg widthq
> +
> + %if cpuflag(avx512icl)
> + movu m6, [icl_perm_y]
> + movu m7, [icl_perm_uv]
> + kmovq k1, [icl_perm_y_kmask]
> + kmovq k2, [icl_perm_uv_kmask]
> + %else
> + movu m6, [v210enc_10_permd_y]
> + VBROADCASTI128 m7, [v210enc_10_shufb_y]
> + movu m8, [v210enc_10_permd_uv]
> + movu m9, [v210enc_10_shufb_uv]
> + %endif
> + movu m2, [icl_shift_y]
> + movu m3, [icl_shift_uv]
> + VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized
> + VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized
> +
> + .loop:
> + movu m0, [yq + widthq*2]
> + %if cpuflag(avx512icl)
> + movu ym1, [uq + widthq*1]
> + vinserti32x8 zm1, [vq + widthq*1], 1
> + %else
> + movu xm1, [uq + widthq*1]
> + vinserti128 ym1, [vq + widthq*1], 1
> + %endif
> + CLIPW m0, m4, m5
> + CLIPW m1, m4, m5
> +
> + vpsllvw m0, m2
> + vpsllvw m1, m3
> + %if cpuflag(avx512icl)
> + vpermb m0{k1}{z}, m6, m0
> + vpermb m1{k2}{z}, m7, m1
> + %else
> + vpermd m0, m6, m0
> + pshufb m0, m7
> + vpermd m1, m8, m1
> + pshufb m1, m9
> + %endif
> + por m0, m1
> +
> + movu [dstq], m0
> + add dstq, mmsize
> + add widthq, (mmsize*3)/8
> + jl .loop
> +RET
> +
> +%endmacro
> +
> +%if ARCH_X86_64
> +%if HAVE_AVX512_EXTERNAL
> +INIT_YMM avx512
> +v210_planar_pack_10_new
> +%endif
> +%endif
> +
> +%if HAVE_AVX512ICL_EXTERNAL
> +INIT_ZMM avx512icl
> +v210_planar_pack_10_new
> +%endif
> +
> %macro v210_planar_pack_8 0
>
> ; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width)
> diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
> index 6e9f8c6e61..5d1ebcb893 100644
> --- a/libavcodec/x86/v210enc_init.c
> +++ b/libavcodec/x86/v210enc_init.c
> @@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
> void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
> const uint16_t *v, uint8_t *dst,
> ptrdiff_t width);
> +void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u,
> + const uint16_t *v, uint8_t *dst,
> + ptrdiff_t width);
> +void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u,
> + const uint16_t *v, uint8_t *dst,
> + ptrdiff_t width);
>
> av_cold void ff_v210enc_init_x86(V210EncContext *s)
> {
> @@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
> if (EXTERNAL_AVX512(cpu_flags)) {
> s->sample_factor_8 = 2;
> s->pack_line_8 = ff_v210_planar_pack_8_avx512;
> +#ifdef ARCH_X86_64
ARCH_X86_64 is always defined (as either 0 or 1), so #ifdef is always true.
So checks of this type need to check with #if, not #ifdef.
> + s->sample_factor_10 = 2;
> + s->pack_line_10 = ff_v210_planar_pack_10_avx512;
> +#endif
> }
>
> if (EXTERNAL_AVX512ICL(cpu_flags)) {
> s->sample_factor_8 = 4;
> s->pack_line_8 = ff_v210_planar_pack_8_avx512icl;
> + s->sample_factor_10 = 4;
> + s->pack_line_10 = ff_v210_planar_pack_10_avx512icl;
> }
> }
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl
2022-11-21 14:12 ` Andreas Rheinhardt
@ 2022-11-21 16:50 ` James Darnley
0 siblings, 0 replies; 10+ messages in thread
From: James Darnley @ 2022-11-21 16:50 UTC (permalink / raw)
To: ffmpeg-devel
> ARCH_X86_64 is always defined. So checks of this type need to check with #if.
Thanks. I forgot the ffmpeg convention there.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 1/5] checkasm/v210enc: test the entire width of 10-bit planar input arrays
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register James Darnley
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley
@ 2022-11-25 15:17 ` James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 2/5] avcodec/x86/v210enc: replace register use with named register James Darnley
` (3 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: James Darnley @ 2022-11-25 15:17 UTC (permalink / raw)
To: ffmpeg-devel
---
tests/checkasm/v210enc.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tests/checkasm/v210enc.c b/tests/checkasm/v210enc.c
index 9942e08137..9fb8321c25 100644
--- a/tests/checkasm/v210enc.c
+++ b/tests/checkasm/v210enc.c
@@ -72,8 +72,10 @@
randomize_buffers(mask); \
call_ref(y0 + y_offset, u0 + uv_offset, v0 + uv_offset, dst0, width); \
call_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, width); \
- if (memcmp(y0, y1, BUF_SIZE) || memcmp(u0, u1, BUF_SIZE / 2) || \
- memcmp(v0, v1, BUF_SIZE / 2) || memcmp(dst0, dst1, width * 8 / 3)) \
+ if (memcmp(y0, y1, BUF_SIZE * sizeof(type)) \
+ || memcmp(u0, u1, BUF_SIZE * sizeof(type) / 2) \
+ || memcmp(v0, v1, BUF_SIZE * sizeof(type) / 2) \
+ || memcmp(dst0, dst1, width * 8 / 3)) \
fail(); \
bench_new(y1 + y_offset, u1 + uv_offset, v1 + uv_offset, dst1, width); \
} \
--
2.38.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 2/5] avcodec/x86/v210enc: replace register use with named register
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
` (2 preceding siblings ...)
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 1/5] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
@ 2022-11-25 15:17 ` James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 3/5] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley
` (2 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: James Darnley @ 2022-11-25 15:17 UTC (permalink / raw)
To: ffmpeg-devel
---
libavcodec/x86/v210enc.asm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index afac238ede..c2ad3d72c0 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -62,7 +62,7 @@ SECTION .text
; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
- lea r0, [yq+2*widthq]
+ lea yq, [yq+2*widthq]
add uq, widthq
add vq, widthq
neg widthq
--
2.38.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 3/5] avcodec/v210enc: add new 10-bit function for avx512 avx512icl
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
` (3 preceding siblings ...)
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 2/5] avcodec/x86/v210enc: replace register use with named register James Darnley
@ 2022-11-25 15:17 ` James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 4/5] avcodec/x86/v210enc: expand and correct comments James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 5/5] avcodec/x86/v210enc: remove unneeded instruction James Darnley
6 siblings, 0 replies; 10+ messages in thread
From: James Darnley @ 2022-11-25 15:17 UTC (permalink / raw)
To: ffmpeg-devel
avx512 on Skylake-X (Xeon D-2123IT):
1.19x faster (970±91.2 vs. 817±104.4 decicycles) compared with avx2
avx512icl on Ice Lake (Xeon Silver 4316):
2.52x faster (1350±5.3 vs. 535±9.5 decicycles) compared with avx2
---
libavcodec/x86/v210enc.asm | 99 +++++++++++++++++++++++++++++++++++
libavcodec/x86/v210enc_init.c | 12 +++++
2 files changed, 111 insertions(+)
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index c2ad3d72c0..552164a8be 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -56,6 +56,36 @@ v210enc_8_permd: dd 0,1,4,5, 1,2,5,6
v210enc_8_mult: db 4, 0, 64, 0
v210enc_8_mask: dd 255<<12
+icl_perm_y: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
+%assign i 0
+%rep 8
+ db -1,i+0,i+1,-1 , i+2,i+3,i+4,i+5
+ %assign i i+6
+%endrep
+
+icl_perm_uv: ; vpermb does not set bytes to zero when the high bit is set unlike pshufb
+%assign i 0
+%rep 4
+ db i+0,i+1,i+32,i+33 , -1,i+2,i+3,-1 , i+34,i+35,i+4,i+5 , -1,i+36,i+37,-1
+ %assign i i+6
+%endrep
+
+icl_perm_y_kmask: times 8 db 0b1111_0110
+icl_perm_uv_kmask: times 8 db 0b0110_1111
+
+icl_shift_y: times 10 dw 2,0,4
+ times 4 db 0 ; padding to 64 bytes
+icl_shift_uv: times 5 dw 0,2,4
+ times 2 db 0 ; padding to 32 bytes
+ times 5 dw 4,0,2
+ times 2 db 0 ; padding to 32 bytes
+
+v210enc_10_permd_y: dd 0,1,2,-1 , 3,4,5,-1
+v210enc_10_shufb_y: db -1,0,1,-1 , 2,3,4,5 , -1,6,7,-1 , 8,9,10,11
+v210enc_10_permd_uv: dd 0,1,4,5 , 1,2,5,6
+v210enc_10_shufb_uv: db 0,1, 8, 9 , -1,2,3,-1 , 10,11,4,5 , -1,12,13,-1
+ db 2,3,10,11 , -1,4,5,-1 , 12,13,6,7 , -1,14,15,-1
+
SECTION .text
%macro v210_planar_pack_10 0
@@ -113,6 +143,75 @@ INIT_YMM avx2
v210_planar_pack_10
%endif
+%macro v210_planar_pack_10_new 0
+
+cglobal v210_planar_pack_10, 5, 5, 8+2*notcpuflag(avx512icl), y, u, v, dst, width
+ lea yq, [yq+2*widthq]
+ add uq, widthq
+ add vq, widthq
+ neg widthq
+
+ %if cpuflag(avx512icl)
+ movu m6, [icl_perm_y]
+ movu m7, [icl_perm_uv]
+ kmovq k1, [icl_perm_y_kmask]
+ kmovq k2, [icl_perm_uv_kmask]
+ %else
+ movu m6, [v210enc_10_permd_y]
+ VBROADCASTI128 m7, [v210enc_10_shufb_y]
+ movu m8, [v210enc_10_permd_uv]
+ movu m9, [v210enc_10_shufb_uv]
+ %endif
+ movu m2, [icl_shift_y]
+ movu m3, [icl_shift_uv]
+ VBROADCASTI128 m4, [v210_enc_min_10] ; only ymm sized
+ VBROADCASTI128 m5, [v210_enc_max_10] ; only ymm sized
+
+ .loop:
+ movu m0, [yq + widthq*2]
+ %if cpuflag(avx512icl)
+ movu ym1, [uq + widthq*1]
+ vinserti32x8 zm1, [vq + widthq*1], 1
+ %else
+ movu xm1, [uq + widthq*1]
+ vinserti128 ym1, [vq + widthq*1], 1
+ %endif
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+
+ vpsllvw m0, m2
+ vpsllvw m1, m3
+ %if cpuflag(avx512icl)
+ vpermb m0{k1}{z}, m6, m0 ; make space for uv where the k-mask sets to zero
+ vpermb m1{k2}{z}, m7, m1 ; interleave uv and make space for y where the k-mask sets to zero
+ %else
+ vpermd m0, m6, m0
+ pshufb m0, m7
+ vpermd m1, m8, m1
+ pshufb m1, m9
+ %endif
+ por m0, m1
+
+ movu [dstq], m0
+ add dstq, mmsize
+ add widthq, (mmsize*3)/8
+ jl .loop
+RET
+
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX512_EXTERNAL
+INIT_YMM avx512
+v210_planar_pack_10_new
+%endif
+%endif
+
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+v210_planar_pack_10_new
+%endif
+
%macro v210_planar_pack_8 0
; v210_planar_pack_8(const uint8_t *y, const uint8_t *u, const uint8_t *v, uint8_t *dst, ptrdiff_t width)
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 6e9f8c6e61..44f22ca7fe 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -37,6 +37,12 @@ void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
void ff_v210_planar_pack_10_avx2(const uint16_t *y, const uint16_t *u,
const uint16_t *v, uint8_t *dst,
ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512(const uint16_t *y, const uint16_t *u,
+ const uint16_t *v, uint8_t *dst,
+ ptrdiff_t width);
+void ff_v210_planar_pack_10_avx512icl(const uint16_t *y, const uint16_t *u,
+ const uint16_t *v, uint8_t *dst,
+ ptrdiff_t width);
av_cold void ff_v210enc_init_x86(V210EncContext *s)
{
@@ -60,10 +66,16 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
if (EXTERNAL_AVX512(cpu_flags)) {
s->sample_factor_8 = 2;
s->pack_line_8 = ff_v210_planar_pack_8_avx512;
+#if ARCH_X86_64
+ s->sample_factor_10 = 2;
+ s->pack_line_10 = ff_v210_planar_pack_10_avx512;
+#endif
}
if (EXTERNAL_AVX512ICL(cpu_flags)) {
s->sample_factor_8 = 4;
s->pack_line_8 = ff_v210_planar_pack_8_avx512icl;
+ s->sample_factor_10 = 4;
+ s->pack_line_10 = ff_v210_planar_pack_10_avx512icl;
}
}
--
2.38.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 4/5] avcodec/x86/v210enc: expand and correct comments
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
` (4 preceding siblings ...)
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 3/5] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley
@ 2022-11-25 15:17 ` James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 5/5] avcodec/x86/v210enc: remove unneeded instruction James Darnley
6 siblings, 0 replies; 10+ messages in thread
From: James Darnley @ 2022-11-25 15:17 UTC (permalink / raw)
To: ffmpeg-devel
---
libavcodec/x86/v210enc.asm | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 552164a8be..d3639cd440 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -314,7 +314,7 @@ cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), y, u, v, dst, width
movu ym1, [yq + 2*widthq]
vinserti32x4 m1, [uq + 1*widthq], 2
vinserti32x4 m1, [vq + 1*widthq], 3
- vpermb m1, m2, m1 ; uyv0 yuy0 vyu0 yvy0
+ vpermb m1, m2, m1 ; uyvx yuyx vyux yvyx
%else
movq xm0, [uq + 1*widthq] ; uuuu uuxx
movq xm1, [vq + 1*widthq] ; vvvv vvxx
@@ -325,10 +325,10 @@ cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), y, u, v, dst, width
%endif
CLIPUB m1, m4, m5
- pmaddubsw m0, m1, m3
- pslld m1, 4
+ pmaddubsw m0, m1, m3 ; shift high and low samples of each dword and mask out other bits
+ pslld m1, 4 ; shift center sample of each dword
%if cpuflag(avx512)
- vpternlogd m0, m1, m6, 0xd8 ; C?B:A
+ vpternlogd m0, m1, m6, 0xd8 ; C?B:A ; merge and mask out bad bits from B
%else
pand m1, m6, m1
pandn m0, m6, m0
--
2.38.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH v2 5/5] avcodec/x86/v210enc: remove unneeded instruction
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
` (5 preceding siblings ...)
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 4/5] avcodec/x86/v210enc: expand and correct comments James Darnley
@ 2022-11-25 15:17 ` James Darnley
6 siblings, 0 replies; 10+ messages in thread
From: James Darnley @ 2022-11-25 15:17 UTC (permalink / raw)
To: ffmpeg-devel
---
libavcodec/x86/v210enc.asm | 1 -
1 file changed, 1 deletion(-)
diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index d3639cd440..daf5f2ab81 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -331,7 +331,6 @@ cglobal v210_planar_pack_8, 5, 5, 7+notcpuflag(avx512icl), y, u, v, dst, width
vpternlogd m0, m1, m6, 0xd8 ; C?B:A ; merge and mask out bad bits from B
%else
pand m1, m6, m1
- pandn m0, m6, m0
por m0, m0, m1
%endif
--
2.38.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2022-11-25 15:20 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-21 12:44 [FFmpeg-devel] [PATCH 1/3] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86/v210: replace register use with named register James Darnley
2022-11-21 12:44 ` [FFmpeg-devel] [PATCH 3/3] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley
2022-11-21 14:12 ` Andreas Rheinhardt
2022-11-21 16:50 ` James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 1/5] checkasm/v210enc: test the entire width of 10-bit planar input arrays James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 2/5] avcodec/x86/v210enc: replace register use with named register James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 3/5] avcodec/v210enc: add new 10-bit function for avx512 avx512icl James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 4/5] avcodec/x86/v210enc: expand and correct comments James Darnley
2022-11-25 15:17 ` [FFmpeg-devel] [PATCH v2 5/5] avcodec/x86/v210enc: remove unneeded instruction James Darnley
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git