* [FFmpeg-devel] [PR] avcodec/x86/vvc: Various improvements (PR #21790)
@ 2026-02-19 1:52 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2026-02-19 1:52 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #21790 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21790
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21790.patch
From 04e99a13d87192cc498cda4eda5f6f7edc649f34 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 17 Feb 2026 15:14:04 +0100
Subject: [PATCH 01/12] avcodec/x86/vvc/mc: Avoid redundant clipping for 8bit
It is already done by packuswb.
Old benchmarks:
avg_8_2x2_c: 11.1 ( 1.00x)
avg_8_2x2_avx2: 8.6 ( 1.28x)
avg_8_4x4_c: 30.0 ( 1.00x)
avg_8_4x4_avx2: 10.8 ( 2.78x)
avg_8_8x8_c: 132.0 ( 1.00x)
avg_8_8x8_avx2: 25.7 ( 5.14x)
avg_8_16x16_c: 254.6 ( 1.00x)
avg_8_16x16_avx2: 33.2 ( 7.67x)
avg_8_32x32_c: 897.5 ( 1.00x)
avg_8_32x32_avx2: 115.6 ( 7.76x)
avg_8_64x64_c: 3316.9 ( 1.00x)
avg_8_64x64_avx2: 626.5 ( 5.29x)
avg_8_128x128_c: 12973.6 ( 1.00x)
avg_8_128x128_avx2: 1914.0 ( 6.78x)
w_avg_8_2x2_c: 16.7 ( 1.00x)
w_avg_8_2x2_avx2: 14.4 ( 1.16x)
w_avg_8_4x4_c: 48.2 ( 1.00x)
w_avg_8_4x4_avx2: 16.5 ( 2.92x)
w_avg_8_8x8_c: 168.1 ( 1.00x)
w_avg_8_8x8_avx2: 49.7 ( 3.38x)
w_avg_8_16x16_c: 392.4 ( 1.00x)
w_avg_8_16x16_avx2: 61.1 ( 6.43x)
w_avg_8_32x32_c: 1455.3 ( 1.00x)
w_avg_8_32x32_avx2: 224.6 ( 6.48x)
w_avg_8_64x64_c: 5632.1 ( 1.00x)
w_avg_8_64x64_avx2: 896.9 ( 6.28x)
w_avg_8_128x128_c: 22136.3 ( 1.00x)
w_avg_8_128x128_avx2: 3626.7 ( 6.10x)
New benchmarks:
avg_8_2x2_c: 12.3 ( 1.00x)
avg_8_2x2_avx2: 8.1 ( 1.52x)
avg_8_4x4_c: 30.3 ( 1.00x)
avg_8_4x4_avx2: 11.3 ( 2.67x)
avg_8_8x8_c: 131.8 ( 1.00x)
avg_8_8x8_avx2: 21.3 ( 6.20x)
avg_8_16x16_c: 255.0 ( 1.00x)
avg_8_16x16_avx2: 30.6 ( 8.33x)
avg_8_32x32_c: 898.5 ( 1.00x)
avg_8_32x32_avx2: 104.9 ( 8.57x)
avg_8_64x64_c: 3317.7 ( 1.00x)
avg_8_64x64_avx2: 540.9 ( 6.13x)
avg_8_128x128_c: 12986.5 ( 1.00x)
avg_8_128x128_avx2: 1663.4 ( 7.81x)
w_avg_8_2x2_c: 16.8 ( 1.00x)
w_avg_8_2x2_avx2: 13.9 ( 1.21x)
w_avg_8_4x4_c: 48.2 ( 1.00x)
w_avg_8_4x4_avx2: 16.2 ( 2.98x)
w_avg_8_8x8_c: 168.6 ( 1.00x)
w_avg_8_8x8_avx2: 46.3 ( 3.64x)
w_avg_8_16x16_c: 392.4 ( 1.00x)
w_avg_8_16x16_avx2: 57.7 ( 6.80x)
w_avg_8_32x32_c: 1454.6 ( 1.00x)
w_avg_8_32x32_avx2: 214.6 ( 6.78x)
w_avg_8_64x64_c: 5638.4 ( 1.00x)
w_avg_8_64x64_avx2: 875.6 ( 6.44x)
w_avg_8_128x128_c: 22133.5 ( 1.00x)
w_avg_8_128x128_avx2: 3334.3 ( 6.64x)
Also saves 550B of .text here. The improvements will likely
be even better on Win64, because it avoids using two nonvolatile
registers in the weighted average case.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/mc.asm | 26 +++++++++++++++++---------
1 file changed, 17 insertions(+), 9 deletions(-)
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 30aa97c65a..a3f858edd8 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -64,12 +64,12 @@ SECTION .text
%rep %3
%define off %%i
AVG_LOAD_W16 0, off
- %2
+ %2 %1
AVG_SAVE_W16 %1, 0, off
AVG_LOAD_W16 1, off
- %2
+ %2 %1
AVG_SAVE_W16 %1, 1, off
%assign %%i %%i+1
@@ -84,7 +84,7 @@ SECTION .text
pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
movd xm1, [src1q]
pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1
- %2
+ %2 %1
AVG_SAVE_W2 %1
AVG_LOOP_END .w2
@@ -93,7 +93,7 @@ SECTION .text
pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1
movq xm1, [src1q]
pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1
- %2
+ %2 %1
AVG_SAVE_W4 %1
AVG_LOOP_END .w4
@@ -103,7 +103,7 @@ SECTION .text
vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
vinserti128 m1, m1, [src1q], 0
vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
- %2
+ %2 %1
AVG_SAVE_W8 %1
AVG_LOOP_END .w8
@@ -132,13 +132,15 @@ SECTION .text
RET
%endmacro
-%macro AVG 0
+%macro AVG 1
paddsw m0, m1
pmulhrsw m0, m2
+%if %1 != 8
CLIPW m0, m3, m4
+%endif
%endmacro
-%macro W_AVG 0
+%macro W_AVG 1
punpckhwd m5, m0, m1
pmaddwd m5, m3
paddd m5, m4
@@ -150,7 +152,9 @@ SECTION .text
psrad m0, xm2
packssdw m0, m5
+%if %1 != 8
CLIPW m0, m6, m7
+%endif
%endmacro
%macro AVG_LOAD_W16 2 ; line, offset
@@ -217,11 +221,13 @@ SECTION .text
;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
%macro VVC_AVG_AVX2 1
-cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
+cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
movifnidn hd, hm
+%if %1 != 8
pxor m3, m3 ; pixel min
vpbroadcastw m4, bdm ; pixel max
+%endif
movifnidn bdd, bdm
inc bdd
@@ -245,7 +251,7 @@ cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
%macro VVC_W_AVG_AVX2 1
-cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
+cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0, t1
movifnidn hd, hm
@@ -255,8 +261,10 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 8, dst, stride, src0, src1, w, h, t0, t1
movd xm3, t0d
vpbroadcastd m3, xm3 ; w0, w1
+%if %1 != 8
pxor m6, m6 ;pixel min
vpbroadcastw m7, r11m ;pixel max
+%endif
mov t1q, rcx ; save ecx
mov ecx, r11m
--
2.52.0
From d54ad72df4f969acb5b23749927a3e1f3e29bf62 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 17 Feb 2026 17:34:49 +0100
Subject: [PATCH 02/12] avcodec/x86/vvc/mc: Avoid pextr[dq],
v{insert,extract}i128
Use mov[dq], movdqu instead if the least significant parts
are set (i.e. if the immediate value is 0x0).
Old benchmarks:
avg_8_2x2_c: 11.3 ( 1.00x)
avg_8_2x2_avx2: 7.5 ( 1.50x)
avg_8_4x4_c: 31.2 ( 1.00x)
avg_8_4x4_avx2: 10.7 ( 2.91x)
avg_8_8x8_c: 133.5 ( 1.00x)
avg_8_8x8_avx2: 21.2 ( 6.30x)
avg_8_16x16_c: 254.7 ( 1.00x)
avg_8_16x16_avx2: 30.1 ( 8.46x)
avg_8_32x32_c: 896.9 ( 1.00x)
avg_8_32x32_avx2: 103.9 ( 8.63x)
avg_8_64x64_c: 3320.7 ( 1.00x)
avg_8_64x64_avx2: 539.4 ( 6.16x)
avg_8_128x128_c: 12991.5 ( 1.00x)
avg_8_128x128_avx2: 1661.3 ( 7.82x)
avg_10_2x2_c: 21.3 ( 1.00x)
avg_10_2x2_avx2: 8.3 ( 2.55x)
avg_10_4x4_c: 34.9 ( 1.00x)
avg_10_4x4_avx2: 10.6 ( 3.28x)
avg_10_8x8_c: 76.3 ( 1.00x)
avg_10_8x8_avx2: 20.2 ( 3.77x)
avg_10_16x16_c: 255.9 ( 1.00x)
avg_10_16x16_avx2: 24.1 (10.60x)
avg_10_32x32_c: 932.4 ( 1.00x)
avg_10_32x32_avx2: 73.3 (12.72x)
avg_10_64x64_c: 3516.4 ( 1.00x)
avg_10_64x64_avx2: 601.7 ( 5.84x)
avg_10_128x128_c: 13690.6 ( 1.00x)
avg_10_128x128_avx2: 1613.2 ( 8.49x)
avg_12_2x2_c: 14.0 ( 1.00x)
avg_12_2x2_avx2: 8.3 ( 1.67x)
avg_12_4x4_c: 35.3 ( 1.00x)
avg_12_4x4_avx2: 10.9 ( 3.26x)
avg_12_8x8_c: 76.5 ( 1.00x)
avg_12_8x8_avx2: 20.3 ( 3.77x)
avg_12_16x16_c: 256.7 ( 1.00x)
avg_12_16x16_avx2: 24.1 (10.63x)
avg_12_32x32_c: 932.5 ( 1.00x)
avg_12_32x32_avx2: 73.3 (12.72x)
avg_12_64x64_c: 3520.5 ( 1.00x)
avg_12_64x64_avx2: 602.6 ( 5.84x)
avg_12_128x128_c: 13689.6 ( 1.00x)
avg_12_128x128_avx2: 1613.1 ( 8.49x)
w_avg_8_2x2_c: 16.7 ( 1.00x)
w_avg_8_2x2_avx2: 13.4 ( 1.25x)
w_avg_8_4x4_c: 44.5 ( 1.00x)
w_avg_8_4x4_avx2: 15.9 ( 2.81x)
w_avg_8_8x8_c: 166.1 ( 1.00x)
w_avg_8_8x8_avx2: 45.7 ( 3.63x)
w_avg_8_16x16_c: 392.9 ( 1.00x)
w_avg_8_16x16_avx2: 57.8 ( 6.80x)
w_avg_8_32x32_c: 1455.5 ( 1.00x)
w_avg_8_32x32_avx2: 215.0 ( 6.77x)
w_avg_8_64x64_c: 5621.8 ( 1.00x)
w_avg_8_64x64_avx2: 875.2 ( 6.42x)
w_avg_8_128x128_c: 22131.3 ( 1.00x)
w_avg_8_128x128_avx2: 3390.1 ( 6.53x)
w_avg_10_2x2_c: 18.0 ( 1.00x)
w_avg_10_2x2_avx2: 14.0 ( 1.28x)
w_avg_10_4x4_c: 53.9 ( 1.00x)
w_avg_10_4x4_avx2: 15.9 ( 3.40x)
w_avg_10_8x8_c: 109.5 ( 1.00x)
w_avg_10_8x8_avx2: 40.4 ( 2.71x)
w_avg_10_16x16_c: 395.7 ( 1.00x)
w_avg_10_16x16_avx2: 44.7 ( 8.86x)
w_avg_10_32x32_c: 1532.7 ( 1.00x)
w_avg_10_32x32_avx2: 142.4 (10.77x)
w_avg_10_64x64_c: 6007.7 ( 1.00x)
w_avg_10_64x64_avx2: 745.5 ( 8.06x)
w_avg_10_128x128_c: 23719.7 ( 1.00x)
w_avg_10_128x128_avx2: 2217.7 (10.70x)
w_avg_12_2x2_c: 18.9 ( 1.00x)
w_avg_12_2x2_avx2: 13.6 ( 1.38x)
w_avg_12_4x4_c: 47.5 ( 1.00x)
w_avg_12_4x4_avx2: 15.9 ( 2.99x)
w_avg_12_8x8_c: 109.3 ( 1.00x)
w_avg_12_8x8_avx2: 40.9 ( 2.67x)
w_avg_12_16x16_c: 395.6 ( 1.00x)
w_avg_12_16x16_avx2: 44.8 ( 8.84x)
w_avg_12_32x32_c: 1531.0 ( 1.00x)
w_avg_12_32x32_avx2: 141.8 (10.80x)
w_avg_12_64x64_c: 6016.7 ( 1.00x)
w_avg_12_64x64_avx2: 732.8 ( 8.21x)
w_avg_12_128x128_c: 23762.2 ( 1.00x)
w_avg_12_128x128_avx2: 2223.4 (10.69x)
New benchmarks:
avg_8_2x2_c: 11.3 ( 1.00x)
avg_8_2x2_avx2: 7.6 ( 1.49x)
avg_8_4x4_c: 31.2 ( 1.00x)
avg_8_4x4_avx2: 10.8 ( 2.89x)
avg_8_8x8_c: 131.6 ( 1.00x)
avg_8_8x8_avx2: 15.6 ( 8.42x)
avg_8_16x16_c: 255.3 ( 1.00x)
avg_8_16x16_avx2: 27.9 ( 9.16x)
avg_8_32x32_c: 897.9 ( 1.00x)
avg_8_32x32_avx2: 81.2 (11.06x)
avg_8_64x64_c: 3320.0 ( 1.00x)
avg_8_64x64_avx2: 335.1 ( 9.91x)
avg_8_128x128_c: 12999.1 ( 1.00x)
avg_8_128x128_avx2: 1456.3 ( 8.93x)
avg_10_2x2_c: 12.0 ( 1.00x)
avg_10_2x2_avx2: 8.6 ( 1.40x)
avg_10_4x4_c: 34.9 ( 1.00x)
avg_10_4x4_avx2: 9.7 ( 3.61x)
avg_10_8x8_c: 76.7 ( 1.00x)
avg_10_8x8_avx2: 16.3 ( 4.69x)
avg_10_16x16_c: 256.3 ( 1.00x)
avg_10_16x16_avx2: 25.2 (10.18x)
avg_10_32x32_c: 932.8 ( 1.00x)
avg_10_32x32_avx2: 73.3 (12.72x)
avg_10_64x64_c: 3518.8 ( 1.00x)
avg_10_64x64_avx2: 416.8 ( 8.44x)
avg_10_128x128_c: 13691.6 ( 1.00x)
avg_10_128x128_avx2: 1612.9 ( 8.49x)
avg_12_2x2_c: 14.1 ( 1.00x)
avg_12_2x2_avx2: 8.7 ( 1.62x)
avg_12_4x4_c: 35.7 ( 1.00x)
avg_12_4x4_avx2: 9.7 ( 3.68x)
avg_12_8x8_c: 77.0 ( 1.00x)
avg_12_8x8_avx2: 16.9 ( 4.57x)
avg_12_16x16_c: 256.2 ( 1.00x)
avg_12_16x16_avx2: 25.7 ( 9.96x)
avg_12_32x32_c: 933.5 ( 1.00x)
avg_12_32x32_avx2: 74.0 (12.62x)
avg_12_64x64_c: 3516.4 ( 1.00x)
avg_12_64x64_avx2: 408.7 ( 8.60x)
avg_12_128x128_c: 13691.6 ( 1.00x)
avg_12_128x128_avx2: 1613.8 ( 8.48x)
w_avg_8_2x2_c: 16.7 ( 1.00x)
w_avg_8_2x2_avx2: 14.0 ( 1.19x)
w_avg_8_4x4_c: 48.2 ( 1.00x)
w_avg_8_4x4_avx2: 16.1 ( 3.00x)
w_avg_8_8x8_c: 168.0 ( 1.00x)
w_avg_8_8x8_avx2: 22.5 ( 7.47x)
w_avg_8_16x16_c: 392.5 ( 1.00x)
w_avg_8_16x16_avx2: 47.9 ( 8.19x)
w_avg_8_32x32_c: 1453.7 ( 1.00x)
w_avg_8_32x32_avx2: 176.1 ( 8.26x)
w_avg_8_64x64_c: 5631.4 ( 1.00x)
w_avg_8_64x64_avx2: 690.8 ( 8.15x)
w_avg_8_128x128_c: 22139.5 ( 1.00x)
w_avg_8_128x128_avx2: 2742.4 ( 8.07x)
w_avg_10_2x2_c: 18.1 ( 1.00x)
w_avg_10_2x2_avx2: 13.8 ( 1.31x)
w_avg_10_4x4_c: 47.0 ( 1.00x)
w_avg_10_4x4_avx2: 16.4 ( 2.87x)
w_avg_10_8x8_c: 110.0 ( 1.00x)
w_avg_10_8x8_avx2: 21.6 ( 5.09x)
w_avg_10_16x16_c: 395.2 ( 1.00x)
w_avg_10_16x16_avx2: 45.4 ( 8.71x)
w_avg_10_32x32_c: 1533.8 ( 1.00x)
w_avg_10_32x32_avx2: 142.6 (10.76x)
w_avg_10_64x64_c: 6004.4 ( 1.00x)
w_avg_10_64x64_avx2: 672.8 ( 8.92x)
w_avg_10_128x128_c: 23748.5 ( 1.00x)
w_avg_10_128x128_avx2: 2198.0 (10.80x)
w_avg_12_2x2_c: 17.2 ( 1.00x)
w_avg_12_2x2_avx2: 13.9 ( 1.24x)
w_avg_12_4x4_c: 51.4 ( 1.00x)
w_avg_12_4x4_avx2: 16.5 ( 3.11x)
w_avg_12_8x8_c: 109.1 ( 1.00x)
w_avg_12_8x8_avx2: 22.0 ( 4.96x)
w_avg_12_16x16_c: 395.9 ( 1.00x)
w_avg_12_16x16_avx2: 44.9 ( 8.81x)
w_avg_12_32x32_c: 1533.5 ( 1.00x)
w_avg_12_32x32_avx2: 142.3 (10.78x)
w_avg_12_64x64_c: 6002.0 ( 1.00x)
w_avg_12_64x64_avx2: 557.5 (10.77x)
w_avg_12_128x128_c: 23749.5 ( 1.00x)
w_avg_12_128x128_avx2: 2202.0 (10.79x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/mc.asm | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index a3f858edd8..4fb5a19761 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -99,9 +99,9 @@ SECTION .text
AVG_LOOP_END .w4
.w8:
- vinserti128 m0, m0, [src0q], 0
+ movu xm0, [src0q]
+ movu xm1, [src1q]
vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
- vinserti128 m1, m1, [src1q], 0
vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
%2 %1
AVG_SAVE_W8 %1
@@ -164,7 +164,7 @@ SECTION .text
%macro AVG_SAVE_W2 1 ;bpc
%if %1 == 16
- pextrd [dstq], xm0, 0
+ movd [dstq], xm0
pextrd [dstq + strideq], xm0, 1
%else
packuswb m0, m0
@@ -175,23 +175,23 @@ SECTION .text
%macro AVG_SAVE_W4 1 ;bpc
%if %1 == 16
- pextrq [dstq], xm0, 0
+ movq [dstq], xm0
pextrq [dstq + strideq], xm0, 1
%else
packuswb m0, m0
- pextrd [dstq], xm0, 0
+ movd [dstq], xm0
pextrd [dstq + strideq], xm0, 1
%endif
%endmacro
%macro AVG_SAVE_W8 1 ;bpc
%if %1 == 16
- vextracti128 [dstq], m0, 0
+ movu [dstq], xm0
vextracti128 [dstq + strideq], m0, 1
%else
packuswb m0, m0
vpermq m0, m0, 1000b
- pextrq [dstq], xm0, 0
+ movq [dstq], xm0
pextrq [dstq + strideq], xm0, 1
%endif
%endmacro
@@ -202,7 +202,7 @@ SECTION .text
%else
packuswb m0, m0
vpermq m0, m0, 1000b
- vextracti128 [dstq + %2 * strideq + %3 * 16], m0, 0
+ movu [dstq + %2 * strideq + %3 * 16], xm0
%endif
%endmacro
--
2.52.0
From 6707b96cfd42d3f71b6f73bc2203b0fd611fa766 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 17 Feb 2026 19:10:18 +0100
Subject: [PATCH 03/12] avcodec/x86/vvc/mc: Avoid ymm registers where possible
Widths 2 and 4 fit into xmm registers.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/mc.asm | 2 ++
1 file changed, 2 insertions(+)
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 4fb5a19761..640e7d1d12 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -79,6 +79,7 @@ SECTION .text
%macro AVG_FN 2 ; bpc, op
jmp wq
+INIT_XMM cpuname
.w2:
movd xm0, [src0q]
pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
@@ -98,6 +99,7 @@ SECTION .text
AVG_LOOP_END .w4
+INIT_YMM cpuname
.w8:
movu xm0, [src0q]
movu xm1, [src1q]
--
2.52.0
From 9937ba4b518e6dc9b61defbc71c528ebb1a2bde7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 17 Feb 2026 19:23:27 +0100
Subject: [PATCH 04/12] avcodec/x86/vvc/mc: Avoid unused work
The high quadword of these registers is zero for width 2.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/mc.asm | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 640e7d1d12..a592218e96 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -64,12 +64,12 @@ SECTION .text
%rep %3
%define off %%i
AVG_LOAD_W16 0, off
- %2 %1
+ %2 %1, 16
AVG_SAVE_W16 %1, 0, off
AVG_LOAD_W16 1, off
- %2 %1
+ %2 %1, 16
AVG_SAVE_W16 %1, 1, off
%assign %%i %%i+1
@@ -85,7 +85,7 @@ INIT_XMM cpuname
pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
movd xm1, [src1q]
pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1
- %2 %1
+ %2 %1, 2
AVG_SAVE_W2 %1
AVG_LOOP_END .w2
@@ -94,7 +94,7 @@ INIT_XMM cpuname
pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1
movq xm1, [src1q]
pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1
- %2 %1
+ %2 %1, 4
AVG_SAVE_W4 %1
AVG_LOOP_END .w4
@@ -105,7 +105,7 @@ INIT_YMM cpuname
movu xm1, [src1q]
vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
- %2 %1
+ %2 %1, 8
AVG_SAVE_W8 %1
AVG_LOOP_END .w8
@@ -134,7 +134,7 @@ INIT_YMM cpuname
RET
%endmacro
-%macro AVG 1
+%macro AVG 2 ; bpc, width
paddsw m0, m1
pmulhrsw m0, m2
%if %1 != 8
@@ -142,18 +142,24 @@ INIT_YMM cpuname
%endif
%endmacro
-%macro W_AVG 1
+%macro W_AVG 2 ; bpc, width
+%if %2 > 2
punpckhwd m5, m0, m1
pmaddwd m5, m3
paddd m5, m4
psrad m5, xm2
+%endif
punpcklwd m0, m0, m1
pmaddwd m0, m3
paddd m0, m4
psrad m0, xm2
+%if %2 == 2
+ packssdw m0, m0
+%else
packssdw m0, m5
+%endif
%if %1 != 8
CLIPW m0, m6, m7
%endif
--
2.52.0
From 5eb22539905f49120418c0697afe5c9660589105 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 17 Feb 2026 20:03:42 +0100
Subject: [PATCH 05/12] avcodec/x86/vvc/mc: Remove unused constants
Also avoid overaligning .rodata.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/mc.asm | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index a592218e96..539a5a4bb3 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -29,16 +29,14 @@
%define MAX_PB_SIZE 128
-SECTION_RODATA 32
+SECTION_RODATA
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
pw_0 times 2 dw 0
-pw_1 times 2 dw 1
pw_4 times 2 dw 4
-pw_12 times 2 dw 12
pw_256 times 2 dw 256
%macro AVG_JMP_TABLE 3-*
--
2.52.0
From d7a40e38d834dacef12896830bb7ab3d5bd0b76c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 17 Feb 2026 20:27:51 +0100
Subject: [PATCH 06/12] avcodec/x86/vvc/mc: Remove always-false branches
The C versions of the average and weighted average functions
contain "FFMAX(3, 15 - BIT_DEPTH)" and the code here followed
this; yet it is only instantiated for bit depths 8, 10 and 12,
for which the above is just 15-BIT_DEPTH. So the comparisons
are unnecessary.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/mc.asm | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 539a5a4bb3..3272765b57 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -35,8 +35,6 @@ SECTION_RODATA
%if HAVE_AVX2_EXTERNAL
-pw_0 times 2 dw 0
-pw_4 times 2 dw 4
pw_256 times 2 dw 256
%macro AVG_JMP_TABLE 3-*
@@ -241,8 +239,6 @@ cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
sub bdd, 8
movd xm0, bdd
- vpbroadcastd m1, [pw_4]
- pminuw m0, m1
vpbroadcastd m2, [pw_256]
psllw m2, xm0 ; shift
@@ -283,9 +279,7 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0,
inc t0d ;((o0 + o1) << (BIT_DEPTH - 8)) + 1
neg ecx
- add ecx, 4 ; bd - 12
- cmovl ecx, [pw_0]
- add ecx, 3
+ add ecx, 7
add ecx, r6m
movd xm2, ecx ; shift
--
2.52.0
From d171a6ae712f06f2e184cb6071043a844794e5ff Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Tue, 17 Feb 2026 23:00:30 +0100
Subject: [PATCH 07/12] avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers
Up until now, there were two averaging assembly functions,
one for eight bit content and one for <=16 bit content;
there are also three C-wrappers around these functions,
for 8, 10 and 12 bpp. These wrappers simply forward the
maximum permissible value (i.e. (1<<bpp)-1) and promote
some integer values to ptrdiff_t.
Yet these wrappers are absolutely useless: The assembly functions
rederive the bpp from the maximum and only the integer part
of the promoted ptrdiff_t values is ever used. Of course,
these wrappers also entail an additional call (not a tail call,
because the additional maximum parameter is passed on the stack).
Remove the wrappers and add per-bpp assembly functions instead.
Given that the only difference between 10 and 12 bits is some
constants in registers, the main part of these functions can be
shared (given that this code uses a jumptable, it can even
be done without adding any additional jump).
Old benchmarks:
avg_8_2x2_c: 11.4 ( 1.00x)
avg_8_2x2_avx2: 7.9 ( 1.44x)
avg_8_4x4_c: 30.7 ( 1.00x)
avg_8_4x4_avx2: 10.4 ( 2.95x)
avg_8_8x8_c: 134.5 ( 1.00x)
avg_8_8x8_avx2: 16.6 ( 8.12x)
avg_8_16x16_c: 255.6 ( 1.00x)
avg_8_16x16_avx2: 28.2 ( 9.07x)
avg_8_32x32_c: 897.7 ( 1.00x)
avg_8_32x32_avx2: 83.9 (10.70x)
avg_8_64x64_c: 3320.0 ( 1.00x)
avg_8_64x64_avx2: 321.1 (10.34x)
avg_8_128x128_c: 12981.8 ( 1.00x)
avg_8_128x128_avx2: 1480.1 ( 8.77x)
avg_10_2x2_c: 12.0 ( 1.00x)
avg_10_2x2_avx2: 8.4 ( 1.43x)
avg_10_4x4_c: 34.9 ( 1.00x)
avg_10_4x4_avx2: 9.8 ( 3.56x)
avg_10_8x8_c: 76.8 ( 1.00x)
avg_10_8x8_avx2: 15.1 ( 5.08x)
avg_10_16x16_c: 256.6 ( 1.00x)
avg_10_16x16_avx2: 25.1 (10.20x)
avg_10_32x32_c: 932.9 ( 1.00x)
avg_10_32x32_avx2: 73.4 (12.72x)
avg_10_64x64_c: 3517.9 ( 1.00x)
avg_10_64x64_avx2: 414.8 ( 8.48x)
avg_10_128x128_c: 13695.3 ( 1.00x)
avg_10_128x128_avx2: 1648.1 ( 8.31x)
avg_12_2x2_c: 13.1 ( 1.00x)
avg_12_2x2_avx2: 8.6 ( 1.53x)
avg_12_4x4_c: 35.4 ( 1.00x)
avg_12_4x4_avx2: 10.1 ( 3.49x)
avg_12_8x8_c: 76.6 ( 1.00x)
avg_12_8x8_avx2: 16.7 ( 4.60x)
avg_12_16x16_c: 256.6 ( 1.00x)
avg_12_16x16_avx2: 25.5 (10.07x)
avg_12_32x32_c: 933.2 ( 1.00x)
avg_12_32x32_avx2: 75.7 (12.34x)
avg_12_64x64_c: 3519.1 ( 1.00x)
avg_12_64x64_avx2: 416.8 ( 8.44x)
avg_12_128x128_c: 13695.1 ( 1.00x)
avg_12_128x128_avx2: 1651.6 ( 8.29x)
New benchmarks:
avg_8_2x2_c: 11.5 ( 1.00x)
avg_8_2x2_avx2: 6.0 ( 1.91x)
avg_8_4x4_c: 29.7 ( 1.00x)
avg_8_4x4_avx2: 8.0 ( 3.72x)
avg_8_8x8_c: 131.4 ( 1.00x)
avg_8_8x8_avx2: 12.2 (10.74x)
avg_8_16x16_c: 254.3 ( 1.00x)
avg_8_16x16_avx2: 24.8 (10.25x)
avg_8_32x32_c: 897.7 ( 1.00x)
avg_8_32x32_avx2: 77.8 (11.54x)
avg_8_64x64_c: 3321.3 ( 1.00x)
avg_8_64x64_avx2: 318.7 (10.42x)
avg_8_128x128_c: 12988.4 ( 1.00x)
avg_8_128x128_avx2: 1430.1 ( 9.08x)
avg_10_2x2_c: 12.1 ( 1.00x)
avg_10_2x2_avx2: 5.7 ( 2.13x)
avg_10_4x4_c: 35.0 ( 1.00x)
avg_10_4x4_avx2: 9.0 ( 3.88x)
avg_10_8x8_c: 77.2 ( 1.00x)
avg_10_8x8_avx2: 12.4 ( 6.24x)
avg_10_16x16_c: 256.2 ( 1.00x)
avg_10_16x16_avx2: 24.3 (10.56x)
avg_10_32x32_c: 932.9 ( 1.00x)
avg_10_32x32_avx2: 71.9 (12.97x)
avg_10_64x64_c: 3516.8 ( 1.00x)
avg_10_64x64_avx2: 414.7 ( 8.48x)
avg_10_128x128_c: 13693.7 ( 1.00x)
avg_10_128x128_avx2: 1609.3 ( 8.51x)
avg_12_2x2_c: 14.1 ( 1.00x)
avg_12_2x2_avx2: 5.7 ( 2.48x)
avg_12_4x4_c: 35.8 ( 1.00x)
avg_12_4x4_avx2: 9.0 ( 3.96x)
avg_12_8x8_c: 76.9 ( 1.00x)
avg_12_8x8_avx2: 12.4 ( 6.22x)
avg_12_16x16_c: 256.5 ( 1.00x)
avg_12_16x16_avx2: 24.4 (10.50x)
avg_12_32x32_c: 934.1 ( 1.00x)
avg_12_32x32_avx2: 72.0 (12.97x)
avg_12_64x64_c: 3518.2 ( 1.00x)
avg_12_64x64_avx2: 414.8 ( 8.48x)
avg_12_128x128_c: 13689.5 ( 1.00x)
avg_12_128x128_avx2: 1611.1 ( 8.50x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/dsp_init.c | 11 ++-----
libavcodec/x86/vvc/mc.asm | 55 ++++++++++++++++-------------------
2 files changed, 28 insertions(+), 38 deletions(-)
diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index cbcfa40a66..80df8e46ee 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -36,8 +36,6 @@
#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
#define AVG_BPC_PROTOTYPES(bpc, opt) \
-void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
@@ -171,11 +169,6 @@ FW_PUT_16BPC_AVX2(10)
FW_PUT_16BPC_AVX2(12)
#define AVG_FUNCS(bpc, bd, opt) \
-static void bf(vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *src0, const int16_t *src1, int width, int height) \
-{ \
- BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
-} \
static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, int width, int height, \
int denom, int w0, int w1, int o0, int o1) \
@@ -254,7 +247,9 @@ SAO_FILTER_FUNCS(12, avx2)
} while (0)
#define AVG_INIT(bd, opt) do { \
- c->inter.avg = bf(vvc_avg, bd, opt); \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height);\
+ c->inter.avg = bf(ff_vvc_avg, bd, opt); \
c->inter.w_avg = bf(vvc_w_avg, bd, opt); \
} while (0)
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 3272765b57..7599ee2e6a 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -35,23 +35,21 @@ SECTION_RODATA
%if HAVE_AVX2_EXTERNAL
-pw_256 times 2 dw 256
-
-%macro AVG_JMP_TABLE 3-*
- %xdefine %1_%2_%3_table (%%table - 2*%4)
- %xdefine %%base %1_%2_%3_table
- %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
+%macro AVG_JMP_TABLE 4-*
+ %xdefine %1_%2_%4_table (%%table - 2*%5)
+ %xdefine %%base %1_%2_%4_table
+ %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%3_%4)
%%table:
- %rep %0 - 3
- dd %%prefix %+ .w%4 - %%base
+ %rep %0 - 4
+ dd %%prefix %+ .w%5 - %%base
%rotate 1
%endrep
%endmacro
-AVG_JMP_TABLE avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE w_avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE w_avg, 8, 8bpc, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE w_avg, 16, 16bpc, avx2, 2, 4, 8, 16, 32, 64, 128
SECTION .text
@@ -72,9 +70,10 @@ SECTION .text
%endrep
%endmacro
-%macro AVG_FN 2 ; bpc, op
+%macro AVG_FN 2-3 1; bpc, op, instantiate implementation
jmp wq
+%if %3
INIT_XMM cpuname
.w2:
movd xm0, [src0q]
@@ -128,6 +127,7 @@ INIT_YMM cpuname
.ret:
RET
+%endif
%endmacro
%macro AVG 2 ; bpc, width
@@ -222,31 +222,24 @@ INIT_YMM cpuname
%define AVG_SRC_STRIDE MAX_PB_SIZE*2
-;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
-; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
-%macro VVC_AVG_AVX2 1
-cglobal vvc_avg_%1bpc, 4, 7, 3+2*(%1 != 8), dst, stride, src0, src1, w, h, bd
+;void ff_vvc_avg_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0,
+; const int16_t *src1, int width, int height);
+%macro VVC_AVG_AVX2 3
+cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h
movifnidn hd, hm
+ pcmpeqw m2, m2
%if %1 != 8
pxor m3, m3 ; pixel min
- vpbroadcastw m4, bdm ; pixel max
%endif
- movifnidn bdd, bdm
- inc bdd
- tzcnt bdd, bdd ; bit depth
-
- sub bdd, 8
- movd xm0, bdd
- vpbroadcastd m2, [pw_256]
- psllw m2, xm0 ; shift
-
lea r6, [avg_%1 %+ SUFFIX %+ _table]
tzcnt wd, wm
movsxd wq, dword [r6+wq*4]
+ psrlw m4, m2, 16-%2 ; pixel max
+ psubw m2, m4, m2 ; 1 << bpp
add wq, r6
- AVG_FN %1, AVG
+ AVG_FN %1, AVG, %3
%endmacro
;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride,
@@ -298,9 +291,11 @@ cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0,
INIT_YMM avx2
-VVC_AVG_AVX2 16
+VVC_AVG_AVX2 16, 12, 0
-VVC_AVG_AVX2 8
+VVC_AVG_AVX2 16, 10, 1
+
+VVC_AVG_AVX2 8, 8, 1
VVC_W_AVG_AVX2 16
--
2.52.0
From f28ff6702e735e99aa4dba956520a04c37597824 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 19 Feb 2026 00:40:42 +0100
Subject: [PATCH 08/12] avcodec/x86/vvc/mc,dsp_init: Avoid pointless wrappers
for w_avg
They only add overhead (in the form of another function call,
sign-extending some parameters to 64bit (although the upper
bits are not used at all) and rederiving the actual number
of bits (from the maximum value (1<<bpp)-1)).
Old benchmarks:
w_avg_8_2x2_c: 16.4 ( 1.00x)
w_avg_8_2x2_avx2: 12.9 ( 1.27x)
w_avg_8_4x4_c: 48.0 ( 1.00x)
w_avg_8_4x4_avx2: 14.9 ( 3.23x)
w_avg_8_8x8_c: 168.2 ( 1.00x)
w_avg_8_8x8_avx2: 22.4 ( 7.49x)
w_avg_8_16x16_c: 396.5 ( 1.00x)
w_avg_8_16x16_avx2: 47.9 ( 8.28x)
w_avg_8_32x32_c: 1466.3 ( 1.00x)
w_avg_8_32x32_avx2: 172.8 ( 8.48x)
w_avg_8_64x64_c: 5629.3 ( 1.00x)
w_avg_8_64x64_avx2: 678.7 ( 8.29x)
w_avg_8_128x128_c: 22122.4 ( 1.00x)
w_avg_8_128x128_avx2: 2743.5 ( 8.06x)
w_avg_10_2x2_c: 18.7 ( 1.00x)
w_avg_10_2x2_avx2: 13.1 ( 1.43x)
w_avg_10_4x4_c: 50.3 ( 1.00x)
w_avg_10_4x4_avx2: 15.9 ( 3.17x)
w_avg_10_8x8_c: 109.3 ( 1.00x)
w_avg_10_8x8_avx2: 20.6 ( 5.30x)
w_avg_10_16x16_c: 395.5 ( 1.00x)
w_avg_10_16x16_avx2: 44.8 ( 8.83x)
w_avg_10_32x32_c: 1534.2 ( 1.00x)
w_avg_10_32x32_avx2: 141.4 (10.85x)
w_avg_10_64x64_c: 6003.6 ( 1.00x)
w_avg_10_64x64_avx2: 557.4 (10.77x)
w_avg_10_128x128_c: 23722.7 ( 1.00x)
w_avg_10_128x128_avx2: 2205.0 (10.76x)
w_avg_12_2x2_c: 18.6 ( 1.00x)
w_avg_12_2x2_avx2: 13.1 ( 1.42x)
w_avg_12_4x4_c: 52.2 ( 1.00x)
w_avg_12_4x4_avx2: 16.1 ( 3.24x)
w_avg_12_8x8_c: 109.2 ( 1.00x)
w_avg_12_8x8_avx2: 20.6 ( 5.29x)
w_avg_12_16x16_c: 396.1 ( 1.00x)
w_avg_12_16x16_avx2: 45.0 ( 8.81x)
w_avg_12_32x32_c: 1532.6 ( 1.00x)
w_avg_12_32x32_avx2: 142.1 (10.79x)
w_avg_12_64x64_c: 6002.2 ( 1.00x)
w_avg_12_64x64_avx2: 557.3 (10.77x)
w_avg_12_128x128_c: 23748.7 ( 1.00x)
w_avg_12_128x128_avx2: 2206.4 (10.76x)
New benchmarks:
w_avg_8_2x2_c: 16.0 ( 1.00x)
w_avg_8_2x2_avx2: 9.3 ( 1.71x)
w_avg_8_4x4_c: 48.4 ( 1.00x)
w_avg_8_4x4_avx2: 12.4 ( 3.91x)
w_avg_8_8x8_c: 168.7 ( 1.00x)
w_avg_8_8x8_avx2: 21.1 ( 8.00x)
w_avg_8_16x16_c: 394.5 ( 1.00x)
w_avg_8_16x16_avx2: 46.2 ( 8.54x)
w_avg_8_32x32_c: 1456.3 ( 1.00x)
w_avg_8_32x32_avx2: 171.8 ( 8.48x)
w_avg_8_64x64_c: 5636.2 ( 1.00x)
w_avg_8_64x64_avx2: 676.9 ( 8.33x)
w_avg_8_128x128_c: 22129.1 ( 1.00x)
w_avg_8_128x128_avx2: 2734.3 ( 8.09x)
w_avg_10_2x2_c: 18.7 ( 1.00x)
w_avg_10_2x2_avx2: 10.3 ( 1.82x)
w_avg_10_4x4_c: 50.8 ( 1.00x)
w_avg_10_4x4_avx2: 13.4 ( 3.79x)
w_avg_10_8x8_c: 109.7 ( 1.00x)
w_avg_10_8x8_avx2: 20.4 ( 5.38x)
w_avg_10_16x16_c: 395.2 ( 1.00x)
w_avg_10_16x16_avx2: 41.7 ( 9.48x)
w_avg_10_32x32_c: 1535.6 ( 1.00x)
w_avg_10_32x32_avx2: 137.9 (11.13x)
w_avg_10_64x64_c: 6002.1 ( 1.00x)
w_avg_10_64x64_avx2: 548.5 (10.94x)
w_avg_10_128x128_c: 23742.7 ( 1.00x)
w_avg_10_128x128_avx2: 2179.8 (10.89x)
w_avg_12_2x2_c: 18.9 ( 1.00x)
w_avg_12_2x2_avx2: 10.3 ( 1.84x)
w_avg_12_4x4_c: 52.4 ( 1.00x)
w_avg_12_4x4_avx2: 13.4 ( 3.91x)
w_avg_12_8x8_c: 109.2 ( 1.00x)
w_avg_12_8x8_avx2: 20.3 ( 5.39x)
w_avg_12_16x16_c: 396.3 ( 1.00x)
w_avg_12_16x16_avx2: 41.7 ( 9.51x)
w_avg_12_32x32_c: 1532.6 ( 1.00x)
w_avg_12_32x32_avx2: 138.6 (11.06x)
w_avg_12_64x64_c: 5996.7 ( 1.00x)
w_avg_12_64x64_avx2: 549.6 (10.91x)
w_avg_12_128x128_c: 23738.0 ( 1.00x)
w_avg_12_128x128_avx2: 2177.2 (10.90x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/dsp_init.c | 26 ++---------
libavcodec/x86/vvc/mc.asm | 82 ++++++++++++++++++-----------------
2 files changed, 47 insertions(+), 61 deletions(-)
diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index 80df8e46ee..357f4ea8a1 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -35,14 +35,6 @@
#define bf(fn, bd, opt) fn##_##bd##_##opt
#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
-#define AVG_BPC_PROTOTYPES(bpc, opt) \
-void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
- intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
-
-AVG_BPC_PROTOTYPES( 8, avx2)
-AVG_BPC_PROTOTYPES(16, avx2)
-
#define DMVR_PROTOTYPES(bd, opt) \
void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
int height, intptr_t mx, intptr_t my, int width); \
@@ -168,19 +160,6 @@ FW_PUT_AVX2(12)
FW_PUT_16BPC_AVX2(10)
FW_PUT_16BPC_AVX2(12)
-#define AVG_FUNCS(bpc, bd, opt) \
-static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *src0, const int16_t *src1, int width, int height, \
- int denom, int w0, int w1, int o0, int o1) \
-{ \
- BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \
- denom, w0, w1, o0, o1, (1 << bd) - 1); \
-}
-
-AVG_FUNCS(8, 8, avx2)
-AVG_FUNCS(16, 10, avx2)
-AVG_FUNCS(16, 12, avx2)
-
#define ALF_FUNCS(bpc, bd, opt) \
static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
@@ -249,8 +228,11 @@ SAO_FILTER_FUNCS(12, avx2)
#define AVG_INIT(bd, opt) do { \
void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *src0, const int16_t *src1, int width, int height);\
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1); \
c->inter.avg = bf(ff_vvc_avg, bd, opt); \
- c->inter.w_avg = bf(vvc_w_avg, bd, opt); \
+ c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt); \
} while (0)
#define DMVR_INIT(bd) do { \
diff --git a/libavcodec/x86/vvc/mc.asm b/libavcodec/x86/vvc/mc.asm
index 7599ee2e6a..5f19144157 100644
--- a/libavcodec/x86/vvc/mc.asm
+++ b/libavcodec/x86/vvc/mc.asm
@@ -48,8 +48,8 @@ SECTION_RODATA
AVG_JMP_TABLE avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128
AVG_JMP_TABLE avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE w_avg, 8, 8bpc, avx2, 2, 4, 8, 16, 32, 64, 128
-AVG_JMP_TABLE w_avg, 16, 16bpc, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE w_avg, 8, 8, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE w_avg, 16, 10, avx2, 2, 4, 8, 16, 32, 64, 128
SECTION .text
@@ -242,51 +242,53 @@ cglobal vvc_avg_%2, 4, 7, 5, dst, stride, src0, src1, w, h
AVG_FN %1, AVG, %3
%endmacro
-;void ff_vvc_w_avg_%1bpc_avx(uint8_t *dst, ptrdiff_t dst_stride,
-; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
-; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
-%macro VVC_W_AVG_AVX2 1
-cglobal vvc_w_avg_%1bpc, 4, 8, 6+2*(%1 != 8), dst, stride, src0, src1, w, h, t0, t1
+;void ff_vvc_w_avg_%2_avx(uint8_t *dst, ptrdiff_t dst_stride,
+; const int16_t *src0, const int16_t *src1, int width, int height,
+; int denom, int w0, int w1, int o0, int o1);
+%macro VVC_W_AVG_AVX2 3
+cglobal vvc_w_avg_%2, 4, 7+2*UNIX64, 6+2*(%1 != 8), dst, stride, src0, src1, w, h
+%if UNIX64
+ ; r6-r8 are volatile and not used for parameter passing
+ DECLARE_REG_TMP 6, 7, 8
+%else ; Win64
+ ; r4-r6 are volatile and not used for parameter passing
+ DECLARE_REG_TMP 4, 5, 6
+%endif
- movifnidn hd, hm
-
- movifnidn t0d, r8m ; w1
- shl t0d, 16
- mov t0w, r7m ; w0
- movd xm3, t0d
+ mov t1d, r6m ; denom
+ mov t0d, r9m ; o0
+ add t0d, r10m ; o1
+ movifnidn t2d, r8m ; w1
+ add t1d, 15-%2
+%if %2 != 8
+ shl t0d, %2 - 8
+%endif
+ movd xm2, t1d ; shift
+ inc t0d ; ((o0 + o1) << (BIT_DEPTH - 8)) + 1
+ shl t2d, 16
+ movd xm4, t0d
+ mov t2w, r7m ; w0
+ movd xm3, t2d
vpbroadcastd m3, xm3 ; w0, w1
%if %1 != 8
- pxor m6, m6 ;pixel min
- vpbroadcastw m7, r11m ;pixel max
+ pcmpeqw m7, m7
+ pxor m6, m6 ; pixel min
+ psrlw m7, 16-%2 ; pixel max
%endif
- mov t1q, rcx ; save ecx
- mov ecx, r11m
- inc ecx ; bd
- tzcnt ecx, ecx
- sub ecx, 8
- mov t0d, r9m ; o0
- add t0d, r10m ; o1
- shl t0d, cl
- inc t0d ;((o0 + o1) << (BIT_DEPTH - 8)) + 1
-
- neg ecx
- add ecx, 7
- add ecx, r6m
- movd xm2, ecx ; shift
-
- dec ecx
- shl t0d, cl
- movd xm4, t0d
- vpbroadcastd m4, xm4 ; offset
- mov rcx, t1q ; restore ecx
-
lea r6, [w_avg_%1 %+ SUFFIX %+ _table]
tzcnt wd, wm
movsxd wq, dword [r6+wq*4]
+
+ pslld xm4, xm2
+ psrad xm4, 1
+ vpbroadcastd m4, xm4 ; offset
+
+ movifnidn hd, hm
+
add wq, r6
- AVG_FN %1, W_AVG
+ AVG_FN %1, W_AVG, %3
%endmacro
INIT_YMM avx2
@@ -297,9 +299,11 @@ VVC_AVG_AVX2 16, 10, 1
VVC_AVG_AVX2 8, 8, 1
-VVC_W_AVG_AVX2 16
+VVC_W_AVG_AVX2 16, 12, 0
-VVC_W_AVG_AVX2 8
+VVC_W_AVG_AVX2 16, 10, 1
+
+VVC_W_AVG_AVX2 8, 8, 1
%endif
%endif
--
2.52.0
>From 95734a9614f4ba9ecf6a43cbf8ae4237f01aa3c9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 19 Feb 2026 01:06:21 +0100
Subject: [PATCH 09/12] avcodec/x86/vvc/of: Avoid unused register
Avoids a push+pop.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/of.asm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 5893bfb23a..1481a4a09b 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -352,7 +352,7 @@ INIT_YMM avx2
;void ff_vvc_apply_bdof_%1(uint8_t *dst, const ptrdiff_t dst_stride, int16_t *src0, int16_t *src1,
; const int w, const int h, const int pixel_max)
%macro BDOF_AVX2 0
-cglobal vvc_apply_bdof, 7, 10, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3, tmp0, tmp1
+cglobal vvc_apply_bdof, 7, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3, tmp0
lea ds3q, [dsq * 3]
sub src0q, SRC_STRIDE + SRC_PS
--
2.52.0
>From bb6b86eedc5b5675451ab84d09357f69b2c535cf Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 19 Feb 2026 01:31:31 +0100
Subject: [PATCH 10/12] avcodec/x86/vvc/of: Unify shuffling
One can use the same shuffles for the width 8 and width 16
case if one also changes the permutation in vpermd (that always
follows pshufb for width 16).
This also allows to load it before checking width.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/of.asm | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 1481a4a09b..b071c56dbb 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -32,9 +32,8 @@ SECTION_RODATA 32
pd_15 times 8 dd 15
pd_m15 times 8 dd -15
-pb_shuffle_w8 times 2 db 0, 1, 0xff, 0xff, 8, 9, 0xff, 0xff, 6, 7, 0xff, 0xff, 14, 15, 0xff, 0xff
-pb_shuffle_w16 times 2 db 0, 1, 0xff, 0xff, 6, 7, 0xff, 0xff, 8, 9, 0xff, 0xff, 14, 15, 0xff, 0xff
-pd_perm_w16 dd 0, 2, 1, 4, 3, 6, 5, 7
+pb_shuffle times 2 db 0, 1, 0xff, 0xff, 8, 9, 0xff, 0xff, 6, 7, 0xff, 0xff, 14, 15, 0xff, 0xff
+pd_perm_w16 dd 0, 1, 2, 4, 3, 5, 6, 7
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
@@ -186,6 +185,8 @@ INIT_YMM avx2
DIFF ndiff, c1, c0, SHIFT2, t0 ; -diff
+ mova t0, [pb_shuffle]
+
psignw m7, ndiff, m8 ; sgxdi
psignw m9, ndiff, m6 ; sgydi
psignw m10, m8, m6 ; sgxgy
@@ -194,10 +195,10 @@ INIT_YMM avx2
pabsw m8, m8 ; sgx2
; use t0, t1 as temporary buffers
+
cmp wd, 16
je %%w16
- mova t0, [pb_shuffle_w8]
SUM_MIN_BLOCK_W8 m6, t0, m11
SUM_MIN_BLOCK_W8 m7, t0, m11
SUM_MIN_BLOCK_W8 m8, t0, m11
@@ -206,7 +207,6 @@ INIT_YMM avx2
jmp %%wend
%%w16:
- mova t0, [pb_shuffle_w16]
mova t1, [pd_perm_w16]
SUM_MIN_BLOCK_W16 m6, t0, t1, m11
SUM_MIN_BLOCK_W16 m7, t0, t1, m11
--
2.52.0
>From 809d02ad97fb5e2c91c3bd03f5907a156fb357a6 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 19 Feb 2026 02:08:32 +0100
Subject: [PATCH 11/12] avcodec/x86/vvc/of: Break dependency chain
Don't extract and update one word of one and the same register
at a time; use separate src and dst registers, so that pextrw
and bsr can be done in parallel. Also use movd instead of pinsrw
for the first word.
Old benchmarks:
apply_bdof_8_8x16_c: 3275.2 ( 1.00x)
apply_bdof_8_8x16_avx2: 487.6 ( 6.72x)
apply_bdof_8_16x8_c: 3243.1 ( 1.00x)
apply_bdof_8_16x8_avx2: 284.4 (11.40x)
apply_bdof_8_16x16_c: 6501.8 ( 1.00x)
apply_bdof_8_16x16_avx2: 570.0 (11.41x)
apply_bdof_10_8x16_c: 3286.5 ( 1.00x)
apply_bdof_10_8x16_avx2: 461.7 ( 7.12x)
apply_bdof_10_16x8_c: 3274.5 ( 1.00x)
apply_bdof_10_16x8_avx2: 271.4 (12.06x)
apply_bdof_10_16x16_c: 6590.0 ( 1.00x)
apply_bdof_10_16x16_avx2: 543.9 (12.12x)
apply_bdof_12_8x16_c: 3307.6 ( 1.00x)
apply_bdof_12_8x16_avx2: 462.2 ( 7.16x)
apply_bdof_12_16x8_c: 3287.4 ( 1.00x)
apply_bdof_12_16x8_avx2: 271.8 (12.10x)
apply_bdof_12_16x16_c: 6465.7 ( 1.00x)
apply_bdof_12_16x16_avx2: 543.8 (11.89x)
New benchmarks:
apply_bdof_8_8x16_c: 3255.7 ( 1.00x)
apply_bdof_8_8x16_avx2: 349.3 ( 9.32x)
apply_bdof_8_16x8_c: 3262.5 ( 1.00x)
apply_bdof_8_16x8_avx2: 214.8 (15.19x)
apply_bdof_8_16x16_c: 6471.6 ( 1.00x)
apply_bdof_8_16x16_avx2: 429.8 (15.06x)
apply_bdof_10_8x16_c: 3227.7 ( 1.00x)
apply_bdof_10_8x16_avx2: 321.6 (10.04x)
apply_bdof_10_16x8_c: 3250.2 ( 1.00x)
apply_bdof_10_16x8_avx2: 201.2 (16.16x)
apply_bdof_10_16x16_c: 6476.5 ( 1.00x)
apply_bdof_10_16x16_avx2: 400.9 (16.16x)
apply_bdof_12_8x16_c: 3230.7 ( 1.00x)
apply_bdof_12_8x16_avx2: 321.8 (10.04x)
apply_bdof_12_16x8_c: 3210.5 ( 1.00x)
apply_bdof_12_16x8_avx2: 200.9 (15.98x)
apply_bdof_12_16x16_c: 6474.5 ( 1.00x)
apply_bdof_12_16x16_avx2: 400.2 (16.18x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/of.asm | 31 +++++++++++++++++--------------
1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index b071c56dbb..be19bb1be0 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -252,21 +252,25 @@ INIT_YMM avx2
psrlw %3, %4
%endmacro
-%macro LOG2 2 ; dst/src, offset
- pextrw tmp0d, xm%1, %2
+%macro LOG2 3 ; dst, src, offset
+ pextrw tmp0d, xm%2, %3
bsr tmp0d, tmp0d
- pinsrw xm%1, tmp0d, %2
+%if %3 != 0
+ pinsrw xm%1, tmp0d, %3
+%else
+ movd xm%1, tmp0d
+%endif
%endmacro
-%macro LOG2 1 ; dst/src
- LOG2 %1, 0
- LOG2 %1, 1
- LOG2 %1, 2
- LOG2 %1, 3
- LOG2 %1, 4
- LOG2 %1, 5
- LOG2 %1, 6
- LOG2 %1, 7
+%macro LOG2 2 ; dst, src
+ LOG2 %1, %2, 0
+ LOG2 %1, %2, 1
+ LOG2 %1, %2, 2
+ LOG2 %1, %2, 3
+ LOG2 %1, %2, 4
+ LOG2 %1, %2, 5
+ LOG2 %1, %2, 6
+ LOG2 %1, %2, 7
%endmacro
; %1: 4 (sgx2, sgy2, sgxdi, gydi)
@@ -278,8 +282,7 @@ INIT_YMM avx2
punpcklqdq m8, m%1, m7 ; 4 (sgx2, sgy2)
punpckhqdq m9, m%1, m7 ; 4 (sgxdi, sgydi)
- mova m10, m8
- LOG2 10 ; 4 (log2(sgx2), log2(sgy2))
+ LOG2 10, 8 ; 4 (log2(sgx2), log2(sgy2))
; Promote to dword since vpsrlvw is AVX-512 only
pmovsxwd m8, xm8
--
2.52.0
>From 18a5346792d4f8527c103c3435fc8f287d195e2e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 19 Feb 2026 02:22:49 +0100
Subject: [PATCH 12/12] avcodec/x86/vvc/dsp_init: Mark dsp init function as
av_cold
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vvc/dsp_init.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index 357f4ea8a1..cd3d02c0fb 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -23,6 +23,7 @@
#include "config.h"
+#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vvc/dec.h"
@@ -321,7 +322,7 @@ int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, in
#endif // ARCH_X86_64
-void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
+av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
{
#if ARCH_X86_64
const int cpu_flags = av_get_cpu_flags();
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2026-02-19 1:53 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-19 1:52 [FFmpeg-devel] [PR] avcodec/x86/vvc: Various improvements (PR #21790) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git