From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: mkver <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PR] avcodec/x86/hevc/deblock: Avoid vmovdqa (PR #21582)
Date: Mon, 26 Jan 2026 03:06:36 -0000
Message-ID: <176939679744.25.10975189851690820186@4457048688e7> (raw)
PR #21582 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21582
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21582.patch
Also reduce the number of GPRs used.
From 4580374e4ad922924a485cc7586f625eaca25482 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 21 Jan 2026 12:50:11 +0100
Subject: [PATCH 1/4] avcodec/x86/vp9lpf: Avoid vmovdqa
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/vp9lpf.asm | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 4e7ede2235..92e5d03ad7 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -818,8 +818,7 @@ cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride
; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], )
; filter4()
- mova m4, m2
- paddsb m2, m4 ; 2 * (q0 - p0)
+ paddsb m4, m2, m2 ; 2 * (q0 - p0)
paddsb m2, m4 ; 3 * (q0 - p0)
paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127)
paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127)
--
2.52.0
From f437bab6ccc58d4f2ae7fb1eef380191ebbcda8c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 21 Jan 2026 13:56:05 +0100
Subject: [PATCH 2/4] avcodec/x86/hevc/deblock: Avoid vmovdqa
(It would even be possible to avoid clobbering m10 in
MASKED_COPY and the mask register (%3) in MASKED_COPY2
when VEX encoding is in use.)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hevc/deblock.asm | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm
index 61b79f8079..fbb12322e3 100644
--- a/libavcodec/x86/hevc/deblock.asm
+++ b/libavcodec/x86/hevc/deblock.asm
@@ -257,8 +257,7 @@ INIT_XMM sse2
%macro MASKED_COPY 2
pand %2, m11 ; and mask
pandn m10, m11, %1; and -mask
- por %2, m10
- mova %1, %2
+ por %1, %2, m10
%endmacro
; in: %2 clobbered
@@ -267,8 +266,7 @@ INIT_XMM sse2
%macro MASKED_COPY2 3
pand %2, %3 ; and mask
pandn %3, %1; and -mask
- por %2, %3
- mova %1, %2
+ por %1, %2, %3
%endmacro
ALIGN 16
--
2.52.0
From 37cade3edf22ece324c8784f94637943a255b30f Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 21 Jan 2026 14:31:14 +0100
Subject: [PATCH 3/4] avcodec/x86/hevc/deblock: avoid unused GPR
r12 is unused, so use it instead of r13 to reduce
the number of push/pops.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hevc/deblock.asm | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm
index fbb12322e3..9671415c66 100644
--- a/libavcodec/x86/hevc/deblock.asm
+++ b/libavcodec/x86/hevc/deblock.asm
@@ -333,8 +333,8 @@ ALIGN 16
;compare
pcmpgtw m15, m13, m14
- movmskps r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
- test r13, r13
+ movmskps r12, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
+ test r12, r12
je .bypassluma
;weak / strong decision compare to beta_2
@@ -361,14 +361,14 @@ ALIGN 16
; end calc for weak filter
; filtering mask
- mov r11, r13
+ mov r11, r12
shr r11, 3
movd m15, r11d
- and r13, 1
- movd m11, r13d
+ and r12, 1
+ movd m11, r12d
shufps m11, m15, 0
shl r11, 1
- or r13, r11
+ or r12, r11
pcmpeqd m11, [pd_1]; filtering mask
@@ -522,7 +522,7 @@ ALIGN 16
.weakfilter:
not r6; strong mask -> weak mask
- and r6, r13; final weak filtering mask, bits 0 and 1
+ and r6, r12; final weak filtering mask, bits 0 and 1
jz .store
; weak filtering mask
@@ -534,9 +534,9 @@ ALIGN 16
shufps m11, m12, 0
pcmpeqd m11, [pd_1]; filtering mask
- mov r13, betaq
- shr r13, 1;
- add betaq, r13
+ mov r12, betaq
+ shr r12, 1;
+ add betaq, r12
shr betaq, 3; ((beta + (beta >> 1)) >> 3))
psubw m12, m4, m3 ; q0 - p0
@@ -742,7 +742,7 @@ LOOP_FILTER_CHROMA
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 4
lea pix0q, [3 * r1]
mov src3strideq, pixq
@@ -754,7 +754,7 @@ cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3s
.bypassluma:
RET
-cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 8
lea pix0q, [3 * strideq]
mov src3strideq, pixq
@@ -766,7 +766,7 @@ cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3
.bypassluma:
RET
-cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 8
lea pix0q, [3 * strideq]
mov src3strideq, pixq
@@ -782,7 +782,7 @@ cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
@@ -818,7 +818,7 @@ cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3s
.bypassluma:
RET
-cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
@@ -849,7 +849,7 @@ cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3
.bypassluma:
RET
-cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
--
2.52.0
From e24fedee8d141487c11a18cf4a955ff0c0433579 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 25 Jan 2026 00:28:53 +0100
Subject: [PATCH 4/4] avcodec/x86/hevc/deblock: Reduce usage of GPRs
Don't use two GPRs to store two words from xmm registers;
shuffle these words so that they fit into one GPR.
This reduces the number of GPRs used and leads to tiny speedups
here. Also avoid REX prefixes whenever possible (for lines
that needed to be modified anyway).
Old benchmarks:
hevc_h_loop_filter_luma8_skip_c: 23.8 ( 1.00x)
hevc_h_loop_filter_luma8_skip_sse2: 8.5 ( 2.80x)
hevc_h_loop_filter_luma8_skip_ssse3: 7.2 ( 3.29x)
hevc_h_loop_filter_luma8_skip_avx: 6.4 ( 3.71x)
hevc_h_loop_filter_luma8_strong_c: 150.4 ( 1.00x)
hevc_h_loop_filter_luma8_strong_sse2: 34.4 ( 4.37x)
hevc_h_loop_filter_luma8_strong_ssse3: 34.5 ( 4.36x)
hevc_h_loop_filter_luma8_strong_avx: 32.3 ( 4.65x)
hevc_h_loop_filter_luma8_weak_c: 103.2 ( 1.00x)
hevc_h_loop_filter_luma8_weak_sse2: 34.5 ( 2.99x)
hevc_h_loop_filter_luma8_weak_ssse3: 7.3 (14.22x)
hevc_h_loop_filter_luma8_weak_avx: 32.4 ( 3.18x)
hevc_h_loop_filter_luma10_skip_c: 23.5 ( 1.00x)
hevc_h_loop_filter_luma10_skip_sse2: 6.6 ( 3.58x)
hevc_h_loop_filter_luma10_skip_ssse3: 6.1 ( 3.86x)
hevc_h_loop_filter_luma10_skip_avx: 5.4 ( 4.34x)
hevc_h_loop_filter_luma10_strong_c: 161.8 ( 1.00x)
hevc_h_loop_filter_luma10_strong_sse2: 32.2 ( 5.03x)
hevc_h_loop_filter_luma10_strong_ssse3: 30.4 ( 5.33x)
hevc_h_loop_filter_luma10_strong_avx: 30.3 ( 5.33x)
hevc_h_loop_filter_luma10_weak_c: 23.5 ( 1.00x)
hevc_h_loop_filter_luma10_weak_sse2: 6.6 ( 3.58x)
hevc_h_loop_filter_luma10_weak_ssse3: 6.1 ( 3.85x)
hevc_h_loop_filter_luma10_weak_avx: 5.4 ( 4.35x)
hevc_h_loop_filter_luma12_skip_c: 18.8 ( 1.00x)
hevc_h_loop_filter_luma12_skip_sse2: 6.6 ( 2.87x)
hevc_h_loop_filter_luma12_skip_ssse3: 6.1 ( 3.08x)
hevc_h_loop_filter_luma12_skip_avx: 6.2 ( 3.06x)
hevc_h_loop_filter_luma12_strong_c: 159.0 ( 1.00x)
hevc_h_loop_filter_luma12_strong_sse2: 36.3 ( 4.38x)
hevc_h_loop_filter_luma12_strong_ssse3: 36.1 ( 4.40x)
hevc_h_loop_filter_luma12_strong_avx: 33.5 ( 4.75x)
hevc_h_loop_filter_luma12_weak_c: 40.1 ( 1.00x)
hevc_h_loop_filter_luma12_weak_sse2: 35.5 ( 1.13x)
hevc_h_loop_filter_luma12_weak_ssse3: 36.1 ( 1.11x)
hevc_h_loop_filter_luma12_weak_avx: 6.2 ( 6.52x)
hevc_v_loop_filter_luma8_skip_c: 25.5 ( 1.00x)
hevc_v_loop_filter_luma8_skip_sse2: 10.6 ( 2.40x)
hevc_v_loop_filter_luma8_skip_ssse3: 11.4 ( 2.24x)
hevc_v_loop_filter_luma8_skip_avx: 8.3 ( 3.07x)
hevc_v_loop_filter_luma8_strong_c: 146.8 ( 1.00x)
hevc_v_loop_filter_luma8_strong_sse2: 43.9 ( 3.35x)
hevc_v_loop_filter_luma8_strong_ssse3: 43.7 ( 3.36x)
hevc_v_loop_filter_luma8_strong_avx: 42.3 ( 3.47x)
hevc_v_loop_filter_luma8_weak_c: 25.5 ( 1.00x)
hevc_v_loop_filter_luma8_weak_sse2: 10.6 ( 2.40x)
hevc_v_loop_filter_luma8_weak_ssse3: 44.0 ( 0.58x)
hevc_v_loop_filter_luma8_weak_avx: 8.3 ( 3.09x)
hevc_v_loop_filter_luma10_skip_c: 20.0 ( 1.00x)
hevc_v_loop_filter_luma10_skip_sse2: 11.3 ( 1.77x)
hevc_v_loop_filter_luma10_skip_ssse3: 11.0 ( 1.82x)
hevc_v_loop_filter_luma10_skip_avx: 9.3 ( 2.15x)
hevc_v_loop_filter_luma10_strong_c: 193.5 ( 1.00x)
hevc_v_loop_filter_luma10_strong_sse2: 46.1 ( 4.19x)
hevc_v_loop_filter_luma10_strong_ssse3: 44.2 ( 4.38x)
hevc_v_loop_filter_luma10_strong_avx: 44.4 ( 4.35x)
hevc_v_loop_filter_luma10_weak_c: 90.3 ( 1.00x)
hevc_v_loop_filter_luma10_weak_sse2: 46.3 ( 1.95x)
hevc_v_loop_filter_luma10_weak_ssse3: 10.8 ( 8.37x)
hevc_v_loop_filter_luma10_weak_avx: 44.4 ( 2.03x)
hevc_v_loop_filter_luma12_skip_c: 16.8 ( 1.00x)
hevc_v_loop_filter_luma12_skip_sse2: 11.8 ( 1.42x)
hevc_v_loop_filter_luma12_skip_ssse3: 11.7 ( 1.43x)
hevc_v_loop_filter_luma12_skip_avx: 8.7 ( 1.93x)
hevc_v_loop_filter_luma12_strong_c: 159.3 ( 1.00x)
hevc_v_loop_filter_luma12_strong_sse2: 45.3 ( 3.52x)
hevc_v_loop_filter_luma12_strong_ssse3: 60.3 ( 2.64x)
hevc_v_loop_filter_luma12_strong_avx: 44.1 ( 3.61x)
hevc_v_loop_filter_luma12_weak_c: 63.6 ( 1.00x)
hevc_v_loop_filter_luma12_weak_sse2: 45.3 ( 1.40x)
hevc_v_loop_filter_luma12_weak_ssse3: 11.7 ( 5.41x)
hevc_v_loop_filter_luma12_weak_avx: 43.9 ( 1.45x)
New benchmarks:
hevc_h_loop_filter_luma8_skip_c: 24.2 ( 1.00x)
hevc_h_loop_filter_luma8_skip_sse2: 8.6 ( 2.82x)
hevc_h_loop_filter_luma8_skip_ssse3: 7.0 ( 3.46x)
hevc_h_loop_filter_luma8_skip_avx: 6.8 ( 3.54x)
hevc_h_loop_filter_luma8_strong_c: 150.4 ( 1.00x)
hevc_h_loop_filter_luma8_strong_sse2: 33.3 ( 4.52x)
hevc_h_loop_filter_luma8_strong_ssse3: 32.7 ( 4.61x)
hevc_h_loop_filter_luma8_strong_avx: 32.7 ( 4.60x)
hevc_h_loop_filter_luma8_weak_c: 104.0 ( 1.00x)
hevc_h_loop_filter_luma8_weak_sse2: 33.2 ( 3.13x)
hevc_h_loop_filter_luma8_weak_ssse3: 7.0 (14.91x)
hevc_h_loop_filter_luma8_weak_avx: 31.3 ( 3.32x)
hevc_h_loop_filter_luma10_skip_c: 19.2 ( 1.00x)
hevc_h_loop_filter_luma10_skip_sse2: 6.2 ( 3.08x)
hevc_h_loop_filter_luma10_skip_ssse3: 6.2 ( 3.08x)
hevc_h_loop_filter_luma10_skip_avx: 5.0 ( 3.85x)
hevc_h_loop_filter_luma10_strong_c: 159.8 ( 1.00x)
hevc_h_loop_filter_luma10_strong_sse2: 30.0 ( 5.32x)
hevc_h_loop_filter_luma10_strong_ssse3: 29.2 ( 5.48x)
hevc_h_loop_filter_luma10_strong_avx: 28.6 ( 5.58x)
hevc_h_loop_filter_luma10_weak_c: 19.2 ( 1.00x)
hevc_h_loop_filter_luma10_weak_sse2: 6.2 ( 3.09x)
hevc_h_loop_filter_luma10_weak_ssse3: 6.2 ( 3.09x)
hevc_h_loop_filter_luma10_weak_avx: 5.0 ( 3.88x)
hevc_h_loop_filter_luma12_skip_c: 18.7 ( 1.00x)
hevc_h_loop_filter_luma12_skip_sse2: 6.2 ( 3.00x)
hevc_h_loop_filter_luma12_skip_ssse3: 5.7 ( 3.27x)
hevc_h_loop_filter_luma12_skip_avx: 5.2 ( 3.61x)
hevc_h_loop_filter_luma12_strong_c: 160.2 ( 1.00x)
hevc_h_loop_filter_luma12_strong_sse2: 34.2 ( 4.68x)
hevc_h_loop_filter_luma12_strong_ssse3: 29.3 ( 5.48x)
hevc_h_loop_filter_luma12_strong_avx: 31.4 ( 5.10x)
hevc_h_loop_filter_luma12_weak_c: 40.2 ( 1.00x)
hevc_h_loop_filter_luma12_weak_sse2: 35.2 ( 1.14x)
hevc_h_loop_filter_luma12_weak_ssse3: 29.3 ( 1.37x)
hevc_h_loop_filter_luma12_weak_avx: 5.0 ( 8.09x)
hevc_v_loop_filter_luma8_skip_c: 25.6 ( 1.00x)
hevc_v_loop_filter_luma8_skip_sse2: 10.2 ( 2.52x)
hevc_v_loop_filter_luma8_skip_ssse3: 10.5 ( 2.45x)
hevc_v_loop_filter_luma8_skip_avx: 8.2 ( 3.11x)
hevc_v_loop_filter_luma8_strong_c: 147.1 ( 1.00x)
hevc_v_loop_filter_luma8_strong_sse2: 42.6 ( 3.45x)
hevc_v_loop_filter_luma8_strong_ssse3: 42.4 ( 3.47x)
hevc_v_loop_filter_luma8_strong_avx: 40.1 ( 3.67x)
hevc_v_loop_filter_luma8_weak_c: 25.6 ( 1.00x)
hevc_v_loop_filter_luma8_weak_sse2: 10.6 ( 2.42x)
hevc_v_loop_filter_luma8_weak_ssse3: 42.7 ( 0.60x)
hevc_v_loop_filter_luma8_weak_avx: 8.2 ( 3.11x)
hevc_v_loop_filter_luma10_skip_c: 16.7 ( 1.00x)
hevc_v_loop_filter_luma10_skip_sse2: 11.0 ( 1.52x)
hevc_v_loop_filter_luma10_skip_ssse3: 10.5 ( 1.59x)
hevc_v_loop_filter_luma10_skip_avx: 9.6 ( 1.74x)
hevc_v_loop_filter_luma10_strong_c: 190.0 ( 1.00x)
hevc_v_loop_filter_luma10_strong_sse2: 44.8 ( 4.24x)
hevc_v_loop_filter_luma10_strong_ssse3: 42.3 ( 4.49x)
hevc_v_loop_filter_luma10_strong_avx: 42.5 ( 4.47x)
hevc_v_loop_filter_luma10_weak_c: 88.3 ( 1.00x)
hevc_v_loop_filter_luma10_weak_sse2: 45.7 ( 1.93x)
hevc_v_loop_filter_luma10_weak_ssse3: 10.5 ( 8.40x)
hevc_v_loop_filter_luma10_weak_avx: 42.4 ( 2.09x)
hevc_v_loop_filter_luma12_skip_c: 16.7 ( 1.00x)
hevc_v_loop_filter_luma12_skip_sse2: 11.7 ( 1.42x)
hevc_v_loop_filter_luma12_skip_ssse3: 10.5 ( 1.59x)
hevc_v_loop_filter_luma12_skip_avx: 8.8 ( 1.90x)
hevc_v_loop_filter_luma12_strong_c: 159.4 ( 1.00x)
hevc_v_loop_filter_luma12_strong_sse2: 45.2 ( 3.53x)
hevc_v_loop_filter_luma12_strong_ssse3: 59.3 ( 2.69x)
hevc_v_loop_filter_luma12_strong_avx: 41.7 ( 3.82x)
hevc_v_loop_filter_luma12_weak_c: 63.3 ( 1.00x)
hevc_v_loop_filter_luma12_weak_sse2: 44.9 ( 1.41x)
hevc_v_loop_filter_luma12_weak_ssse3: 10.5 ( 6.02x)
hevc_v_loop_filter_luma12_weak_avx: 41.7 ( 1.52x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hevc/deblock.asm | 105 +++++++++++++++-----------------
1 file changed, 49 insertions(+), 56 deletions(-)
diff --git a/libavcodec/x86/hevc/deblock.asm b/libavcodec/x86/hevc/deblock.asm
index 9671415c66..d43d95142a 100644
--- a/libavcodec/x86/hevc/deblock.asm
+++ b/libavcodec/x86/hevc/deblock.asm
@@ -333,8 +333,8 @@ ALIGN 16
;compare
pcmpgtw m15, m13, m14
- movmskps r12, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
- test r12, r12
+ movmskps r10, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
+ test r10d, r10d
je .bypassluma
;weak / strong decision compare to beta_2
@@ -345,45 +345,41 @@ ALIGN 16
;end weak / strong decision
; weak filter nd_p/q calculation
- pshufd m8, m10, 0x31
- psrld m8, 16
+ psrlq m8, m10, 48
paddw m8, m10
- movd r7d, m8
- pshufd m8, m8, 0x4E
- movd r8d, m8
+ pshufd m8, m8, 0x8
+ movq r7q, m8
- pshufd m8, m11, 0x31
- psrld m8, 16
+ psrlq m8, m11, 48
paddw m8, m11
- movd r9d, m8
- pshufd m8, m8, 0x4E
- movd r10d, m8
+ pshufd m8, m8, 0x8
+ movq r8q, m8
; end calc for weak filter
; filtering mask
- mov r11, r12
- shr r11, 3
- movd m15, r11d
- and r12, 1
- movd m11, r12d
+ mov r9, r10
+ shr r9d, 3
+ movd m15, r9d
+ and r10d, 1
+ movd m11, r10d
shufps m11, m15, 0
- shl r11, 1
- or r12, r11
+ shl r9d, 1
+ or r10d, r9d
pcmpeqd m11, [pd_1]; filtering mask
;decide between strong and weak filtering
;tc25 calculations
- mov r11d, [tcq];
+ mov r9d, [tcq];
%if %1 > 8
- shl r11, %1 - 8
+ shl r9d, %1 - 8
%endif
- movd m8, r11d; tc0
+ movd m8, r9d; tc0
mov r3d, [tcq+4];
%if %1 > 8
- shl r3, %1 - 8
+ shl r3d, %1 - 8
%endif
- add r11d, r3d; tc0 + tc1
+ add r9d, r3d; tc0 + tc1
jz .bypassluma
movd m9, r3d; tc1
punpcklwd m8, m8
@@ -408,8 +404,8 @@ ALIGN 16
psraw m13, 3; beta >> 3
pcmpgtw m13, m12;
- movmskps r11, m13;
- and r6, r11; strong mask , beta_2 and beta_3 comparisons
+ movmskps r9d, m13;
+ and r6d, r9d; strong mask , beta_2 and beta_3 comparisons
;----beta_3 comparison end-----
;----tc25 comparison---
psubw m12, m3, m4; p0 - q0
@@ -419,24 +415,24 @@ ALIGN 16
pshuflw m12, m12, 0xf0 ;0b11110000;
pcmpgtw m8, m12; tc25 comparisons
- movmskps r11, m8;
- and r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+ movmskps r9d, m8;
+ and r6d, r9d; strong mask, beta_2, beta_3 and tc25 comparisons
;----tc25 comparison end---
- mov r11, r6;
- shr r11, 1;
- and r6, r11; strong mask, bits 2 and 0
+ mov r9d, r6d;
+ shr r9d, 1;
+ and r6d, r9d; strong mask, bits 2 and 0
pmullw m14, m9, [pw_m2]; -tc * 2
paddw m9, m9
and r6, 5; 0b101
- mov r11, r6; strong mask
+ mov r9d, r6d; strong mask
shr r6, 2;
movd m12, r6d; store to xmm for mask generation
shl r6, 1
- and r11, 1
- movd m10, r11d; store to xmm for mask generation
- or r6, r11; final strong mask, bits 1 and 0
+ and r9d, 1
+ movd m10, r9d; store to xmm for mask generation
+ or r6d, r9d; final strong mask, bits 1 and 0
jz .weakfilter
shufps m10, m12, 0
@@ -522,21 +518,21 @@ ALIGN 16
.weakfilter:
not r6; strong mask -> weak mask
- and r6, r12; final weak filtering mask, bits 0 and 1
+ and r6d, r10d; final weak filtering mask, bits 0 and 1
jz .store
; weak filtering mask
- mov r11, r6
- shr r11, 1
- movd m12, r11d
+ mov r9, r6
+ shr r9d, 1
+ movd m12, r9d
and r6, 1
movd m11, r6d
shufps m11, m12, 0
pcmpeqd m11, [pd_1]; filtering mask
- mov r12, betaq
- shr r12, 1;
- add betaq, r12
+ mov r10d, betad
+ shr r10d, 1;
+ add betad, r10d
shr betaq, 3; ((beta + (beta >> 1)) >> 3))
psubw m12, m4, m3 ; q0 - p0
@@ -605,11 +601,9 @@ ALIGN 16
movd m10, betad
SPLATW m10, m10, 0
- movd m13, r7d; 1dp0 + 1dp3
- movd m8, r8d; 0dp0 + 0dp3
- punpcklwd m8, m8
+ movq m13, r7q; 1dp0 + 1dp3, 0dp0 + 0dp3
punpcklwd m13, m13
- shufps m13, m8, 0;
+ pshufd m13, m13, 10100000b
pcmpgtw m8, m10, m13
pand m8, m11
;end beta calculations
@@ -623,11 +617,10 @@ ALIGN 16
pminsw m8, m9; av_clip(deltaq1, -tc/2, tc/2)
paddw m8, m5; q1'
- movd m13, r9d;
- movd m15, r10d;
- punpcklwd m15, m15
+ movq m13, r8q;
punpcklwd m13, m13
- shufps m13, m15, 0; dq0 + dq3
+ movhlps m15, m13
+ pshufd m13, m13, 10100000b
pcmpgtw m10, m13; compare to ((beta+(beta>>1))>>3)
pand m10, m11
@@ -742,7 +735,7 @@ LOOP_FILTER_CHROMA
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_8, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 4
lea pix0q, [3 * r1]
mov src3strideq, pixq
@@ -754,7 +747,7 @@ cglobal hevc_v_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3s
.bypassluma:
RET
-cglobal hevc_v_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_10, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 8
lea pix0q, [3 * strideq]
mov src3strideq, pixq
@@ -766,7 +759,7 @@ cglobal hevc_v_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3
.bypassluma:
RET
-cglobal hevc_v_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_v_loop_filter_luma_12, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
sub pixq, 8
lea pix0q, [3 * strideq]
mov src3strideq, pixq
@@ -782,7 +775,7 @@ cglobal hevc_v_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
; int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
@@ -818,7 +811,7 @@ cglobal hevc_h_loop_filter_luma_8, 4, 13, 16, pix, stride, beta, tc, pix0, src3s
.bypassluma:
RET
-cglobal hevc_h_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
@@ -849,7 +842,7 @@ cglobal hevc_h_loop_filter_luma_10, 4, 13, 16, pix, stride, beta, tc, pix0, src3
.bypassluma:
RET
-cglobal hevc_h_loop_filter_luma_12, 4, 13, 16, pix, stride, beta, tc, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_12, 4, 11, 16, pix, stride, beta, tc, pix0, src3stride
lea src3strideq, [3 * strideq]
mov pix0q, pixq
sub pix0q, src3strideq
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2026-01-26 3:07 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=176939679744.25.10975189851690820186@4457048688e7 \
--to=ffmpeg-devel@ffmpeg.org \
--cc=code@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git