* [FFmpeg-devel] [PATCH] avcodec/x86/hevc/add_res: Remove AVX add_residual functions (PR #20789)
@ 2025-10-30 9:00 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-30 9:00 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20789 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20789
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20789.patch
>From a0fa1c8e484f06cc9a9e2e3cfe53ec121fb74659 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 08:30:40 +0100
Subject: [PATCH 1/3] avcodec/x86/hevc/add_res: Remove AVX add_residual
functions
Since commits e9abef437f0a348c017d4ac8b23a122881c1dc87 and
8b8492452d53293b2ac8c842877fadf7925fc950, the AVX and SSE2 functions
have been identical except for the VEX encodings used by the AVX versions.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hevc/add_res.asm | 7 +------
libavcodec/x86/hevc/dsp.h | 4 ----
libavcodec/x86/hevc/dsp_init.c | 4 ----
3 files changed, 1 insertion(+), 14 deletions(-)
diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm
index 3ecbd4269c..5d7115620f 100644
--- a/libavcodec/x86/hevc/add_res.asm
+++ b/libavcodec/x86/hevc/add_res.asm
@@ -117,7 +117,7 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
%endmacro
-%macro TRANSFORM_ADD_8 0
+INIT_XMM sse2
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
pxor m4, m4
@@ -154,12 +154,7 @@ cglobal hevc_add_residual_32_8, 3, 5, 7
dec r4d
jg .loop
RET
-%endmacro
-INIT_XMM sse2
-TRANSFORM_ADD_8
-INIT_XMM avx
-TRANSFORM_ADD_8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
diff --git a/libavcodec/x86/hevc/dsp.h b/libavcodec/x86/hevc/dsp.h
index 03986b970a..0062699ce0 100644
--- a/libavcodec/x86/hevc/dsp.h
+++ b/libavcodec/x86/hevc/dsp.h
@@ -172,10 +172,6 @@ void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t s
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_8_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-
void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
diff --git a/libavcodec/x86/hevc/dsp_init.c b/libavcodec/x86/hevc/dsp_init.c
index 6966340c42..f1558b7e3e 100644
--- a/libavcodec/x86/hevc/dsp_init.c
+++ b/libavcodec/x86/hevc/dsp_init.c
@@ -877,10 +877,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_8_avx;
c->idct[1] = ff_hevc_idct_8x8_8_avx;
-
- c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
- c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
- c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
}
if (EXTERNAL_AVX2(cpu_flags)) {
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
--
2.49.1
>From 17526beaf2ea13fd7e1484e8af0ae44baee6f8cb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 08:49:38 +0100
Subject: [PATCH 2/3] avcodec/x86/hevc/add_res: Reduce number of registers used
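Use the residuals directly as memory operands of paddsw instead of
loading them into registers first; this lowers the number of XMM
registers declared by the SSE2 functions to five.
This makes these functions use only volatile registers (even on Win64,
where xmm6-xmm15 are callee-saved).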
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hevc/add_res.asm | 32 +++++++++++++++++---------------
1 file changed, 17 insertions(+), 15 deletions(-)
diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm
index 5d7115620f..8abfcab893 100644
--- a/libavcodec/x86/hevc/add_res.asm
+++ b/libavcodec/x86/hevc/add_res.asm
@@ -61,20 +61,16 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
movq m1, [r0+r2]
punpcklbw m0, m4
punpcklbw m1, m4
- mova m2, [r1]
- mova m3, [r1+16]
- paddsw m0, m2
- paddsw m1, m3
+ paddsw m0, [r1]
+ paddsw m1, [r1+16]
packuswb m0, m1
movq m2, [r0+r2*2]
movq m3, [r0+r3]
punpcklbw m2, m4
punpcklbw m3, m4
- mova m6, [r1+32]
- mova m7, [r1+48]
- paddsw m2, m6
- paddsw m3, m7
+ paddsw m2, [r1+32]
+ paddsw m3, [r1+48]
packuswb m2, m3
movq [r0], m0
@@ -88,27 +84,33 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
mova m2, m1
punpcklbw m1, m0
punpckhbw m2, m0
+%if cpuflag(avx2)
mova xm5, [r1+%1]
mova xm6, [r1+%1+16]
-%if cpuflag(avx2)
vinserti128 m5, m5, [r1+%1+32], 1
vinserti128 m6, m6, [r1+%1+48], 1
-%endif
paddsw m1, m5
paddsw m2, m6
+%else
+ paddsw m1, [r1+%1]
+ paddsw m2, [r1+%1+16]
+%endif
mova m3, [%3]
mova m4, m3
punpcklbw m3, m0
punpckhbw m4, m0
+%if cpuflag(avx2)
mova xm5, [r1+%1+mmsize*2]
mova xm6, [r1+%1+mmsize*2+16]
-%if cpuflag(avx2)
vinserti128 m5, m5, [r1+%1+96], 1
vinserti128 m6, m6, [r1+%1+112], 1
-%endif
paddsw m3, m5
paddsw m4, m6
+%else
+ paddsw m3, [r1+%1+mmsize*2]
+ paddsw m4, [r1+%1+mmsize*2+16]
+%endif
packuswb m1, m2
packuswb m3, m4
@@ -119,7 +121,7 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
INIT_XMM sse2
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_8_8, 3, 4, 8
+cglobal hevc_add_residual_8_8, 3, 4, 5
pxor m4, m4
lea r3, [r2*3]
ADD_RES_SSE_8_8
@@ -129,7 +131,7 @@ cglobal hevc_add_residual_8_8, 3, 4, 8
RET
; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_16_8, 3, 5, 7
+cglobal hevc_add_residual_16_8, 3, 5, 5
pxor m0, m0
lea r3, [r2*3]
mov r4d, 4
@@ -143,7 +145,7 @@ cglobal hevc_add_residual_16_8, 3, 5, 7
RET
; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_32_8, 3, 5, 7
+cglobal hevc_add_residual_32_8, 3, 5, 5
pxor m0, m0
mov r4d, 16
.loop:
--
2.49.1
>From 894f415b278a07c9afbe349697bde80bbdab4e11 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 09:58:13 +0100
Subject: [PATCH 3/3] avcodec/x86/hevc/add_res: Avoid unnecessary modification
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hevc/add_res.asm | 32 ++++++++++++++------------------
1 file changed, 14 insertions(+), 18 deletions(-)
diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm
index 8abfcab893..3489e04e2b 100644
--- a/libavcodec/x86/hevc/add_res.asm
+++ b/libavcodec/x86/hevc/add_res.asm
@@ -27,9 +27,9 @@ cextern pw_1023
%define max_pixels_10 pw_1023
; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
-%macro ADD_RES_MMX_4_8 0
- mova m0, [r1]
- mova m2, [r1+8]
+%macro ADD_RES_MMX_4_8 1
+ mova m0, [r1+%1]
+ mova m2, [r1+%1+8]
movd m1, [r0]
movd m3, [r0+r2]
@@ -50,27 +50,26 @@ INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
pxor m4, m4
- ADD_RES_MMX_4_8
- add r1, 16
+ ADD_RES_MMX_4_8 0
lea r0, [r0+r2*2]
- ADD_RES_MMX_4_8
+ ADD_RES_MMX_4_8 16
RET
-%macro ADD_RES_SSE_8_8 0
+%macro ADD_RES_SSE_8_8 1
movq m0, [r0]
movq m1, [r0+r2]
punpcklbw m0, m4
punpcklbw m1, m4
- paddsw m0, [r1]
- paddsw m1, [r1+16]
+ paddsw m0, [r1+%1]
+ paddsw m1, [r1+%1+16]
packuswb m0, m1
movq m2, [r0+r2*2]
movq m3, [r0+r3]
punpcklbw m2, m4
punpcklbw m3, m4
- paddsw m2, [r1+32]
- paddsw m3, [r1+48]
+ paddsw m2, [r1+%1+32]
+ paddsw m3, [r1+%1+48]
packuswb m2, m3
movq [r0], m0
@@ -124,10 +123,9 @@ INIT_XMM sse2
cglobal hevc_add_residual_8_8, 3, 4, 5
pxor m4, m4
lea r3, [r2*3]
- ADD_RES_SSE_8_8
- add r1, 64
+ ADD_RES_SSE_8_8 0
lea r0, [r0+r2*4]
- ADD_RES_SSE_8_8
+ ADD_RES_SSE_8_8 64
RET
; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
@@ -292,9 +290,8 @@ cglobal hevc_add_residual_4_10, 3, 3, 6
pxor m2, m2
mova m3, [max_pixels_10]
ADD_RES_MMX_4_10 r0, r2, r1
- add r1, 16
lea r0, [r0+2*r2]
- ADD_RES_MMX_4_10 r0, r2, r1
+ ADD_RES_MMX_4_10 r0, r2, r1+16
RET
INIT_XMM sse2
@@ -305,8 +302,7 @@ cglobal hevc_add_residual_8_10, 3, 4, 6
ADD_RES_SSE_8_10 r0, r2, r3, r1
lea r0, [r0+r2*4]
- add r1, 64
- ADD_RES_SSE_8_10 r0, r2, r3, r1
+ ADD_RES_SSE_8_10 r0, r2, r3, r1+64
RET
cglobal hevc_add_residual_16_10, 3, 5, 6
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-10-30 9:02 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-30 9:00 [FFmpeg-devel] [PATCH] avcodec/x86/hevc/add_res: Remove AVX add_residual functions (PR #20789) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git