[FFmpeg-devel] [PATCH] avcodec/x86/hevc/add_res: Remove AVX add_residual functions (PR #20789)

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH] avcodec/x86/hevc/add_res: Remove AVX add_residual functions (PR #20789)
@ 2025-10-30  9:00 mkver via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-10-30  9:00 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: mkver

PR #20789 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20789
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20789.patch


>From a0fa1c8e484f06cc9a9e2e3cfe53ec121fb74659 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 08:30:40 +0100
Subject: [PATCH 1/3] avcodec/x86/hevc/add_res: Remove AVX add_residual
 functions

Since e9abef437f0a348c017d4ac8b23a122881c1dc87 and
8b8492452d53293b2ac8c842877fadf7925fc950, the AVX and SSE2 functions
have been identical except for the VEX encoding of the AVX versions.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hevc/add_res.asm | 7 +------
 libavcodec/x86/hevc/dsp.h       | 4 ----
 libavcodec/x86/hevc/dsp_init.c  | 4 ----
 3 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm
index 3ecbd4269c..5d7115620f 100644
--- a/libavcodec/x86/hevc/add_res.asm
+++ b/libavcodec/x86/hevc/add_res.asm
@@ -117,7 +117,7 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
 %endmacro
 
 
-%macro TRANSFORM_ADD_8 0
+INIT_XMM sse2
 ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 cglobal hevc_add_residual_8_8, 3, 4, 8
     pxor              m4, m4
@@ -154,12 +154,7 @@ cglobal hevc_add_residual_32_8, 3, 5, 7
     dec                r4d
     jg .loop
     RET
-%endmacro
 
-INIT_XMM sse2
-TRANSFORM_ADD_8
-INIT_XMM avx
-TRANSFORM_ADD_8
 
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
diff --git a/libavcodec/x86/hevc/dsp.h b/libavcodec/x86/hevc/dsp.h
index 03986b970a..0062699ce0 100644
--- a/libavcodec/x86/hevc/dsp.h
+++ b/libavcodec/x86/hevc/dsp.h
@@ -172,10 +172,6 @@ void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 
-void ff_hevc_add_residual_8_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-
 void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 
 void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
diff --git a/libavcodec/x86/hevc/dsp_init.c b/libavcodec/x86/hevc/dsp_init.c
index 6966340c42..f1558b7e3e 100644
--- a/libavcodec/x86/hevc/dsp_init.c
+++ b/libavcodec/x86/hevc/dsp_init.c
@@ -877,10 +877,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 
             c->idct[0] = ff_hevc_idct_4x4_8_avx;
             c->idct[1] = ff_hevc_idct_8x8_8_avx;
-
-            c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
-            c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
-            c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
-- 
2.49.1


>From 17526beaf2ea13fd7e1484e8af0ae44baee6f8cb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 08:49:38 +0100
Subject: [PATCH 2/3] avcodec/x86/hevc/add_res: Reduce number of registers used

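Folding the residual loads into the paddsw memory operands lets these
functions get by with five XMM registers, so they use only volatile
registers (even on Win64, where xmm6-xmm15 are callee-saved).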

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hevc/add_res.asm | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm
index 5d7115620f..8abfcab893 100644
--- a/libavcodec/x86/hevc/add_res.asm
+++ b/libavcodec/x86/hevc/add_res.asm
@@ -61,20 +61,16 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
     movq              m1, [r0+r2]
     punpcklbw         m0, m4
     punpcklbw         m1, m4
-    mova              m2, [r1]
-    mova              m3, [r1+16]
-    paddsw            m0, m2
-    paddsw            m1, m3
+    paddsw            m0, [r1]
+    paddsw            m1, [r1+16]
     packuswb          m0, m1
 
     movq              m2, [r0+r2*2]
     movq              m3, [r0+r3]
     punpcklbw         m2, m4
     punpcklbw         m3, m4
-    mova              m6, [r1+32]
-    mova              m7, [r1+48]
-    paddsw            m2, m6
-    paddsw            m3, m7
+    paddsw            m2, [r1+32]
+    paddsw            m3, [r1+48]
     packuswb          m2, m3
 
     movq            [r0], m0
@@ -88,27 +84,33 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
     mova              m2, m1
     punpcklbw         m1, m0
     punpckhbw         m2, m0
+%if cpuflag(avx2)
     mova             xm5, [r1+%1]
     mova             xm6, [r1+%1+16]
-%if cpuflag(avx2)
     vinserti128       m5, m5, [r1+%1+32], 1
     vinserti128       m6, m6, [r1+%1+48], 1
-%endif
     paddsw            m1, m5
     paddsw            m2, m6
+%else
+    paddsw            m1, [r1+%1]
+    paddsw            m2, [r1+%1+16]
+%endif
 
     mova              m3, [%3]
     mova              m4, m3
     punpcklbw         m3, m0
     punpckhbw         m4, m0
+%if cpuflag(avx2)
     mova             xm5, [r1+%1+mmsize*2]
     mova             xm6, [r1+%1+mmsize*2+16]
-%if cpuflag(avx2)
     vinserti128       m5, m5, [r1+%1+96], 1
     vinserti128       m6, m6, [r1+%1+112], 1
-%endif
     paddsw            m3, m5
     paddsw            m4, m6
+%else
+    paddsw            m3, [r1+%1+mmsize*2]
+    paddsw            m4, [r1+%1+mmsize*2+16]
+%endif
 
     packuswb          m1, m2
     packuswb          m3, m4
@@ -119,7 +121,7 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
 
 INIT_XMM sse2
 ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_8_8, 3, 4, 8
+cglobal hevc_add_residual_8_8, 3, 4, 5
     pxor              m4, m4
     lea               r3, [r2*3]
     ADD_RES_SSE_8_8
@@ -129,7 +131,7 @@ cglobal hevc_add_residual_8_8, 3, 4, 8
     RET
 
 ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_16_8, 3, 5, 7
+cglobal hevc_add_residual_16_8, 3, 5, 5
     pxor                m0, m0
     lea                 r3, [r2*3]
     mov                r4d, 4
@@ -143,7 +145,7 @@ cglobal hevc_add_residual_16_8, 3, 5, 7
     RET
 
 ; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_32_8, 3, 5, 7
+cglobal hevc_add_residual_32_8, 3, 5, 5
     pxor                m0, m0
     mov                r4d, 16
 .loop:
-- 
2.49.1


>From 894f415b278a07c9afbe349697bde80bbdab4e11 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 09:58:13 +0100
Subject: [PATCH 3/3] avcodec/x86/hevc/add_res: Avoid unnecessary modification

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hevc/add_res.asm | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm
index 8abfcab893..3489e04e2b 100644
--- a/libavcodec/x86/hevc/add_res.asm
+++ b/libavcodec/x86/hevc/add_res.asm
@@ -27,9 +27,9 @@ cextern pw_1023
 %define max_pixels_10 pw_1023
 
 ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
-%macro ADD_RES_MMX_4_8 0
-    mova              m0, [r1]
-    mova              m2, [r1+8]
+%macro ADD_RES_MMX_4_8 1
+    mova              m0, [r1+%1]
+    mova              m2, [r1+%1+8]
 
     movd              m1, [r0]
     movd              m3, [r0+r2]
@@ -50,27 +50,26 @@ INIT_MMX mmxext
 ; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 cglobal hevc_add_residual_4_8, 3, 3, 6
     pxor              m4, m4
-    ADD_RES_MMX_4_8
-    add               r1, 16
+    ADD_RES_MMX_4_8    0
     lea               r0, [r0+r2*2]
-    ADD_RES_MMX_4_8
+    ADD_RES_MMX_4_8   16
     RET
 
-%macro ADD_RES_SSE_8_8 0
+%macro ADD_RES_SSE_8_8 1
     movq              m0, [r0]
     movq              m1, [r0+r2]
     punpcklbw         m0, m4
     punpcklbw         m1, m4
-    paddsw            m0, [r1]
-    paddsw            m1, [r1+16]
+    paddsw            m0, [r1+%1]
+    paddsw            m1, [r1+%1+16]
     packuswb          m0, m1
 
     movq              m2, [r0+r2*2]
     movq              m3, [r0+r3]
     punpcklbw         m2, m4
     punpcklbw         m3, m4
-    paddsw            m2, [r1+32]
-    paddsw            m3, [r1+48]
+    paddsw            m2, [r1+%1+32]
+    paddsw            m3, [r1+%1+48]
     packuswb          m2, m3
 
     movq            [r0], m0
@@ -124,10 +123,9 @@ INIT_XMM sse2
 cglobal hevc_add_residual_8_8, 3, 4, 5
     pxor              m4, m4
     lea               r3, [r2*3]
-    ADD_RES_SSE_8_8
-    add               r1, 64
+    ADD_RES_SSE_8_8    0
     lea               r0, [r0+r2*4]
-    ADD_RES_SSE_8_8
+    ADD_RES_SSE_8_8   64
     RET
 
 ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
@@ -292,9 +290,8 @@ cglobal hevc_add_residual_4_10, 3, 3, 6
     pxor              m2, m2
     mova              m3, [max_pixels_10]
     ADD_RES_MMX_4_10  r0, r2, r1
-    add               r1, 16
     lea               r0, [r0+2*r2]
-    ADD_RES_MMX_4_10  r0, r2, r1
+    ADD_RES_MMX_4_10  r0, r2, r1+16
     RET
 
 INIT_XMM sse2
@@ -305,8 +302,7 @@ cglobal hevc_add_residual_8_10, 3, 4, 6
 
     ADD_RES_SSE_8_10  r0, r2, r3, r1
     lea               r0, [r0+r2*4]
-    add               r1, 64
-    ADD_RES_SSE_8_10  r0, r2, r3, r1
+    ADD_RES_SSE_8_10  r0, r2, r3, r1+64
     RET
 
 cglobal hevc_add_residual_16_10, 3, 5, 6
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-10-30  9:02 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-30  9:00 [FFmpeg-devel] [PATCH] avcodec/x86/hevc/add_res: Remove AVX add_residual functions (PR #20789) mkver via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git