Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PR] avcodec/x86/bswapdsp: Minor improvements (PR #22307)
@ 2026-02-27 13:03 mkver via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2026-02-27 13:03 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: mkver

PR #22307 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22307
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22307.patch


>From 451d53eb3db21189d9ca66a3a3b6684eb8e34efb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 27 Feb 2026 13:19:47 +0100
Subject: [PATCH 1/3] avcodec/x86/bswapdsp: Avoid register copies

No change in benchmarks here.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/bswapdsp.asm | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 31c6c48a21..12fd494ffe 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -33,10 +33,10 @@ SECTION .text
 ; %1 = aligned/unaligned
 %macro BSWAP_LOOPS  1
     mov      r3d, r2d
-    sar      r2d, 3
+    sar      r3d, 3
     jz       .left4_%1
 %if cpuflag(avx2)
-    sar      r2d, 1
+    sar      r3d, 1
     jz       .left8_%1
 %endif
 .loop8_%1:
@@ -65,12 +65,11 @@ SECTION .text
 %endif
     add      r0, mmsize*2
     add      r1, mmsize*2
-    dec      r2d
+    dec      r3d
     jnz      .loop8_%1
 %if cpuflag(avx2)
 .left8_%1:
-    mov      r2d, r3d
-    test     r3d, 8
+    test     r2d, 8
     jz       .left4_%1
     mov%1    m0, [r1]
     pshufb   m0, m2
@@ -79,8 +78,7 @@ SECTION .text
     add r0, mmsize
 %endif
 .left4_%1:
-    mov      r2d, r3d
-    test     r3d, 4
+    test     r2d, 4
     jz       .left
     mov%1    xm0, [r1]
 %if cpuflag(ssse3)
-- 
2.52.0


>From 3db6adc772ebfadf0537390740883ab6feed2841 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 27 Feb 2026 13:24:04 +0100
Subject: [PATCH 2/3] avcodec/x86/bswapdsp: combine shifting, avoid check for
 AVX2

This avoids a check and a shift if >=8 elements are processed;
it adds a check if < 8 elements are processed (which should
be rare).
No change in benchmarks here.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/bswapdsp.asm | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 12fd494ffe..f89ca76cf1 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -33,11 +33,12 @@ SECTION .text
 ; %1 = aligned/unaligned
 %macro BSWAP_LOOPS  1
     mov      r3d, r2d
+%if cpuflag(avx2)
+    sar      r3d, 4
+    jz       .left8_%1
+%else
     sar      r3d, 3
     jz       .left4_%1
-%if cpuflag(avx2)
-    sar      r3d, 1
-    jz       .left8_%1
 %endif
 .loop8_%1:
     mov%1    m0, [r1 +  0]
-- 
2.52.0


>From 311a587c7f2b90f54a04bb19505736cf9f304a48 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 27 Feb 2026 13:54:21 +0100
Subject: [PATCH 3/3] avcodec/x86/bswapdsp: Avoid aligned vs unaligned
 codepaths for AVX2

For modern CPUs (like those supporting AVX2) loads and stores
using the unaligned versions of instructions are as fast
as aligned ones if the address is aligned, so remove
the aligned AVX2 version (and the alignment check) and just
use the unaligned one.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/bswapdsp.asm | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index f89ca76cf1..2b80d8a75e 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -100,10 +100,15 @@ SECTION .text
 
 ; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
 %macro BSWAP32_BUF 0
-%if cpuflag(ssse3)||cpuflag(avx2)
+%if cpuflag(avx2)
+cglobal bswap32_buf, 3,4,3
+    vbroadcasti128  m2, [pb_bswap32]
+    BSWAP_LOOPS  u
+%else
+%if cpuflag(ssse3)
 cglobal bswap32_buf, 3,4,3
     mov      r3, r1
-    VBROADCASTI128  m2, [pb_bswap32]
+    mova     m2, [pb_bswap32]
 %else
 cglobal bswap32_buf, 3,4,5
     mov      r3, r1
@@ -115,6 +120,7 @@ cglobal bswap32_buf, 3,4,5
     jmp      .left
 .start_align:
     BSWAP_LOOPS  a
+%endif
 .left:
 %if cpuflag(ssse3)
     test     r2d, 2
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2026-02-27 13:04 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-27 13:03 [FFmpeg-devel] [PR] avcodec/x86/bswapdsp: Minor improvements (PR #22307) mkver via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git