Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: mkver <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] avcodec/x86/hpeldsp: Don't use saturated addition when unnecessary (PR #20791)
Date: Thu, 30 Oct 2025 11:02:06 -0000
Message-ID: <176182212706.81.6416319141458007086@7d278768979e> (raw)

PR #20791 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20791
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20791.patch


>From 50f2e0e7ba41e4aedf36244d63c42a1381fc0336 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 10:27:00 +0100
Subject: [PATCH 1/3] avcodec/x86/hpeldsp: Actually use constants in registers

Forgotten in 36f92206bb90d6f0268749bd6fe6aa57974442db.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hpeldsp.asm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 2587e3c315..0974286b0d 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -428,7 +428,7 @@ cglobal %1%3_pixels8_xy2, 4,5,5
     psrlw       m2, 2
 %else
     paddusw     m2, m0
-    pmulhrsw    m2, [pw_8192]
+    pmulhrsw    m2, m3
 %endif
 %ifidn %1, avg
     movh        m1, [r0+r4]
@@ -450,7 +450,7 @@ cglobal %1%3_pixels8_xy2, 4,5,5
     psrlw       m0, 2
 %else
     paddusw     m0, m2
-    pmulhrsw    m0, [pw_8192]
+    pmulhrsw    m0, m3
 %endif
 %ifidn %1, avg
     movh        m1, [r0+r4]
-- 
2.49.1


>From a84ea10f93fbb66530eaa5ebb6f0275203d18356 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 10:44:41 +0100
Subject: [PATCH 2/3] avcodec/x86/hpeldsp: Don't use saturated addition when
 unnecessary

The numbers here are small (sums of values unpacked from bytes), so the
additions can never overflow a 16-bit word and saturation never triggers.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hpeldsp.asm | 48 +++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 0974286b0d..c92c70f5ad 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -423,11 +423,11 @@ cglobal %1%3_pixels8_xy2, 4,5,5
     punpcklbw   m0, m1
     pmaddubsw   m0, m4
 %ifidn %3, _no_rnd
-    paddusw     m2, m3
-    paddusw     m2, m0
+    paddw       m2, m3
+    paddw       m2, m0
     psrlw       m2, 2
 %else
-    paddusw     m2, m0
+    paddw       m2, m0
     pmulhrsw    m2, m3
 %endif
 %ifidn %1, avg
@@ -445,11 +445,11 @@ cglobal %1%3_pixels8_xy2, 4,5,5
     punpcklbw   m2, m1
     pmaddubsw   m2, m4
 %ifidn %3, _no_rnd
-    paddusw     m0, m3
-    paddusw     m0, m2
+    paddw       m0, m3
+    paddw       m0, m2
     psrlw       m0, 2
 %else
-    paddusw     m0, m2
+    paddw       m0, m2
     pmulhrsw    m0, m3
 %endif
 %ifidn %1, avg
@@ -485,8 +485,8 @@ cglobal %1%3_pixels16_xy2, 4,5,8
     punpcklbw   m4, m7
     punpckhbw   m1, m7
     punpckhbw   m5, m7
-    paddusw     m4, m0
-    paddusw     m5, m1
+    paddw       m4, m0
+    paddw       m5, m1
     xor         r4, r4
     add         r1, r2
 .loop:
@@ -498,12 +498,12 @@ cglobal %1%3_pixels16_xy2, 4,5,8
     punpcklbw   m2, m7
     punpckhbw   m1, m7
     punpckhbw   m3, m7
-    paddusw     m0, m2
-    paddusw     m1, m3
-    paddusw     m4, m6
-    paddusw     m5, m6
-    paddusw     m4, m0
-    paddusw     m5, m1
+    paddw       m0, m2
+    paddw       m1, m3
+    paddw       m4, m6
+    paddw       m5, m6
+    paddw       m4, m0
+    paddw       m5, m1
     psrlw       m4, 2
     psrlw       m5, 2
 %ifidn %1, avg
@@ -524,12 +524,12 @@ cglobal %1%3_pixels16_xy2, 4,5,8
     punpcklbw   m4, m7
     punpckhbw   m3, m7
     punpckhbw   m5, m7
-    paddusw     m4, m2
-    paddusw     m5, m3
-    paddusw     m0, m6
-    paddusw     m1, m6
-    paddusw     m0, m4
-    paddusw     m1, m5
+    paddw       m4, m2
+    paddw       m5, m3
+    paddw       m0, m6
+    paddw       m1, m6
+    paddw       m0, m4
+    paddw       m1, m5
     psrlw       m0, 2
     psrlw       m1, 2
 %ifidn %1, avg
@@ -567,8 +567,8 @@ cglobal %1_pixels16_xy2, 4,5,%2
     movu        m3, [r1+r4+1]
     pmaddubsw   m2, m5
     pmaddubsw   m3, m5
-    paddusw     m0, m2
-    paddusw     m1, m3
+    paddw       m0, m2
+    paddw       m1, m3
     pmulhrsw    m0, [pw_8192]
     pmulhrsw    m1, [pw_8192]
 %ifidn %1, avg
@@ -587,8 +587,8 @@ cglobal %1_pixels16_xy2, 4,5,%2
     movu        m1, [r1+r4+1]
     pmaddubsw   m0, m5
     pmaddubsw   m1, m5
-    paddusw     m2, m0
-    paddusw     m3, m1
+    paddw       m2, m0
+    paddw       m3, m1
     pmulhrsw    m2, [pw_8192]
     pmulhrsw    m3, [pw_8192]
 %ifidn %1, avg
-- 
2.49.1


>From 88f4641db2d488308f04b70cba9f285d30da6eb5 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 11:07:43 +0100
Subject: [PATCH 3/3] avcodec/x86/hpeldsp: Don't use PAVGB macro

It was only needed for MMX and there are no MMX functions here any more.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/hpeldsp.asm | 84 +++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index c92c70f5ad..cbdf0e460d 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -54,8 +54,8 @@ cglobal put_pixels8_x2, 4,5
     pavgb        m0, m2
     pavgb        m1, m3
 %else
-    PAVGB        m0, [r1]
-    PAVGB        m1, [r1+r2]
+    pavgb        m0, [r1]
+    pavgb        m1, [r1+r2]
 %endif
     mova       [r0], m0
     mova    [r0+r2], m1
@@ -69,8 +69,8 @@ cglobal put_pixels8_x2, 4,5
     pavgb        m0, m2
     pavgb        m1, m3
 %else
-    PAVGB        m0, [r1]
-    PAVGB        m1, [r1+r2]
+    pavgb        m0, [r1]
+    pavgb        m1, [r1+r2]
 %endif
     add          r1, r4
     mova       [r0], m0
@@ -103,8 +103,8 @@ cglobal put_no_rnd_pixels8_x2, 4,5
     add          r1, r4
     psubusb      m0, m6
     psubusb      m2, m6
-    PAVGB        m0, m1
-    PAVGB        m2, m3
+    pavgb        m0, m1
+    pavgb        m2, m3
     mova       [r0], m0
     mova    [r0+r2], m2
     mova         m0, [r1]
@@ -115,8 +115,8 @@ cglobal put_no_rnd_pixels8_x2, 4,5
     add          r1, r4
     psubusb      m0, m6
     psubusb      m2, m6
-    PAVGB        m0, m1
-    PAVGB        m2, m3
+    pavgb        m0, m1
+    pavgb        m2, m3
     mova       [r0], m0
     mova    [r0+r2], m2
     add          r0, r4
@@ -143,8 +143,8 @@ cglobal %1_no_rnd_pixels8_x2_exact, 4,5
     pxor         m2, m4
     pxor         m1, m4
     pxor         m3, m4
-    PAVGB        m0, m1
-    PAVGB        m2, m3
+    pavgb        m0, m1
+    pavgb        m2, m3
     pxor         m0, m4
     pxor         m2, m4
 %ifidn %1, avg
@@ -161,8 +161,8 @@ cglobal %1_no_rnd_pixels8_x2_exact, 4,5
     pxor         m1, m4
     pxor         m2, m4
     pxor         m3, m4
-    PAVGB        m0, m1
-    PAVGB        m2, m3
+    pavgb        m0, m1
+    pavgb        m2, m3
     pxor         m0, m4
     pxor         m2, m4
 %ifidn %1, avg
@@ -198,16 +198,16 @@ cglobal put_pixels8_y2, 4,5
     movu         m1, [r1+r2]
     movu         m2, [r1+r4]
     add          r1, r4
-    PAVGB        m0, m1
-    PAVGB        m1, m2
+    pavgb        m0, m1
+    pavgb        m1, m2
     mova    [r0+r2], m0
     mova    [r0+r4], m1
     movu         m1, [r1+r2]
     movu         m0, [r1+r4]
     add          r0, r4
     add          r1, r4
-    PAVGB        m2, m1
-    PAVGB        m1, m0
+    pavgb        m2, m1
+    pavgb        m1, m0
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -235,8 +235,8 @@ cglobal put_no_rnd_pixels8_y2, 4,5
     mova         m2, [r1+r4]
     add          r1, r4
     psubusb      m1, m6
-    PAVGB        m0, m1
-    PAVGB        m1, m2
+    pavgb        m0, m1
+    pavgb        m1, m2
     mova    [r0+r2], m0
     mova    [r0+r4], m1
     mova         m1, [r1+r2]
@@ -244,8 +244,8 @@ cglobal put_no_rnd_pixels8_y2, 4,5
     add          r0, r4
     add          r1, r4
     psubusb      m1, m6
-    PAVGB        m2, m1
-    PAVGB        m1, m0
+    pavgb        m2, m1
+    pavgb        m1, m0
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -271,8 +271,8 @@ cglobal %1_no_rnd_pixels8_y2_exact, 4,5
     movu         m2, [r1+r2]
     pxor         m1, m3
     pxor         m2, m3
-    PAVGB        m0, m1
-    PAVGB        m1, m2
+    pavgb        m0, m1
+    pavgb        m1, m2
     pxor         m0, m3
     pxor         m1, m3
 %ifidn %1, avg
@@ -285,8 +285,8 @@ cglobal %1_no_rnd_pixels8_y2_exact, 4,5
     movu         m0, [r1+r4]
     pxor         m1, m3
     pxor         m0, m3
-    PAVGB        m2, m1
-    PAVGB        m1, m0
+    pavgb        m2, m1
+    pavgb        m1, m0
     pxor         m2, m3
     pxor         m1, m3
 %ifidn %1, avg
@@ -325,11 +325,11 @@ cglobal avg_pixels8_x2, 4,5
     pavgb        m0, m1
     pavgb        m2, m3
 %else
-    PAVGB        m0, [r1+1], m3, m5
-    PAVGB        m2, [r1+r2+1], m4, m5
+    pavgb        m0, [r1+1]
+    pavgb        m2, [r1+r2+1]
 %endif
-    PAVGB        m0, [r0], m3, m5
-    PAVGB        m2, [r0+r2], m4, m5
+    pavgb        m0, [r0]
+    pavgb        m2, [r0+r2]
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m2
@@ -341,13 +341,13 @@ cglobal avg_pixels8_x2, 4,5
     pavgb        m0, m1
     pavgb        m2, m3
 %else
-    PAVGB        m0, [r1+1], m3, m5
-    PAVGB        m2, [r1+r2+1], m4, m5
+    pavgb        m0, [r1+1]
+    pavgb        m2, [r1+r2+1]
 %endif
     add          r0, r4
     add          r1, r4
-    PAVGB        m0, [r0], m3, m5
-    PAVGB        m2, [r0+r2], m4, m5
+    pavgb        m0, [r0]
+    pavgb        m2, [r0+r2]
     mova       [r0], m0
     mova    [r0+r2], m2
     add          r0, r4
@@ -377,20 +377,20 @@ cglobal avg_pixels8_y2, 4,5
     movu         m1, [r1+r2]
     movu         m2, [r1+r4]
     add          r1, r4
-    PAVGB        m0, m1
-    PAVGB        m1, m2
-    PAVGB        m0, [r0+r2]
-    PAVGB        m1, [r0+r4]
+    pavgb        m0, m1
+    pavgb        m1, m2
+    pavgb        m0, [r0+r2]
+    pavgb        m1, [r0+r4]
     mova    [r0+r2], m0
     mova    [r0+r4], m1
     movu         m1, [r1+r2]
     movu         m0, [r1+r4]
-    PAVGB        m2, m1
-    PAVGB        m1, m0
+    pavgb        m2, m1
+    pavgb        m1, m0
     add          r0, r4
     add          r1, r4
-    PAVGB        m2, [r0+r2]
-    PAVGB        m1, [r0+r4]
+    pavgb        m2, [r0+r2]
+    pavgb        m1, [r0+r4]
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -509,7 +509,7 @@ cglobal %1%3_pixels16_xy2, 4,5,8
 %ifidn %1, avg
     mova        m3, [r0+r4]
     packuswb    m4, m5
-    PAVGB       m4, m3
+    pavgb       m4, m3
 %else
     packuswb    m4, m5
 %endif
@@ -535,7 +535,7 @@ cglobal %1%3_pixels16_xy2, 4,5,8
 %ifidn %1, avg
     mova        m3, [r0+r4]
     packuswb    m0, m1
-    PAVGB       m0, m3
+    pavgb       m0, m3
 %else
     packuswb    m0, m1
 %endif
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

                 reply	other threads:[~2025-10-30 11:02 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=176182212706.81.6416319141458007086@7d278768979e \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=code@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git