* [FFmpeg-devel] [PATCH 1/4] avcodec/hpeldsp: Remove duplicate pel functions
@ 2025-05-30 15:09 Andreas Rheinhardt
2025-06-01 20:31 ` Andreas Rheinhardt
0 siblings, 1 reply; 2+ messages in thread
From: Andreas Rheinhardt @ 2025-05-30 15:09 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 29 bytes --]
Patches attached.
- Andreas
[-- Attachment #2: 0001-avcodec-hpeldsp-Remove-duplicate-pel-functions.patch --]
[-- Type: text/x-patch, Size: 1687 bytes --]
From a33bb0298c0eaf7816c36be7c5558cfcc5a3f37c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 30 May 2025 12:40:16 +0200
Subject: [PATCH 1/4] avcodec/hpeldsp: Remove duplicate pel functions
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/hpeldsp.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index 80494c9749..db0e02ee93 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -314,9 +314,6 @@ CALL_2X_PIXELS(OPNAME ## _pixels16_y2_8_c, \
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_8_c, \
OPNAME ## _pixels8_xy2_8_c, \
8) \
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_8_c, \
- OPNAME ## _pixels8_8_c, \
- 8) \
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_8_c, \
OPNAME ## _no_rnd_pixels8_x2_8_c, \
8) \
@@ -330,6 +327,8 @@ CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_8_c, \
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b
#define put_no_rnd_pixels8_8_c put_pixels8_8_c
+#define put_no_rnd_pixels16_8_c put_pixels16_8_c
+#define avg_no_rnd_pixels16_8_c avg_pixels16_8_c
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
--
2.45.2
[-- Attachment #3: 0002-avcodec-x86-hpeldsp_init-Use-ff_avg_pixels16_mmxext.patch --]
[-- Type: text/x-patch, Size: 1880 bytes --]
From 380cf96fd94da3a47e9fd008213b1f7d9e6e454d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 30 May 2025 12:55:54 +0200
Subject: [PATCH 2/4] avcodec/x86/hpeldsp_init: Use ff_avg_pixels16_mmxext
avg_pixels_tab[0][0] does the same as avg_no_rnd_pixels_tab[0].
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hpeldsp_init.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 4a0513d06d..12edcd9e83 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -122,7 +122,6 @@ CALL_2X_PIXELS(put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
- CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \
CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \
@@ -170,7 +169,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
- c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
--
2.45.2
[-- Attachment #4: 0003-avcodec-hpeldsp_init-Detemplatize.patch --]
[-- Type: text/x-patch, Size: 25781 bytes --]
From 268232ef14099c1a46e753840491c94040696532 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 30 May 2025 13:54:50 +0200
Subject: [PATCH 3/4] avcodec/hpeldsp_init: Detemplatize
Since a51279bbdea0d6db920d71980262bccd0ce78226,
hpeldsp_rnd_template.c was only included once and
one of the two functions in rnd_template.c was
only enabled once.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/hpeldsp_init.c | 258 +++++++++++++++++++++++++-
libavcodec/x86/hpeldsp_rnd_template.c | 202 --------------------
libavcodec/x86/rnd_template.c | 79 --------
3 files changed, 252 insertions(+), 287 deletions(-)
delete mode 100644 libavcodec/x86/hpeldsp_rnd_template.c
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 12edcd9e83..6b2ad4494b 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -22,6 +22,9 @@
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
+#include <stddef.h>
+#include <stdint.h>
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
@@ -74,19 +77,263 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
-#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define STATIC static
#include "rnd_template.c"
-#include "hpeldsp_rnd_template.c"
#undef DEF
#undef SET_RND
-#undef PAVGBP
-#undef PAVGB
#undef STATIC
+// this routine is 'slightly' suboptimal but mostly unused
+static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ MOVQ_WONE(mm6); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
+ "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
+
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
+ "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :FF_REG_a, "memory");
+}
+
+static void put_no_rnd_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"FF_REG_a" \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"FF_REG_a", %1 \n\t"
+ "add %%"FF_REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"FF_REG_a", %1 \n\t"
+ "add %%"FF_REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :FF_REG_a, "memory");
+}
+
+static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"FF_REG_a" \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm2 \n\t"
+ "movq 9(%1, %3), %%mm3 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"FF_REG_a", %1 \n\t"
+ "add %%"FF_REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm2 \n\t"
+ "movq 9(%1, %3), %%mm3 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"FF_REG_a", %1 \n\t"
+ "add %%"FF_REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :FF_REG_a, "memory");
+}
+
+static void put_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"FF_REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"FF_REG_a"),%%mm2\n\t"
+ PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"FF_REG_a", %1 \n\t"
+ "add %%"FF_REG_a", %2 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"FF_REG_a"),%%mm0\n\t"
+ PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"FF_REG_a", %1 \n\t"
+ "add %%"FF_REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :FF_REG_a, "memory");
+}
+
+static void avg_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%2), %%mm3 \n\t"
+ PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, (%2) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, 8(%2) \n\t"
+ "add %3, %1 \n\t"
+ "add %3, %2 \n\t"
+ "subl $1, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :"memory");
+}
+
+static void avg_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"FF_REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq (%2), %%mm3 \n\t"
+ PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
+ "movq (%2, %3), %%mm3 \n\t"
+ PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"FF_REG_a", %1 \n\t"
+ "add %%"FF_REG_a", %2 \n\t"
+
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq (%2), %%mm3 \n\t"
+ PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
+ "movq (%2, %3), %%mm3 \n\t"
+ PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"FF_REG_a", %1 \n\t"
+ "add %%"FF_REG_a", %2 \n\t"
+
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :FF_REG_a, "memory");
+}
+
#if HAVE_MMX
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
@@ -101,7 +348,6 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#define SET_RND MOVQ_WTWO
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define STATIC
-#define NO_AVG
#include "rnd_template.c"
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
deleted file mode 100644
index 2bff2d2766..0000000000
--- a/libavcodec/x86/hpeldsp_rnd_template.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
- * and improved by Zdenek Kabelac <kabi@users.sf.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-// put_pixels
-av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
-av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm2 \n\t"
- "movq 9(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm2 \n\t"
- "movq 9(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
-av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"),%%mm2\n\t"
- PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"),%%mm0\n\t"
- PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
-av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%2), %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, (%2) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, 8(%2) \n\t"
- "add %3, %1 \n\t"
- "add %3, %2 \n\t"
- "subl $1, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :"memory");
-}
-
-av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
- PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
- "movq (%2), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
- "movq (%2, %3), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
-
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
- PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
- "movq (%2), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
- "movq (%2, %3), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
- "movq %%mm2, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
-
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
index b825eeba6e..4590aeddf0 100644
--- a/libavcodec/x86/rnd_template.c
+++ b/libavcodec/x86/rnd_template.c
@@ -96,82 +96,3 @@ av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixel
:"D"(block), "r"((x86_reg)line_size)
:FF_REG_a, "memory");
}
-
-#ifndef NO_AVG
-// avg_pixels
-// this routine is 'slightly' suboptimal but mostly unused
-av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_ZERO(mm7);
- SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm4 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
- "add %3, %1 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddusw %%mm2, %%mm0 \n\t"
- "paddusw %%mm3, %%mm1 \n\t"
- "paddusw %%mm6, %%mm4 \n\t"
- "paddusw %%mm6, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "psrlw $2, %%mm4 \n\t"
- "psrlw $2, %%mm5 \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "pcmpeqd %%mm2, %%mm2 \n\t"
- "paddb %%mm2, %%mm2 \n\t"
- PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
- "movq %%mm5, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
-
- "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm2, %%mm4 \n\t"
- "paddusw %%mm3, %%mm5 \n\t"
- "paddusw %%mm6, %%mm0 \n\t"
- "paddusw %%mm6, %%mm1 \n\t"
- "paddusw %%mm4, %%mm0 \n\t"
- "paddusw %%mm5, %%mm1 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm1 \n\t"
- "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "pcmpeqd %%mm2, %%mm2 \n\t"
- "paddb %%mm2, %%mm2 \n\t"
- PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
- "movq %%mm1, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
-
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels)
- :"D"(block), "r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-#endif
--
2.45.2
[-- Attachment #5: 0004-avcodec-vc1dsp-Fix-vc1op_pixels_func-semantics.patch --]
[-- Type: text/x-patch, Size: 1020 bytes --]
From 1754b6a8aba861c2962811a2d1b7f87ec3e130b2 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Fri, 30 May 2025 14:56:57 +0200
Subject: [PATCH 4/4] avcodec/vc1dsp: Fix vc1op_pixels_func semantics
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/vc1dsp.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index e3b90d2b62..b018537af3 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -30,7 +30,9 @@
#include "hpeldsp.h"
#include "h264chroma.h"
-typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, ptrdiff_t line_size, int h);
+typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/,
+ const uint8_t *pixels/*align 1*/,
+ ptrdiff_t line_size, int round);
typedef struct VC1DSPContext {
/* vc1 functions */
--
2.45.2
[-- Attachment #6: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/4] avcodec/hpeldsp: Remove duplicate pel functions
2025-05-30 15:09 [FFmpeg-devel] [PATCH 1/4] avcodec/hpeldsp: Remove duplicate pel functions Andreas Rheinhardt
@ 2025-06-01 20:31 ` Andreas Rheinhardt
0 siblings, 0 replies; 2+ messages in thread
From: Andreas Rheinhardt @ 2025-06-01 20:31 UTC (permalink / raw)
To: ffmpeg-devel
Andreas Rheinhardt:
> Patches attached.
>
> - Andreas
>
Will apply this patchset tomorrow unless there are objections.
- Andreas
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-06-01 20:32 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-30 15:09 [FFmpeg-devel] [PATCH 1/4] avcodec/hpeldsp: Remove duplicate pel functions Andreas Rheinhardt
2025-06-01 20:31 ` Andreas Rheinhardt
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git