[FFmpeg-devel] [PATCH] h264chroma (PR #20813)

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH] h264chroma (PR #20813)
@ 2025-11-01 11:10 mkver via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-11-01 11:10 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: mkver

PR #20813 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20813
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20813.patch


>From f3d54991d9243bd5065d22c4424628ff8972f1c4 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 15:40:02 +0100
Subject: [PATCH 1/4] avcodec/h264chroma: Move mc1 function to mpegvideo_dec.c

It is only used by mpegvideo decoders (for lowres). It is also only used
for bitdepth == 8, so don't build the bitdepth == 16 function at all any
more.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/h264chroma.c          |  2 --
 libavcodec/h264chroma_template.c | 34 --------------------------
 libavcodec/mpegvideo_dec.c       | 41 ++++++++++++++++++++++++++++++++
 tests/checkasm/h264chroma.c      |  2 +-
 4 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index 1eeab7bc40..5000c89aa7 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -32,11 +32,9 @@
     c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_ ## depth ## _c; \
     c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_ ## depth ## _c; \
     c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_ ## depth ## _c; \
-    c->put_h264_chroma_pixels_tab[3] = put_h264_chroma_mc1_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_ ## depth ## _c; \
     c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_ ## depth ## _c; \
-    c->avg_h264_chroma_pixels_tab[3] = avg_h264_chroma_mc1_ ## depth ## _c; \
 
 av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
 {
diff --git a/libavcodec/h264chroma_template.c b/libavcodec/h264chroma_template.c
index b9d24f5a0c..b58be192cd 100644
--- a/libavcodec/h264chroma_template.c
+++ b/libavcodec/h264chroma_template.c
@@ -26,40 +26,6 @@
 #include "bit_depth_template.c"
 
 #define H264_CHROMA_MC(OPNAME, OP)\
-static void FUNCC(OPNAME ## h264_chroma_mc1)(uint8_t *_dst /*align 8*/, const uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y){\
-    pixel *dst = (pixel*)_dst;\
-    const pixel *src = (const pixel*)_src;\
-    const int A=(8-x)*(8-y);\
-    const int B=(  x)*(8-y);\
-    const int C=(8-x)*(  y);\
-    const int D=(  x)*(  y);\
-    int i;\
-    stride >>= sizeof(pixel)-1;\
-    \
-    av_assert2(x<8 && y<8 && x>=0 && y>=0);\
-\
-    if(D){\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    } else if (B + C) {\
-        const int E= B+C;\
-        const int step= C ? stride : 1;\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0] + E*src[step+0]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    } else {\
-        for(i=0; i<h; i++){\
-            OP(dst[0], (A*src[0]));\
-            dst+= stride;\
-            src+= stride;\
-        }\
-    }\
-}\
 static void FUNCC(OPNAME ## h264_chroma_mc2)(uint8_t *_dst /*align 8*/, const uint8_t *_src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
 {\
     pixel *dst = (pixel*)_dst;\
diff --git a/libavcodec/mpegvideo_dec.c b/libavcodec/mpegvideo_dec.c
index 85e24c667e..ad27180efd 100644
--- a/libavcodec/mpegvideo_dec.c
+++ b/libavcodec/mpegvideo_dec.c
@@ -44,6 +44,45 @@
 #include "threadprogress.h"
 #include "wmv2dec.h"
 
+#define H264_CHROMA_MC(OPNAME, OP)\
+static void OPNAME ## h264_chroma_mc1(uint8_t *dst /*align 8*/, const uint8_t *src /*align 1*/, ptrdiff_t stride, int h, int x, int y)\
+{\
+    const int A = (8-x) * (8-y);\
+    const int B = (  x) * (8-y);\
+    const int C = (8-x) * (  y);\
+    const int D = (  x) * (  y);\
+    \
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);\
+\
+    if (D) {\
+        for (int i = 0; i < h; ++i) {\
+            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
+            dst += stride;\
+            src += stride;\
+        }\
+    } else if (B + C) {\
+        const int E    = B + C;\
+        const int step = C ? stride : 1;\
+        for (int i = 0; i < h; ++i) {\
+            OP(dst[0], (A*src[0] + E*src[step+0]));\
+            dst += stride;\
+            src += stride;\
+        }\
+    } else {\
+        for (int i = 0; i < h; ++i) {\
+            OP(dst[0], (A*src[0]));\
+            dst += stride;\
+            src += stride;\
+        }\
+    }\
+}\
+
+#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
+#define op_put(a, b) a = (((b) + 32)>>6)
+
+H264_CHROMA_MC(put_, op_put)
+H264_CHROMA_MC(avg_, op_avg)
+
 av_cold int ff_mpv_decode_init(MpegEncContext *s, AVCodecContext *avctx)
 {
     enum ThreadingStatus thread_status;
@@ -62,6 +101,8 @@ av_cold int ff_mpv_decode_init(MpegEncContext *s, AVCodecContext *avctx)
     ff_mpv_idct_init(s);
 
     ff_h264chroma_init(&s->h264chroma, 8); //for lowres
+    s->h264chroma.avg_h264_chroma_pixels_tab[3] = avg_h264_chroma_mc1;
+    s->h264chroma.put_h264_chroma_pixels_tab[3] = put_h264_chroma_mc1;
 
     if (s->picture_pool)  // VC-1 can call this multiple times
         return 0;
diff --git a/tests/checkasm/h264chroma.c b/tests/checkasm/h264chroma.c
index 9579fceab7..52aa220152 100644
--- a/tests/checkasm/h264chroma.c
+++ b/tests/checkasm/h264chroma.c
@@ -51,7 +51,7 @@ static void check_chroma_mc(void)
     for (int bit_depth = 8; bit_depth <= 10; bit_depth++) {
         ff_h264chroma_init(&h, bit_depth);
         randomize_buffers(bit_depth);
-        for (int size = 0; size < 4; size++) {
+        for (int size = 0; size < 3; size++) {
 
 #define CHECK_CHROMA_MC(name)                                                                             \
             do {                                                                                          \
-- 
2.49.1


>From 392026641d525d8df0b5fff01102de98d9e4097b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 1 Nov 2025 11:27:22 +0100
Subject: [PATCH 2/4] configure: Add config_components.asm

This is in preparation for the next commit.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 configure | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 3b132d07c9..17f72b735e 100755
--- a/configure
+++ b/configure
@@ -8524,8 +8524,10 @@ echo "#endif /* FFMPEG_CONFIG_H */" >> $TMPH
 cp_if_changed $TMPH config.h
 touch ffbuild/.config
 
-# Copy config.asm before printing ALL_COMPONENTS; that's not needed in assembly.
-enabled x86asm && cp_if_changed $TMPASM config.asm
+# Copy config.asm and reopen a new TMPASM for config_components.asm
+enabled x86asm && cp_if_changed $TMPASM config.asm && cat > $TMPASM <<EOF
+; Automatically generated by configure - do not modify!
+EOF
 
 # Reopen a new TMPH for config_components.h.
 cat > $TMPH <<EOF
@@ -8540,6 +8542,7 @@ echo "#endif /* FFMPEG_CONFIG_COMPONENTS_H */" >> $TMPH
 echo "endif # FFMPEG_CONFIG_MAK" >> ffbuild/config.mak
 
 cp_if_changed $TMPH config_components.h
+enabled x86asm && cp_if_changed $TMPASM config_components.asm
 
 cat > $TMPH <<EOF
 /* Generated by ffmpeg configure */
-- 
2.49.1


>From ec488bbdbec83c23ac74e34f2a5b9d19379b7de4 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 1 Nov 2025 10:37:25 +0100
Subject: [PATCH 3/4] avcodec/x86/h264_chromamc: Add SSSE3 RV40 chroma motion
 compensation functions

The only difference between the RV40 versions and the H.264/VC-1
versions is the bias constant, which for RV40 depends on the shift
parameters. This value ends up in a register, so one can reuse
the H.264 code by setting up the registers for RV40 and then jumping
into the relevant H.264 function, making the four new functions
cheap (just 256 bytes in total). This approach uses one more jump
in the no-filter case and one fewer jump in the one-dimensional
case than an approach using separate functions.

avg_chroma_mc4_c:                                      167.5 ( 1.00x)
avg_chroma_mc4_mmxext:                                  48.1 ( 3.48x)
avg_chroma_mc4_ssse3:                                   31.1 ( 5.39x)
avg_chroma_mc8_c:                                      325.5 ( 1.00x)
avg_chroma_mc8_mmxext:                                 103.2 ( 3.15x)
avg_chroma_mc8_ssse3:                                   33.5 ( 9.71x)
put_chroma_mc4_c:                                      137.4 ( 1.00x)
put_chroma_mc4_mmx:                                     44.5 ( 3.09x)
put_chroma_mc4_ssse3:                                   28.4 ( 4.83x)
put_chroma_mc8_c:                                      271.4 ( 1.00x)
put_chroma_mc8_mmx:                                     99.9 ( 2.72x)
put_chroma_mc8_ssse3:                                   30.6 ( 8.86x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/rv40dsp.c             |  7 +--
 libavcodec/x86/h264_chromamc.asm | 83 +++++++++++++++++++++++++++++---
 libavcodec/x86/rv40dsp_init.c    | 10 ++++
 3 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index 970faec5de..dd73737bd6 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -24,6 +24,7 @@
  * RV40 decoder motion compensation functions
  */
 
+#include "libavutil/attributes_internal.h"
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "h264qpel.h"
@@ -283,7 +284,7 @@ static void avg_rv40_qpel8_mc33_c(uint8_t *dst, const uint8_t *src, ptrdiff_t st
     avg_pixels8_xy2_8_c(dst, src, stride, 8);
 }
 
-static const int rv40_bias[4][4] = {
+attribute_visibility_hidden const int ff_rv40_bias[4][4] = {
     {  0, 16, 32, 16 },
     { 32, 28, 32, 28 },
     {  0, 32, 16, 32 },
@@ -300,7 +301,7 @@ static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst /*align 8*/,\
     const int C = (8-x) * (  y);\
     const int D = (  x) * (  y);\
     int i;\
-    int bias = rv40_bias[y>>1][x>>1];\
+    int bias = ff_rv40_bias[y>>1][x>>1];\
     \
     av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
@@ -336,7 +337,7 @@ static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/,\
     const int C = (8-x) * (  y);\
     const int D = (  x) * (  y);\
     int i;\
-    int bias = rv40_bias[y>>1][x>>1];\
+    int bias = ff_rv40_bias[y>>1][x>>1];\
     \
     av_assert2(x<8 && y<8 && x>=0 && y>=0);\
 \
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index ec6288d48e..8cdc0fe5a1 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -20,6 +20,7 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
+%include "config_components.asm"
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
@@ -64,6 +65,8 @@ pw_28: times 8 dw 28
 cextern pw_32
 cextern pw_64
 
+cextern rv40_bias
+
 SECTION .text
 
 %macro mv0_pixels_mc8 0
@@ -447,11 +450,12 @@ chroma_mc4_mmx_func avg, rv40
 chroma_mc2_mmx_func avg, h264
 
 %macro chroma_mc8_ssse3_func 2-3
-cglobal %1_%2_chroma_mc8%3, 6, 7, 8
+cglobal %1_%2_chroma_mc8%3, 6, 7+UNIX64, 8
     mov          r6d, r5d
     or           r6d, r4d
     jne .at_least_one_non_zero
     ; mx == 0 AND my == 0 - no filter needed
+..@%1_%2_chroma_mc8_no_filter_ %+ cpuname:
     mv0_pixels_mc8
     RET
 
@@ -462,6 +466,8 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
     je .mx_is_zero
 
     ; general case, bilinear
+    movdqa        m5, [rnd_2d_%2]
+..@%1_%2_chroma_mc8_both_nonzero_ %+ cpuname:
     mov          r6d, r4d
     shl          r4d, 8
     sub           r4, r6
@@ -473,7 +479,6 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
 
     movd          m7, r6d
     movd          m6, r4d
-    movdqa        m5, [rnd_2d_%2]
     movq          m0, [r1  ]
     movq          m1, [r1+1]
     pshuflw       m7, m7, 0
@@ -517,12 +522,13 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
     RET
 
 .my_is_zero:
+    movdqa        m6, [rnd_1d_%2]
+..@%1_%2_chroma_mc8_my_zero_ %+ cpuname:
     mov          r5d, r4d
     shl          r4d, 8
     add           r4, 8
     sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
     movd          m7, r4d
-    movdqa        m6, [rnd_1d_%2]
     pshuflw       m7, m7, 0
     movlhps       m7, m7
 
@@ -554,12 +560,13 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
     RET
 
 .mx_is_zero:
+    movdqa        m6, [rnd_1d_%2]
+..@%1_%2_chroma_mc8_mx_zero_ %+ cpuname:
     mov          r4d, r5d
     shl          r5d, 8
     add           r5, 8
     sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
     movd          m7, r5d
-    movdqa        m6, [rnd_1d_%2]
     pshuflw       m7, m7, 0
     movlhps       m7, m7
 
@@ -592,7 +599,9 @@ cglobal %1_%2_chroma_mc8%3, 6, 7, 8
 %endmacro
 
 %macro chroma_mc4_ssse3_func 2
-cglobal %1_%2_chroma_mc4, 6, 7, 0
+cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0
+    movq          m5, [pw_32]
+..@%1_%2_chroma_mc4_after_init_ %+ cpuname:
     mov           r6, r4
     shl          r4d, 8
     sub          r4d, r6d
@@ -604,7 +613,6 @@ cglobal %1_%2_chroma_mc4, 6, 7, 0
 
     movd          m7, r6d
     movd          m6, r4d
-    movq          m5, [pw_32]
     movd          m0, [r1  ]
     pshufw        m7, m7, 0
     punpcklbw     m0, [r1+1]
@@ -641,16 +649,79 @@ cglobal %1_%2_chroma_mc4, 6, 7, 0
     RET
 %endmacro
 
+%macro rv40_get_bias 1 ; dst reg
+%if !PIC || UNIX64
+    ; on UNIX64 we have enough volatile registers
+%if PIC && UNIX64
+    lea           r7, [rv40_bias]
+%endif
+    mov          r6d, r5d
+    and          r6d, 6         ; &~1 for mx/my=[0,7]
+    lea          r6d, [r6d*4+r4d]
+    sar          r6d, 1
+%if PIC && UNIX64
+    movd          %1, [r7+4*r6]
+%else
+    movd          %1, [rv40_bias+4*r6]
+%endif
+%else  ; PIC && !UNIX64, de facto WIN64
+    lea           r6, [rv40_bias]
+%ifidn r5d, r5m ; always false for currently supported calling conventions
+    push          r5
+%endif
+    and          r5d, 6         ; &~1 for mx/my=[0,7]
+    lea          r5d, [r5d*4+r4d]
+    sar          r5d, 1
+    movd          %1, [r6+4*r5]
+%ifidn r5d, r5m
+    pop           r5
+%else
+    mov          r5d, r5m
+%endif
+%endif
+    SPLATW        %1, %1
+%endmacro
+
+%macro rv40_chroma_mc8_func 1 ; put vs avg
+%if CONFIG_RV40_DECODER
+    cglobal rv40_%1_chroma_mc8, 6, 7+UNIX64, 8
+    mov          r6d, r5d
+    or           r6d, r4d
+    jz           ..@%1_h264_chroma_mc8_no_filter_ %+ cpuname
+    rv40_get_bias m5
+    ; the bilinear code expects bias in m5, the one-dimensional code in m6
+    mova          m6, m5
+    psraw         m6, 3
+    test         r5d, r5d
+    je           ..@%1_h264_chroma_mc8_my_zero_ %+ cpuname
+    test         r4d, r4d
+    je           ..@%1_h264_chroma_mc8_mx_zero_ %+ cpuname
+    jmp          ..@%1_h264_chroma_mc8_both_nonzero_ %+ cpuname
+%endif
+%endmacro
+
+%macro rv40_chroma_mc4_func 1 ; put vs avg
+%if CONFIG_RV40_DECODER
+    cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 0
+    rv40_get_bias m5
+    jmp           ..@%1_h264_chroma_mc4_after_init_ %+ cpuname
+%endif
+%endmacro
+
 %define CHROMAMC_AVG NOTHING
 INIT_XMM ssse3
 chroma_mc8_ssse3_func put, h264, _rnd
 chroma_mc8_ssse3_func put, vc1,  _nornd
+rv40_chroma_mc8_func put
 INIT_MMX ssse3
 chroma_mc4_ssse3_func put, h264
+rv40_chroma_mc4_func put
 
 %define CHROMAMC_AVG DIRECT_AVG
 INIT_XMM ssse3
 chroma_mc8_ssse3_func avg, h264, _rnd
 chroma_mc8_ssse3_func avg, vc1,  _nornd
+rv40_chroma_mc8_func avg
 INIT_MMX ssse3
 chroma_mc4_ssse3_func avg, h264
+rv40_chroma_mc4_func avg
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 780358abc2..ce2c955cb1 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -178,6 +178,12 @@ DEFINE_FN(avg, 8, ssse3)
 
 DEFINE_FN(avg, 16, sse2)
 DEFINE_FN(avg, 16, ssse3)
+
+#define CHROMA_MC_FUNC(OP, SIZE, XMM) \
+void ff_rv40_ ## OP ## _chroma_mc ## SIZE ## _ ## XMM(uint8_t *dst, const uint8_t *src,      \
+                                                      ptrdiff_t stride, int h, int x, int y);\
+    c->OP ## _chroma_pixels_tab[SIZE == 4] = ff_rv40_ ## OP ## _chroma_mc ## SIZE ## _ ## XMM
+
 #endif /* HAVE_X86ASM */
 
 av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
@@ -204,6 +210,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
         QPEL_MC_SET(avg_, _sse2)
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
+        CHROMA_MC_FUNC(put, 8, ssse3);
+        CHROMA_MC_FUNC(put, 4, ssse3);
+        CHROMA_MC_FUNC(avg, 8, ssse3);
+        CHROMA_MC_FUNC(avg, 4, ssse3);
         c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_ssse3;
         c->put_pixels_tab[1][15]        = put_rv40_qpel8_mc33_ssse3;
         c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_ssse3;
-- 
2.49.1


>From 30371f23dfbdf100204e98079aaad8fffec904fb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 30 Oct 2025 15:59:21 +0100
Subject: [PATCH 4/4] avcodec/x86/h264_chromamc: Remove MMX(EXT) funcs
 overridden by SSSE3

SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
so the overwhelming majority of our users (particularly those
that actually update their FFmpeg) will be using the SSSE3 versions.
This commit therefore removes the MMX(EXT) functions that the SSSE3
versions override (the MMX(EXT) functions don't abide by the ABI),
to get closer to removing emms_c.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/x86/h264_chromamc.asm | 327 +------------------------------
 libavcodec/x86/h264chroma_init.c |  17 --
 libavcodec/x86/rv40dsp_init.c    |  18 --
 libavcodec/x86/vc1dsp_init.c     |   9 -
 4 files changed, 2 insertions(+), 369 deletions(-)

diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index 8cdc0fe5a1..6a65d5cabd 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -25,39 +25,6 @@
 
 SECTION_RODATA
 
-rnd_rv40_2d_tbl: times 4 dw  0
-                 times 4 dw 16
-                 times 4 dw 32
-                 times 4 dw 16
-                 times 4 dw 32
-                 times 4 dw 28
-                 times 4 dw 32
-                 times 4 dw 28
-                 times 4 dw  0
-                 times 4 dw 32
-                 times 4 dw 16
-                 times 4 dw 32
-                 times 4 dw 32
-                 times 4 dw 28
-                 times 4 dw 32
-                 times 4 dw 28
-rnd_rv40_1d_tbl: times 4 dw  0
-                 times 4 dw  2
-                 times 4 dw  4
-                 times 4 dw  2
-                 times 4 dw  4
-                 times 4 dw  3
-                 times 4 dw  4
-                 times 4 dw  3
-                 times 4 dw  0
-                 times 4 dw  4
-                 times 4 dw  2
-                 times 4 dw  4
-                 times 4 dw  4
-                 times 4 dw  3
-                 times 4 dw  4
-                 times 4 dw  3
-
 cextern pw_3
 cextern pw_4
 cextern pw_8
@@ -92,285 +59,6 @@ SECTION .text
     jne .next4rows
 %endmacro
 
-%macro chroma_mc8_mmx_func 2-3
-%ifidn %2, rv40
-%if PIC
-%define rnd_1d_rv40 r8
-%define rnd_2d_rv40 r8
-%define extra_regs 2
-%else ; no-PIC
-%define rnd_1d_rv40 rnd_rv40_1d_tbl
-%define rnd_2d_rv40 rnd_rv40_2d_tbl
-%define extra_regs 1
-%endif ; PIC
-%else
-%define extra_regs 0
-%endif ; rv40
-; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
-;                                   const uint8_t *src /* align 1 */,
-;                                   ptrdiff_t stride, int h, int mx, int my)
-cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
-    mov          r6d, r5d
-    or           r6d, r4d
-    jne .at_least_one_non_zero
-    ; mx == 0 AND my == 0 - no filter needed
-    mv0_pixels_mc8
-    RET
-
-.at_least_one_non_zero:
-%ifidn %2, rv40
-%if ARCH_X86_64
-    mov           r7, r5
-    and           r7, 6         ; &~1 for mx/my=[0,7]
-    lea           r7, [r7*4+r4]
-    sar          r7d, 1
-%define rnd_bias r7
-%define dest_reg r0
-%else ; x86-32
-    mov           r0, r5
-    and           r0, 6         ; &~1 for mx/my=[0,7]
-    lea           r0, [r0*4+r4]
-    sar          r0d, 1
-%define rnd_bias r0
-%define dest_reg r5
-%endif
-%else ; vc1, h264
-%define rnd_bias  0
-%define dest_reg r0
-%endif
-
-    test         r5d, r5d
-    mov           r6, 1
-    je .my_is_zero
-    test         r4d, r4d
-    mov           r6, r2        ; dxy = x ? 1 : stride
-    jne .both_non_zero
-.my_is_zero:
-    ; mx == 0 XOR my == 0 - 1 dimensional filter only
-    or           r4d, r5d       ; x + y
-
-%ifidn %2, rv40
-%if PIC
-    lea           r8, [rnd_rv40_1d_tbl]
-%endif
-%if ARCH_X86_64 == 0
-    mov           r5, r0m
-%endif
-%endif
-
-    movd          m5, r4d
-    movq          m4, [pw_8]
-    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
-    punpcklwd     m5, m5
-    punpckldq     m5, m5        ; mm5 = B = x
-    pxor          m7, m7
-    psubw         m4, m5        ; mm4 = A = 8-x
-
-.next1drow:
-    movq          m0, [r1   ]   ; mm0 = src[0..7]
-    movq          m2, [r1+r6]   ; mm1 = src[1..8]
-
-    movq          m1, m0
-    movq          m3, m2
-    punpcklbw     m0, m7
-    punpckhbw     m1, m7
-    punpcklbw     m2, m7
-    punpckhbw     m3, m7
-    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
-    pmullw        m1, m4
-    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
-    pmullw        m3, m5
-
-    paddw         m0, m6
-    paddw         m1, m6
-    paddw         m0, m2
-    paddw         m1, m3
-    psrlw         m0, 3
-    psrlw         m1, 3
-    packuswb      m0, m1
-    CHROMAMC_AVG  m0, [dest_reg]
-    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
-
-    add     dest_reg, r2
-    add           r1, r2
-    dec           r3d
-    jne .next1drow
-    RET
-
-.both_non_zero: ; general case, bilinear
-    movd          m4, r4d         ; x
-    movd          m6, r5d         ; y
-%ifidn %2, rv40
-%if PIC
-    lea           r8, [rnd_rv40_2d_tbl]
-%endif
-%if ARCH_X86_64 == 0
-    mov           r5, r0m
-%endif
-%endif
-    mov           r6, rsp         ; backup stack pointer
-    and          rsp, ~(mmsize-1) ; align stack
-    sub          rsp, 16          ; AA and DD
-
-    punpcklwd     m4, m4
-    punpcklwd     m6, m6
-    punpckldq     m4, m4          ; mm4 = x words
-    punpckldq     m6, m6          ; mm6 = y words
-    movq          m5, m4
-    pmullw        m4, m6          ; mm4 = x * y
-    psllw         m5, 3
-    psllw         m6, 3
-    movq          m7, m5
-    paddw         m7, m6
-    movq     [rsp+8], m4          ; DD = x * y
-    psubw         m5, m4          ; mm5 = B = 8x - xy
-    psubw         m6, m4          ; mm6 = C = 8y - xy
-    paddw         m4, [pw_64]
-    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
-    pxor          m7, m7
-    movq     [rsp  ], m4
-
-    movq          m0, [r1  ]      ; mm0 = src[0..7]
-    movq          m1, [r1+1]      ; mm1 = src[1..8]
-.next2drow:
-    add           r1, r2
-
-    movq          m2, m0
-    movq          m3, m1
-    punpckhbw     m0, m7
-    punpcklbw     m1, m7
-    punpcklbw     m2, m7
-    punpckhbw     m3, m7
-    pmullw        m0, [rsp]
-    pmullw        m2, [rsp]
-    pmullw        m1, m5
-    pmullw        m3, m5
-    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
-    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]
-
-    movq          m0, [r1]
-    movq          m1, m0
-    punpcklbw     m0, m7
-    punpckhbw     m1, m7
-    pmullw        m0, m6
-    pmullw        m1, m6
-    paddw         m2, m0
-    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]
-
-    movq          m1, [r1+1]
-    movq          m0, m1
-    movq          m4, m1
-    punpcklbw     m0, m7
-    punpckhbw     m4, m7
-    pmullw        m0, [rsp+8]
-    pmullw        m4, [rsp+8]
-    paddw         m2, m0
-    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
-    movq          m0, [r1]
-
-    paddw         m2, [rnd_2d_%2+rnd_bias*8]
-    paddw         m3, [rnd_2d_%2+rnd_bias*8]
-    psrlw         m2, 6
-    psrlw         m3, 6
-    packuswb      m2, m3
-    CHROMAMC_AVG  m2, [dest_reg]
-    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
-
-    add     dest_reg, r2
-    dec          r3d
-    jne .next2drow
-    mov          rsp, r6          ; restore stack pointer
-    RET
-%endmacro
-
-%macro chroma_mc4_mmx_func 2
-%define extra_regs 0
-%ifidn %2, rv40
-%if PIC
-%define extra_regs 1
-%endif ; PIC
-%endif ; rv40
-cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
-    pxor          m7, m7
-    movd          m2, r4d         ; x
-    movd          m3, r5d         ; y
-    movq          m4, [pw_8]
-    movq          m5, [pw_8]
-    punpcklwd     m2, m2
-    punpcklwd     m3, m3
-    punpcklwd     m2, m2
-    punpcklwd     m3, m3
-    psubw         m4, m2
-    psubw         m5, m3
-
-%ifidn %2, rv40
-%if PIC
-   lea            r6, [rnd_rv40_2d_tbl]
-%define rnd_2d_rv40 r6
-%else
-%define rnd_2d_rv40 rnd_rv40_2d_tbl
-%endif
-    and           r5, 6         ; &~1 for mx/my=[0,7]
-    lea           r5, [r5*4+r4]
-    sar          r5d, 1
-%define rnd_bias r5
-%else ; vc1, h264
-%define rnd_bias 0
-%endif
-
-    movd          m0, [r1  ]
-    movd          m6, [r1+1]
-    add           r1, r2
-    punpcklbw     m0, m7
-    punpcklbw     m6, m7
-    pmullw        m0, m4
-    pmullw        m6, m2
-    paddw         m6, m0
-
-.next2rows:
-    movd          m0, [r1  ]
-    movd          m1, [r1+1]
-    add           r1, r2
-    punpcklbw     m0, m7
-    punpcklbw     m1, m7
-    pmullw        m0, m4
-    pmullw        m1, m2
-    paddw         m1, m0
-    movq          m0, m1
-
-    pmullw        m6, m5
-    pmullw        m1, m3
-    paddw         m6, [rnd_2d_%2+rnd_bias*8]
-    paddw         m1, m6
-    psrlw         m1, 6
-    packuswb      m1, m1
-    CHROMAMC_AVG4 m1, m6, [r0]
-    movd        [r0], m1
-    add           r0, r2
-
-    movd          m6, [r1  ]
-    movd          m1, [r1+1]
-    add           r1, r2
-    punpcklbw     m6, m7
-    punpcklbw     m1, m7
-    pmullw        m6, m4
-    pmullw        m1, m2
-    paddw         m1, m6
-    movq          m6, m1
-    pmullw        m0, m5
-    pmullw        m1, m3
-    paddw         m0, [rnd_2d_%2+rnd_bias*8]
-    paddw         m1, m0
-    psrlw         m1, 6
-    packuswb      m1, m1
-    CHROMAMC_AVG4 m1, m0, [r0]
-    movd        [r0], m1
-    add           r0, r2
-    sub          r3d, 2
-    jnz .next2rows
-    RET
-%endmacro
-
 %macro chroma_mc2_mmx_func 2
 cglobal %1_%2_chroma_mc2, 6, 7, 0
     mov          r6d, r4d
@@ -428,25 +116,14 @@ cglobal %1_%2_chroma_mc2, 6, 7, 0
     PAVGB         %1, %2
 %endmacro
 
-INIT_MMX mmx
-%define CHROMAMC_AVG  NOTHING
-%define CHROMAMC_AVG4 NOTHING
-chroma_mc8_mmx_func put, h264, _rnd
-chroma_mc8_mmx_func put, vc1,  _nornd
-chroma_mc8_mmx_func put, rv40
-chroma_mc4_mmx_func put, h264
-chroma_mc4_mmx_func put, rv40
 
 INIT_MMX mmxext
+%define CHROMAMC_AVG  NOTHING
+%define CHROMAMC_AVG4 NOTHING
 chroma_mc2_mmx_func put, h264
 
 %define CHROMAMC_AVG  DIRECT_AVG
 %define CHROMAMC_AVG4 COPY_AVG
-chroma_mc8_mmx_func avg, h264, _rnd
-chroma_mc8_mmx_func avg, vc1,  _nornd
-chroma_mc8_mmx_func avg, rv40
-chroma_mc4_mmx_func avg, h264
-chroma_mc4_mmx_func avg, rv40
 chroma_mc2_mmx_func avg, h264
 
 %macro chroma_mc8_ssse3_func 2-3
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index 34934b6ad0..6eb52746ad 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -24,16 +24,6 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/h264chroma.h"
 
-void ff_put_h264_chroma_mc8_rnd_mmx  (uint8_t *dst, const uint8_t *src,
-                                      ptrdiff_t stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, const uint8_t *src,
-                                       ptrdiff_t stride, int h, int x, int y);
-
-void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, const uint8_t *src,
-                                      ptrdiff_t stride, int h, int x, int y);
-void ff_avg_h264_chroma_mc4_mmxext   (uint8_t *dst, const uint8_t *src,
-                                      ptrdiff_t stride, int h, int x, int y);
-
 void ff_put_h264_chroma_mc2_mmxext   (uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y);
 void ff_avg_h264_chroma_mc2_mmxext   (uint8_t *dst, const uint8_t *src,
@@ -68,14 +58,7 @@ av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
     int high_bit_depth = bit_depth > 8;
     int cpu_flags      = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
-        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
-        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
-    }
-
     if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
-        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
-        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
         c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
         c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
     }
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index ce2c955cb1..a07acae6bc 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -40,16 +40,6 @@ static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src,
 }
 
 #if HAVE_X86ASM
-void ff_put_rv40_chroma_mc8_mmx  (uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, const uint8_t *src,
-                                   ptrdiff_t stride, int h, int x, int y);
-
-void ff_put_rv40_chroma_mc4_mmx  (uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t stride, int h, int x, int y);
-void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, const uint8_t *src,
-                                   ptrdiff_t stride, int h, int x, int y);
-
 #define DECLARE_WEIGHT(opt) \
 void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
                                       int w1, int w2, ptrdiff_t stride); \
@@ -191,14 +181,6 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
     av_unused int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_X86ASM
-    if (EXTERNAL_MMX(cpu_flags)) {
-        c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
-        c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
-    }
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        c->avg_chroma_pixels_tab[0]     = ff_avg_rv40_chroma_mc8_mmxext;
-        c->avg_chroma_pixels_tab[1]     = ff_avg_rv40_chroma_mc4_mmxext;
-    }
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_sse2;
         c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_sse2;
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index e7874d2a5a..5cebc1f6f2 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -80,10 +80,6 @@ DECLARE_FUNCTION(avg_, 16, _sse2)
 
 #endif /* HAVE_X86ASM */
 
-void ff_put_vc1_chroma_mc8_nornd_mmx  (uint8_t *dst, const uint8_t *src,
-                                       ptrdiff_t stride, int h, int x, int y);
-void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, const uint8_t *src,
-                                        ptrdiff_t stride, int h, int x, int y);
 void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride, int h, int x, int y);
 void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, const uint8_t *src,
@@ -122,13 +118,8 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
 
 #if HAVE_X86ASM
-    if (EXTERNAL_MMX(cpu_flags)) {
-        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
-
-    }
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         ASSIGN_LF4(mmxext);
-        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
 
         dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_8_mmxext;
 
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-11-01 11:11 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-01 11:10 [FFmpeg-devel] [PATCH] h264chroma (PR #20813) mkver via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git