* [FFmpeg-devel] [PATCH] avcodec/x86/mpegvideoenc cleanup (PR #20932)
@ 2025-11-16 11:43 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2025-11-16 11:43 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #20932 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20932
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20932.patch
>From a7102ce7ed9e6c0a8c61a92eb8e66b4260057adb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 15 Nov 2025 16:18:16 +0100
Subject: [PATCH 1/9] avcodec/x86/mpegvideoenc: Remove check for MMX
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/mpegvideoenc.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index eac9947590..bb1d2cc319 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -123,16 +123,14 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
const int dct_algo = s->c.avctx->dct_algo;
if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
-#if HAVE_MMX_INLINE
- int cpu_flags = av_get_cpu_flags();
#if HAVE_SSE2_INLINE
+ int cpu_flags = av_get_cpu_flags();
if (INLINE_SSE2(cpu_flags)) {
#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
#endif
s->denoise_dct = denoise_dct_sse2;
}
-#endif
#if HAVE_6REGS && HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_ssse3;
--
2.49.1
>From feecc0585a8b83eb0d0897c8a842e82f080d6f26 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 15 Nov 2025 16:46:18 +0100
Subject: [PATCH 2/9] avcodec/x86/mpegvideoenc: Reduce number of registers used
Avoids a push+pop on x64 Windows.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/mpegvideoenc.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index bb1d2cc319..2ca05f69ea 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -68,7 +68,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
s->dct_count[intra]++;
__asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
+ "pxor %%xmm6, %%xmm6 \n\t"
"1: \n\t"
"pxor %%xmm0, %%xmm0 \n\t"
"pxor %%xmm1, %%xmm1 \n\t"
@@ -90,18 +90,18 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm3, 16(%0) \n\t"
- "movdqa %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm4, %%xmm2 \n\t"
"movdqa %%xmm5, %%xmm0 \n\t"
- "punpcklwd %%xmm7, %%xmm4 \n\t"
- "punpckhwd %%xmm7, %%xmm6 \n\t"
- "punpcklwd %%xmm7, %%xmm5 \n\t"
- "punpckhwd %%xmm7, %%xmm0 \n\t"
+ "punpcklwd %%xmm6, %%xmm4 \n\t"
+ "punpckhwd %%xmm6, %%xmm2 \n\t"
+ "punpcklwd %%xmm6, %%xmm5 \n\t"
+ "punpckhwd %%xmm6, %%xmm0 \n\t"
"paddd (%1), %%xmm4 \n\t"
- "paddd 16(%1), %%xmm6 \n\t"
+ "paddd 16(%1), %%xmm2 \n\t"
"paddd 32(%1), %%xmm5 \n\t"
"paddd 48(%1), %%xmm0 \n\t"
"movdqa %%xmm4, (%1) \n\t"
- "movdqa %%xmm6, 16(%1) \n\t"
+ "movdqa %%xmm2, 16(%1) \n\t"
"movdqa %%xmm5, 32(%1) \n\t"
"movdqa %%xmm0, 48(%1) \n\t"
"add $32, %0 \n\t"
@@ -112,7 +112,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ "%xmm4", "%xmm5", "%xmm6")
);
}
#endif /* HAVE_SSE2_INLINE */
--
2.49.1
>From 89a1bacded6e635f4773d2ae8b72cbd4f9a12338 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 15 Nov 2025 17:32:29 +0100
Subject: [PATCH 3/9] avcodec/x86/mpegvideoenc: Port denoise_dct_sse2 to
external assembly
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/mpegvideoenc.c | 59 ++++--------------------------
libavcodec/x86/mpegvideoencdsp.asm | 46 +++++++++++++++++++++++
2 files changed, 54 insertions(+), 51 deletions(-)
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 2ca05f69ea..e5665ac781 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,8 +57,10 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
#endif /* HAVE_6REGS */
-#if HAVE_INLINE_ASM
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64]);
+
static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
{
const int intra = s->c.mb_intra;
@@ -67,56 +69,9 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
s->dct_count[intra]++;
- __asm__ volatile(
- "pxor %%xmm6, %%xmm6 \n\t"
- "1: \n\t"
- "pxor %%xmm0, %%xmm0 \n\t"
- "pxor %%xmm1, %%xmm1 \n\t"
- "movdqa (%0), %%xmm2 \n\t"
- "movdqa 16(%0), %%xmm3 \n\t"
- "pcmpgtw %%xmm2, %%xmm0 \n\t"
- "pcmpgtw %%xmm3, %%xmm1 \n\t"
- "pxor %%xmm0, %%xmm2 \n\t"
- "pxor %%xmm1, %%xmm3 \n\t"
- "psubw %%xmm0, %%xmm2 \n\t"
- "psubw %%xmm1, %%xmm3 \n\t"
- "movdqa %%xmm2, %%xmm4 \n\t"
- "movdqa %%xmm3, %%xmm5 \n\t"
- "psubusw (%2), %%xmm2 \n\t"
- "psubusw 16(%2), %%xmm3 \n\t"
- "pxor %%xmm0, %%xmm2 \n\t"
- "pxor %%xmm1, %%xmm3 \n\t"
- "psubw %%xmm0, %%xmm2 \n\t"
- "psubw %%xmm1, %%xmm3 \n\t"
- "movdqa %%xmm2, (%0) \n\t"
- "movdqa %%xmm3, 16(%0) \n\t"
- "movdqa %%xmm4, %%xmm2 \n\t"
- "movdqa %%xmm5, %%xmm0 \n\t"
- "punpcklwd %%xmm6, %%xmm4 \n\t"
- "punpckhwd %%xmm6, %%xmm2 \n\t"
- "punpcklwd %%xmm6, %%xmm5 \n\t"
- "punpckhwd %%xmm6, %%xmm0 \n\t"
- "paddd (%1), %%xmm4 \n\t"
- "paddd 16(%1), %%xmm2 \n\t"
- "paddd 32(%1), %%xmm5 \n\t"
- "paddd 48(%1), %%xmm0 \n\t"
- "movdqa %%xmm4, (%1) \n\t"
- "movdqa %%xmm2, 16(%1) \n\t"
- "movdqa %%xmm5, 32(%1) \n\t"
- "movdqa %%xmm0, 48(%1) \n\t"
- "add $32, %0 \n\t"
- "add $64, %1 \n\t"
- "add $32, %2 \n\t"
- "cmp %3, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (block), "+r" (sum), "+r" (offset)
- : "r"(block+64)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6")
- );
+ ff_mpv_denoise_dct_sse2(block, sum, offset);
}
-#endif /* HAVE_SSE2_INLINE */
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_SSE2_EXTERNAL */
av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
{
@@ -129,7 +84,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
#endif
+#if HAVE_SSE2_EXTERNAL
s->denoise_dct = denoise_dct_sse2;
+#endif
}
#if HAVE_6REGS && HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index d12646ae54..0e86a5304c 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -24,6 +24,52 @@
%include "libavutil/x86/x86util.asm"
SECTION .text
+
+INIT_XMM sse2
+cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
+ pxor m6, m6
+ lea r3, [sumq+256]
+.loop:
+ mova m2, [blockq]
+ mova m3, [blockq+16]
+ mova m0, m6
+ mova m1, m6
+ pcmpgtw m0, m2
+ pcmpgtw m1, m3
+ pxor m2, m0
+ pxor m3, m1
+ psubw m2, m0
+ psubw m3, m1
+ psubusw m4, m2, [offsetq]
+ psubusw m5, m3, [offsetq+16]
+ pxor m4, m0
+ pxor m5, m1
+ add offsetq, 32
+ psubw m4, m0
+ psubw m5, m1
+ mova [blockq], m4
+ mova [blockq+16], m5
+ mova m0, m2
+ mova m1, m3
+ add blockq, 32
+ punpcklwd m0, m6
+ punpckhwd m2, m6
+ punpcklwd m1, m6
+ punpckhwd m3, m6
+ paddd m0, [sumq]
+ paddd m2, [sumq+16]
+ paddd m1, [sumq+32]
+ paddd m3, [sumq+48]
+ mova [sumq], m0
+ mova [sumq+16], m2
+ mova [sumq+32], m1
+ mova [sumq+48], m3
+ add sumq, 64
+ cmp sumq, r3
+ jb .loop
+ RET
+
+
; int ff_pix_sum16(const uint8_t *pix, ptrdiff_t line_size)
; %1 = number of loops
; %2 = number of GPRs used
--
2.49.1
>From bdc7fcbd439cca0cb1d85f51ca06fce91ac7c150 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 15 Nov 2025 18:24:18 +0100
Subject: [PATCH 4/9] avcodec/mpegvideo_enc: Port denoise_dct to
MpegvideoEncDSPContext
It is very simple to remove the MPVEncContext from it.
Notice that this also fixes a bug in x86/mpegvideoenc.c: It only
used the SSE2 version of denoise_dct when dct_algo was auto or mmx
(and it was therefore unused during FATE).
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/mips/Makefile | 3 +-
libavcodec/mips/mpegvideo_mips.h | 3 +-
libavcodec/mips/mpegvideoenc_init_mips.c | 33 ----------------
libavcodec/mips/mpegvideoencdsp_init_mips.c | 5 +++
...egvideoenc_mmi.c => mpegvideoencdsp_mmi.c} | 7 +---
libavcodec/mpegvideo_enc.c | 38 +++++--------------
libavcodec/mpegvideoenc.h | 2 -
libavcodec/mpegvideoencdsp.c | 25 ++++++++++++
libavcodec/mpegvideoencdsp.h | 3 ++
libavcodec/x86/mpegvideoenc.c | 19 ----------
libavcodec/x86/mpegvideoenc_template.c | 7 +++-
libavcodec/x86/mpegvideoencdsp_init.c | 3 ++
12 files changed, 53 insertions(+), 95 deletions(-)
delete mode 100644 libavcodec/mips/mpegvideoenc_init_mips.c
rename libavcodec/mips/{mpegvideoenc_mmi.c => mpegvideoencdsp_mmi.c} (95%)
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 4bbc2f00ea..1d777293d0 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -54,7 +54,6 @@ OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_init_mips.o
OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_init_mips.o
OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_init_mips.o
OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_init_mips.o
-OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoenc_init_mips.o
OBJS-$(CONFIG_MPEGVIDEOENCDSP) += mips/mpegvideoencdsp_init_mips.o
OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_init_mips.o
OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvididct_init_mips.o
@@ -100,7 +99,7 @@ MMI-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
MMI-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
MMI-OBJS-$(CONFIG_H264PRED) += mips/h264pred_mmi.o
MMI-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_mmi.o
-MMI-OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoenc_mmi.o
+MMI-OBJS-$(CONFIG_MPEGVIDEOENCDSP) += mips/mpegvideoencdsp_mmi.o
MMI-OBJS-$(CONFIG_IDCTDSP) += mips/idctdsp_mmi.o \
mips/simple_idct_mmi.o
MMI-OBJS-$(CONFIG_MPEG4_DECODER) += mips/xvid_idct_mmi.o
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
index 72ffed6985..2a9ea4006e 100644
--- a/libavcodec/mips/mpegvideo_mips.h
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -22,7 +22,6 @@
#define AVCODEC_MIPS_MPEGVIDEO_MIPS_H
#include "libavcodec/mpegvideo.h"
-#include "libavcodec/mpegvideoenc.h"
void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
int n, int qscale);
@@ -34,6 +33,6 @@ void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
int n, int qscale);
void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
int n, int qscale);
-void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block);
+void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64]);
#endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideoenc_init_mips.c b/libavcodec/mips/mpegvideoenc_init_mips.c
deleted file mode 100644
index 7831973eb8..0000000000
--- a/libavcodec/mips/mpegvideoenc_init_mips.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/mips/cpu.h"
-#include "libavcodec/mpegvideoenc.h"
-#include "mpegvideo_mips.h"
-
-av_cold void ff_mpvenc_dct_init_mips(MPVEncContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_mmi(cpu_flags)) {
- s->denoise_dct = ff_denoise_dct_mmi;
- }
-}
diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c
index 24a17b91db..df916282a2 100644
--- a/libavcodec/mips/mpegvideoencdsp_init_mips.c
+++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c
@@ -23,12 +23,17 @@
#include "libavcodec/bit_depth_template.c"
#include "libavcodec/mpegvideoencdsp.h"
#include "h263dsp_mips.h"
+#include "mpegvideo_mips.h"
av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_mmi(cpu_flags)) {
+ c->denoise_dct = ff_denoise_dct_mmi;
+ }
+
if (have_msa(cpu_flags)) {
#if BIT_DEPTH == 8
c->pix_sum = ff_pix_sum_msa;
diff --git a/libavcodec/mips/mpegvideoenc_mmi.c b/libavcodec/mips/mpegvideoencdsp_mmi.c
similarity index 95%
rename from libavcodec/mips/mpegvideoenc_mmi.c
rename to libavcodec/mips/mpegvideoencdsp_mmi.c
index 085be3b0ec..2239a05978 100644
--- a/libavcodec/mips/mpegvideoenc_mmi.c
+++ b/libavcodec/mips/mpegvideoencdsp_mmi.c
@@ -25,17 +25,12 @@
#include "mpegvideo_mips.h"
#include "libavutil/mips/mmiutils.h"
-void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block)
+void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64])
{
- const int intra = s->c.mb_intra;
- int *sum = s->dct_error_sum[intra];
- uint16_t *offset = s->dct_offset[intra];
double ftmp[8];
mips_reg addr[1];
DECLARE_VAR_ALL64;
- s->dct_count[intra]++;
-
__asm__ volatile(
"pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"1: \n\t"
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index ce0ee4bb68..9e83026b51 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -86,7 +86,6 @@
static int encode_picture(MPVMainEncContext *const s, const AVPacket *pkt);
static int dct_quantize_refine(MPVEncContext *const s, int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale);
static int sse_mb(MPVEncContext *const s);
-static void denoise_dct_c(MPVEncContext *const s, int16_t *block);
static int dct_quantize_c(MPVEncContext *const s,
int16_t *block, int n,
int qscale, int *overflow);
@@ -300,11 +299,8 @@ static av_cold void mpv_encode_defaults(MPVMainEncContext *const m)
av_cold void ff_dct_encode_init(MPVEncContext *const s)
{
s->dct_quantize = dct_quantize_c;
- s->denoise_dct = denoise_dct_c;
-#if ARCH_MIPS
- ff_mpvenc_dct_init_mips(s);
-#elif ARCH_X86
+#if ARCH_X86
ff_dct_encode_init_x86(s);
#endif
@@ -3955,29 +3951,14 @@ static int encode_picture(MPVMainEncContext *const m, const AVPacket *pkt)
return 0;
}
-static void denoise_dct_c(MPVEncContext *const s, int16_t *block)
+static inline void denoise_dct(MPVEncContext *const s, int16_t block[])
{
+ if (!s->dct_error_sum)
+ return;
+
const int intra = s->c.mb_intra;
- int i;
-
s->dct_count[intra]++;
-
- for(i=0; i<64; i++){
- int level= block[i];
-
- if(level){
- if(level>0){
- s->dct_error_sum[intra][i] += level;
- level -= s->dct_offset[intra][i];
- if(level<0) level=0;
- }else{
- s->dct_error_sum[intra][i] -= level;
- level += s->dct_offset[intra][i];
- if(level>0) level=0;
- }
- block[i]= level;
- }
- }
+ s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]);
}
static int dct_quantize_trellis_c(MPVEncContext *const s,
@@ -4009,8 +3990,8 @@ static int dct_quantize_trellis_c(MPVEncContext *const s,
s->fdsp.fdct(block);
- if(s->dct_error_sum)
- s->denoise_dct(s, block);
+ denoise_dct(s, block);
+
qmul= qscale*16;
qadd= ((qscale-1)|1)*8;
@@ -4678,8 +4659,7 @@ static int dct_quantize_c(MPVEncContext *const s,
s->fdsp.fdct(block);
- if(s->dct_error_sum)
- s->denoise_dct(s, block);
+ denoise_dct(s, block);
if (s->c.mb_intra) {
scantable = s->c.intra_scantable.scantable;
diff --git a/libavcodec/mpegvideoenc.h b/libavcodec/mpegvideoenc.h
index ee115c3611..131908c10a 100644
--- a/libavcodec/mpegvideoenc.h
+++ b/libavcodec/mpegvideoenc.h
@@ -123,7 +123,6 @@ typedef struct MPVEncContext {
uint16_t (*q_inter_matrix16)[2][64];
/* noise reduction */
- void (*denoise_dct)(struct MPVEncContext *s, int16_t *block);
int (*dct_error_sum)[64];
int dct_count[2];
uint16_t (*dct_offset)[64];
@@ -397,7 +396,6 @@ int ff_mpv_reallocate_putbitbuffer(MPVEncContext *s, size_t threshold, size_t si
void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix);
void ff_dct_encode_init(MPVEncContext *s);
-void ff_mpvenc_dct_init_mips(MPVEncContext *s);
void ff_dct_encode_init_x86(MPVEncContext *s);
void ff_convert_matrix(MPVEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64],
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index b4fd2af915..3b4a57d58a 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -28,6 +28,29 @@
#include "mathops.h"
#include "mpegvideoencdsp.h"
+static void denoise_dct_c(int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64])
+{
+ for (int i = 0; i < 64; ++i) {
+ int level = block[i];
+
+ if (level) {
+ if (level > 0) {
+ dct_error_sum[i] += level;
+ level -= dct_offset[i];
+ if (level < 0)
+ level = 0;
+ } else {
+ dct_error_sum[i] -= level;
+ level += dct_offset[i];
+ if (level > 0)
+ level = 0;
+ }
+ block[i] = level;
+ }
+ }
+}
+
static int try_8x8basis_c(const int16_t rem[64], const int16_t weight[64],
const int16_t basis[64], int scale)
{
@@ -253,6 +276,8 @@ static void shrink88(uint8_t *dst, ptrdiff_t dst_wrap,
av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
+ c->denoise_dct = denoise_dct_c;
+
c->try_8x8basis = try_8x8basis_c;
c->add_8x8basis = add_8x8basis_c;
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 6ec665677b..989503f25f 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -30,6 +30,9 @@
#define EDGE_BOTTOM 2
typedef struct MpegvideoEncDSPContext {
+ void (*denoise_dct)(int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64]);
+
int (*try_8x8basis)(const int16_t rem[64], const int16_t weight[64],
const int16_t basis[64], int scale);
void (*add_8x8basis)(int16_t rem[64], const int16_t basis[64], int scale);
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index e5665ac781..c667dcd2a2 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,22 +57,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
#endif /* HAVE_6REGS */
-#if HAVE_SSE2_EXTERNAL
-void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
- const uint16_t dct_offset[64]);
-
-static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
-{
- const int intra = s->c.mb_intra;
- int *sum= s->dct_error_sum[intra];
- uint16_t *offset= s->dct_offset[intra];
-
- s->dct_count[intra]++;
-
- ff_mpv_denoise_dct_sse2(block, sum, offset);
-}
-#endif /* HAVE_SSE2_EXTERNAL */
-
av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
{
const int dct_algo = s->c.avctx->dct_algo;
@@ -83,9 +67,6 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
if (INLINE_SSE2(cpu_flags)) {
#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
-#endif
-#if HAVE_SSE2_EXTERNAL
- s->denoise_dct = denoise_dct_sse2;
#endif
}
#if HAVE_6REGS && HAVE_SSSE3_INLINE
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index f0b95c1621..14e993de2b 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -76,8 +76,11 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
//s->fdct (block);
ff_fdct_sse2(block); // cannot be anything else ...
- if(s->dct_error_sum)
- s->denoise_dct(s, block);
+ if (s->dct_error_sum) {
+ const int intra = s->c.mb_intra;
+ s->dct_count[intra]++;
+ s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]);
+ }
if (s->c.mb_intra) {
int dummy;
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index bf5b722016..f6169b5399 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -27,6 +27,8 @@
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64]);
int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
@@ -209,6 +211,7 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
+ c->denoise_dct = ff_mpv_denoise_dct_sse2;
c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2;
}
--
2.49.1
>From 06076cb368f1cf6baaed5f6de8ed2894236c5910 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 15 Nov 2025 19:06:14 +0100
Subject: [PATCH 5/9] tests/checkasm/mpegvideoencdsp: Test denoise_dct
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/mpegvideoencdsp.c | 33 ++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index a4a4fa6f5c..955cd9f5b7 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -37,6 +37,37 @@
buf[j] = rnd() % (max - min + 1) + min; \
} while (0)
+static void check_denoise_dct(MpegvideoEncDSPContext *c)
+{
+ declare_func(void, int16_t block[64], int dct_error_sum[64],
+ const uint16_t dct_offset[64]);
+
+ if (check_func(c->denoise_dct, "denoise_dct")) {
+ DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+ DECLARE_ALIGNED(16, int16_t, block_new)[64];
+ DECLARE_ALIGNED(16, int, dct_error_sum_ref)[64];
+ DECLARE_ALIGNED(16, int, dct_error_sum_new)[64];
+ DECLARE_ALIGNED(16, uint16_t, dct_offset)[64];
+
+ for (size_t i = 0; i < FF_ARRAY_ELEMS(block_ref); ++i) {
+ unsigned random = rnd();
+ block_ref[i] = random & (1 << 16) ? random : 0;
+ }
+ randomize_buffers(dct_offset, sizeof(dct_offset));
+ randomize_buffer_clipped(dct_error_sum_ref, 0, (1 << 24) - 1);
+ memcpy(block_new, block_ref, sizeof(block_new));
+ memcpy(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_ref));
+
+ call_ref(block_ref, dct_error_sum_ref, dct_offset);
+ call_new(block_new, dct_error_sum_new, dct_offset);
+ if (memcmp(block_ref, block_new, sizeof(block_ref)) ||
+ memcmp(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_new)))
+ fail();
+
+ bench_new(block_new, dct_error_sum_new, dct_offset);
+ }
+}
+
static void check_add_8x8basis(MpegvideoEncDSPContext *c)
{
declare_func(void, int16_t rem[64], const int16_t basis[64], int scale);
@@ -166,6 +197,8 @@ void checkasm_check_mpegvideoencdsp(void)
ff_mpegvideoencdsp_init(&c, &avctx);
+ check_denoise_dct(&c);
+ report("denoise_dct");
check_pix_sum(&c);
report("pix_sum");
check_pix_norm1(&c);
--
2.49.1
>From 1b9714e7bc08908a0f03c4a0a5757485489cec4c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 15 Nov 2025 19:44:02 +0100
Subject: [PATCH 6/9] avcodec/x86/mpegvideoencdsp: Port add_8x8basis_ssse3() to
ASM
Both GCC and Clang completely unroll the unlikely loop at -O3,
leading to codesize bloat; their code is also suboptimal, as they
don't make use of pmulhrsw (even with -mssse3). This commit
therefore ports the whole function to external assembly. The new
function occupies 176B here vs 1406B for GCC.
Benchmarks for a testcase with huge qscale (notice that the C version
is unrolled just like the unlikely loop in the SSSE3 version):
add_8x8basis_c: 43.4 ( 1.00x)
add_8x8basis_ssse3 (old): 43.6 ( 1.00x)
add_8x8basis_ssse3 (new): 12.6 ( 3.46x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/mpegvideoencdsp.asm | 46 +++++++++++++++++++++++++++
libavcodec/x86/mpegvideoencdsp_init.c | 46 ++++-----------------------
2 files changed, 53 insertions(+), 39 deletions(-)
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 0e86a5304c..a85de32449 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -25,6 +25,52 @@
SECTION .text
+; void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
+INIT_XMM ssse3
+cglobal add_8x8basis, 3, 3, 4, rem, basis, scale
+ movd m0, scaled
+ add scaled, 1024
+ add basisq, 128
+ add remq, 128
+ cmp scaled, 2047
+ mov r2q, -128
+ ja .huge_scale
+
+ punpcklwd m0, m0
+ pshufd m0, m0, 0x0
+ psllw m0, 5
+.loop1:
+ mova m1, [basisq+r2q]
+ mova m2, [basisq+r2q+16]
+ pmulhrsw m1, m0
+ pmulhrsw m2, m0
+ paddw m1, [remq+r2q]
+ paddw m2, [remq+r2q+16]
+ mova [remq+r2q], m1
+ mova [remq+r2q+16], m2
+ add r2q, 32
+ js .loop1
+ RET
+
+.huge_scale:
+ pslld m0, 6
+ pshuflw m1, m0, 0x55
+ psrlw m0, 1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ pshufd m0, m0, 0x0
+.loop2:
+ mova m2, [basisq+r2q]
+ pmulhrsw m3, m2, m0
+ pmullw m2, m1
+ paddw m2, m3
+ paddw m2, [remq+r2q]
+ mova [remq+r2q], m2
+ add r2q, 16
+ js .loop2
+ RET
+
+
INIT_XMM sse2
cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
pxor m6, m6
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index f6169b5399..220c75785a 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -32,6 +32,7 @@ void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
+void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale);
#if HAVE_INLINE_ASM
#if HAVE_SSSE3_INLINE
@@ -83,41 +84,6 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
);
return i;
}
-
-static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
-{
- x86_reg i=0;
-
- if (FFABS(scale) < 1024) {
- scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);
- __asm__ volatile(
- "movd %3, %%xmm2 \n\t"
- "punpcklwd %%xmm2, %%xmm2 \n\t"
- "pshufd $0, %%xmm2, %%xmm2 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movdqa (%1, %0), %%xmm0 \n\t"
- "movdqa 16(%1, %0), %%xmm1 \n\t"
- "pmulhrsw %%xmm2, %%xmm0 \n\t"
- "pmulhrsw %%xmm2, %%xmm1 \n\t"
- "paddw (%2, %0), %%xmm0 \n\t"
- "paddw 16(%2, %0), %%xmm1 \n\t"
- "movdqa %%xmm0, (%2, %0) \n\t"
- "movdqa %%xmm1, 16(%2, %0) \n\t"
- "add $32, %0 \n\t"
- "cmp $128, %0 \n\t" // FIXME optimize & bench
- " jb 1b \n\t"
- : "+r" (i)
- : "r"(basis), "r"(rem), "g"(scale)
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
- );
- } else {
- for (i=0; i<8*8; i++) {
- rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
- }
- }
-}
-
#endif /* HAVE_SSSE3_INLINE */
/* Draw the edges of width 'w' of an image of size width, height */
@@ -227,15 +193,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
c->draw_edges = draw_edges_mmx;
}
}
+#endif /* HAVE_INLINE_ASM */
+ if (X86_SSSE3(cpu_flags)) {
#if HAVE_SSSE3_INLINE
- if (INLINE_SSSE3(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_ssse3;
}
- c->add_8x8basis = add_8x8basis_ssse3;
- }
#endif /* HAVE_SSSE3_INLINE */
+#if HAVE_SSSE3_EXTERNAL
+ c->add_8x8basis = ff_add_8x8basis_ssse3;
+#endif
+ }
-#endif /* HAVE_INLINE_ASM */
}
--
2.49.1
>From 0dfe66422eedfd67028b89c46ff7db2f8fef80eb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sat, 15 Nov 2025 19:56:23 +0100
Subject: [PATCH 7/9] avcodec/x86/mpegvideoenc_template: Avoid touching
nonvolatile register
xmm7 is nonvolatile on x64 Windows.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/mpegvideoenc_template.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 14e993de2b..b5417f6d32 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -117,7 +117,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
__asm__ volatile(
"movd %%"FF_REG_a", %%xmm3 \n\t" // last_non_zero_p1
SPREADW("%%xmm3")
- "pxor %%xmm7, %%xmm7 \n\t" // 0
+ "pxor %%xmm2, %%xmm2 \n\t" // 0
"pxor %%xmm4, %%xmm4 \n\t" // 0
"movdqa (%2), %%xmm5 \n\t" // qmat[0]
"pxor %%xmm6, %%xmm6 \n\t"
@@ -132,9 +132,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"por %%xmm0, %%xmm4 \n\t"
RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
"movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t"
- "pcmpeqw %%xmm7, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
+ "pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
"movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t"
- "movdqa %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0
+ "movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
"pandn %%xmm1, %%xmm0 \n\t"
"pmaxsw %%xmm0, %%xmm3 \n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -146,13 +146,13 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ "%xmm4", "%xmm5", "%xmm6")
);
}else{ // FMT_H263
__asm__ volatile(
"movd %%"FF_REG_a", %%xmm3 \n\t" // last_non_zero_p1
SPREADW("%%xmm3")
- "pxor %%xmm7, %%xmm7 \n\t" // 0
+ "pxor %%xmm2, %%xmm2 \n\t" // 0
"pxor %%xmm4, %%xmm4 \n\t" // 0
"mov $-128, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
@@ -166,9 +166,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"por %%xmm0, %%xmm4 \n\t"
RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
"movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t"
- "pcmpeqw %%xmm7, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
+ "pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
"movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t"
- "movdqa %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0
+ "movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
"pandn %%xmm1, %%xmm0 \n\t"
"pmaxsw %%xmm0, %%xmm3 \n\t"
"add $16, %%"FF_REG_a" \n\t"
@@ -180,7 +180,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+ "%xmm4", "%xmm5", "%xmm6")
);
}
__asm__ volatile(
--
2.49.1
>From 1c1109ba320528f01d610da9b25aae8591458526 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 16 Nov 2025 11:10:07 +0100
Subject: [PATCH 8/9] avcodec/x86/mpegvideoenc_template: Reduce number of
registers used
qmat and bias are always at a constant offset from each other, so one register
suffices to address both of them. This allows removing the check for HAVE_6REGS
(untested on a system where HAVE_6REGS is false).
Also avoid FF_REG_a while at it.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/mpegvideoenc.c | 8 +-------
libavcodec/x86/mpegvideoenc_template.c | 21 +++++++++------------
2 files changed, 10 insertions(+), 19 deletions(-)
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index c667dcd2a2..24dd049200 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -39,8 +39,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
36, 37, 49, 50, 58, 59, 63, 64,
};
-#if HAVE_6REGS
-
#if HAVE_SSE2_INLINE
#define COMPILE_TEMPLATE_SSSE3 0
#define RENAME(a) a ## _sse2
@@ -55,8 +53,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */
-#endif /* HAVE_6REGS */
-
av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
{
const int dct_algo = s->c.avctx->dct_algo;
@@ -65,11 +61,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_SSE2(cpu_flags)) {
-#if HAVE_6REGS
s->dct_quantize = dct_quantize_sse2;
-#endif
}
-#if HAVE_6REGS && HAVE_SSSE3_INLINE
+#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_ssse3;
#endif
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index b5417f6d32..e6ce791347 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -70,7 +70,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
{
x86_reg last_non_zero_p1;
int level=0, q; //=0 is because gcc says uninitialized ...
- const uint16_t *qmat, *bias;
+ const uint16_t *qmat;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
//s->fdct (block);
@@ -86,11 +86,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
int dummy;
if (n < 4){
q = s->c.y_dc_scale;
- bias = s->q_intra_matrix16[qscale][1];
qmat = s->q_intra_matrix16[qscale][0];
}else{
q = s->c.c_dc_scale;
- bias = s->q_chroma_intra_matrix16[qscale][1];
qmat = s->q_chroma_intra_matrix16[qscale][0];
}
/* note: block[0] is assumed to be positive */
@@ -109,7 +107,6 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
last_non_zero_p1 = 1;
} else {
last_non_zero_p1 = 0;
- bias = s->q_inter_matrix16[qscale][1];
qmat = s->q_inter_matrix16[qscale][0];
}
@@ -121,7 +118,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"pxor %%xmm4, %%xmm4 \n\t" // 0
"movdqa (%2), %%xmm5 \n\t" // qmat[0]
"pxor %%xmm6, %%xmm6 \n\t"
- "psubw (%3), %%xmm6 \n\t" // -bias[0]
+ "psubw 128(%2), %%xmm6 \n\t" // -bias[0]
"mov $-128, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
@@ -131,9 +128,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"pmulhw %%xmm5, %%xmm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
"por %%xmm0, %%xmm4 \n\t"
RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
- "movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t"
+ "movdqa %%xmm0, (%4, %0) \n\t"
"pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
- "movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t"
+ "movdqa (%3, %0), %%xmm1 \n\t"
"movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
"pandn %%xmm1, %%xmm0 \n\t"
"pmaxsw %%xmm0, %%xmm3 \n\t"
@@ -143,7 +140,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"movd %%xmm3, %%"FF_REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
- : "r" (block+64), "r" (qmat), "r" (bias),
+ : "r" (block+64), "r" (qmat),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6")
@@ -159,15 +156,15 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"1: \n\t"
"movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t" // block[i]
SAVE_SIGN("%%xmm1", "%%xmm0") // ABS(block[i])
- "movdqa (%3, %%"FF_REG_a"), %%xmm6 \n\t" // bias[0]
+ "movdqa 128(%2, %0), %%xmm6 \n\t" // bias[i]
"paddusw %%xmm6, %%xmm0 \n\t" // ABS(block[i]) + bias[0]
"movdqa (%2, %%"FF_REG_a"), %%xmm5 \n\t" // qmat[i]
"pmulhw %%xmm5, %%xmm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por %%xmm0, %%xmm4 \n\t"
RESTORE_SIGN("%%xmm1", "%%xmm0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
- "movdqa %%xmm0, (%5, %%"FF_REG_a") \n\t"
+ "movdqa %%xmm0, (%4, %0) \n\t"
"pcmpeqw %%xmm2, %%xmm0 \n\t" // out==0 ? 0xFF : 0x00
- "movdqa (%4, %%"FF_REG_a"), %%xmm1 \n\t"
+ "movdqa (%3, %0), %%xmm1 \n\t"
"movdqa %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
"pandn %%xmm1, %%xmm0 \n\t"
"pmaxsw %%xmm0, %%xmm3 \n\t"
@@ -177,7 +174,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
"movd %%xmm3, %%"FF_REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
- : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+ : "r" (block+64), "r" (qmat+64),
"r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6")
--
2.49.1
>From 8ae2428ebedca7f191846e5fde2442069d15e8b1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 16 Nov 2025 12:10:22 +0100
Subject: [PATCH 9/9] avutil/x86/asm: Remove wrong comment, rename FF_REG_sp
Before FFmpeg commit 531b0a316b24f00965cd8a88efdbea2c6d63147f,
FFmpeg used REG_SP as macro for the stack pointer, yet this
clashed with a REG_SP define in Solaris system headers, so it
was changed to REG_sp and a comment was added for this.
Libav fixed it by adding an FF_ prefix to the macros in
1e9c5bf4c136fe9e010cc8a7e7270bba0d1bf45e. FFmpeg switched
to using these prefixes in 9eb3da2f9942cf1b1148d242bccfc383f666feb6,
using FF_REG_sp instead of Libav's FF_REG_SP. In said commit
the comment was changed to claim that Solaris system headers
define FF_REG_SP, but this is (most likely) wrong.
This commit removes the wrong comment and renames the (actually unused)
macro to FF_REG_SP to make it consistent with FF_REG_BP.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavutil/x86/asm.h | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index 9bff42d628..f06ea25035 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -38,8 +38,7 @@ typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
# define FF_PTR_SIZE "8"
typedef int64_t x86_reg;
-/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */
-# define FF_REG_sp "rsp"
+# define FF_REG_SP "rsp"
# define FF_REG_BP "rbp"
# define FF_REGBP rbp
# define FF_REGa rax
@@ -60,7 +59,7 @@ typedef int64_t x86_reg;
# define FF_PTR_SIZE "4"
typedef int32_t x86_reg;
-# define FF_REG_sp "esp"
+# define FF_REG_SP "esp"
# define FF_REG_BP "ebp"
# define FF_REGBP ebp
# define FF_REGa eax
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-11-16 11:44 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-16 11:43 [FFmpeg-devel] [PATCH] avcodec/x86/mpegvideoenc cleanup (PR #20932) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git