* [FFmpeg-devel] [PR] x86/huffyuvencdsp: Remove mmxext sub_hfyu_median_pred_int16, add sse2, avx2 (PR #22288)
@ 2026-02-26 2:59 mkver via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: mkver via ffmpeg-devel @ 2026-02-26 2:59 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: mkver
PR #22288 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22288
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22288.patch
>From 47d966306d84a0fb91d8fc8bed22648d6d097f5c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 25 Feb 2026 20:00:46 +0100
Subject: [PATCH 01/10] avcodec/huffyuvenc: Calculate mask only once
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/huffyuvenc.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index b213d4dc95..370b383de5 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -54,7 +54,7 @@ typedef struct HYuvEncContext {
int bitstream_bpp;
int version;
int bps;
- int n; // 1<<bps
+ unsigned mask; // (1<<bps)-1
int vlc_n; // number of vlc codes (FFMIN(1<<bps, MAX_VLC_N))
int alpha;
int chroma;
@@ -84,7 +84,7 @@ static inline void diff_bytes(HYuvEncContext *s, uint8_t *dst,
if (s->bps <= 8) {
s->llvidencdsp.diff_bytes(dst, src0, src1, w);
} else {
- s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w);
+ s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->mask, w);
}
}
@@ -114,7 +114,7 @@ static inline int sub_left_prediction(HYuvEncContext *s, uint8_t *dst,
}
if (w < 32)
return left;
- s->hencdsp.diff_int16(dst16 + 32, src16 + 32, src16 + 31, s->n - 1, w - 32);
+ s->hencdsp.diff_int16(dst16 + 32, src16 + 32, src16 + 31, s->mask, w - 32);
return src16[w-1];
}
}
@@ -190,7 +190,8 @@ static void sub_median_prediction(HYuvEncContext *s, uint8_t *dst,
if (s->bps <= 8) {
s->llvidencdsp.sub_median_pred(dst, src1, src2, w , left, left_top);
} else {
- s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top);
+ s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1,
+ (const uint16_t *)src2, s->mask, w, left, left_top);
}
}
@@ -274,6 +275,9 @@ static av_cold int encode_init(AVCodecContext *avctx)
s->chroma_h_shift = desc->log2_chroma_w;
s->chroma_v_shift = desc->log2_chroma_h;
+ s->mask = (1 << s->bps) - 1;
+ s->vlc_n = FFMIN(1 << s->bps, MAX_VLC_N);
+
switch (avctx->pix_fmt) {
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUV422P:
@@ -335,8 +339,6 @@ static av_cold int encode_init(AVCodecContext *avctx)
av_log(avctx, AV_LOG_ERROR, "format not supported\n");
return AVERROR(EINVAL);
}
- s->n = 1<<s->bps;
- s->vlc_n = FFMIN(s->n, MAX_VLC_N);
avctx->bits_per_coded_sample = s->bitstream_bpp;
s->decorrelate = s->bitstream_bpp >= 24 && !s->yuv && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR);
@@ -587,7 +589,7 @@ do { \
if (s->bps <= 8) {
ENCODE_PLANE(LOAD2, LOADEND, WRITE2, WRITEEND, STAT2, STATEND);
} else if (s->bps <= 14) {
- int mask = s->n - 1;
+ unsigned mask = s->mask;
ENCODE_PLANE(LOAD2_14, LOADEND_14, WRITE2, WRITEEND, STAT2, STATEND);
} else {
--
2.52.0
>From 82a0114e403764717637b30df7a685dd4d168015 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 25 Feb 2026 20:13:50 +0100
Subject: [PATCH 02/10] avcodec/huffyuvenc: Mark unreachable code as such
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/huffyuvenc.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index 370b383de5..8f320f0272 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -336,8 +336,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
s->bitstream_bpp = 24;
break;
default:
- av_log(avctx, AV_LOG_ERROR, "format not supported\n");
- return AVERROR(EINVAL);
+ av_unreachable("Already checked via CODEC_PIXFMTS");
}
avctx->bits_per_coded_sample = s->bitstream_bpp;
--
2.52.0
>From 397d53d9c8cb0388e00d104b4608dbaf23449469 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 25 Feb 2026 20:33:04 +0100
Subject: [PATCH 03/10] avcodec/huffyuvencdsp: Pass bpp, not AVPixelFormat for
init
Avoids having to get a pixel format descriptor.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/huffyuvenc.c | 3 ++-
libavcodec/huffyuvencdsp.c | 4 ++--
libavcodec/huffyuvencdsp.h | 6 ++----
libavcodec/x86/huffyuvencdsp_init.c | 6 ++----
4 files changed, 8 insertions(+), 11 deletions(-)
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index 8f320f0272..0f2cf1791d 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -253,7 +253,6 @@ static av_cold int encode_init(AVCodecContext *avctx)
s->flags = avctx->flags;
ff_bswapdsp_init(&s->bdsp);
- ff_huffyuvencdsp_init(&s->hencdsp, avctx->pix_fmt);
ff_llvidencdsp_init(&s->llvidencdsp);
avctx->extradata = av_mallocz(3*MAX_N + 4);
@@ -278,6 +277,8 @@ static av_cold int encode_init(AVCodecContext *avctx)
s->mask = (1 << s->bps) - 1;
s->vlc_n = FFMIN(1 << s->bps, MAX_VLC_N);
+ ff_huffyuvencdsp_init(&s->hencdsp, s->bps);
+
switch (avctx->pix_fmt) {
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUV422P:
diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c
index e332f678d4..dcae51f4f8 100644
--- a/libavcodec/huffyuvencdsp.c
+++ b/libavcodec/huffyuvencdsp.c
@@ -84,12 +84,12 @@ static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, co
*left_top = lt;
}
-av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, enum AVPixelFormat pix_fmt)
+av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp)
{
c->diff_int16 = diff_int16_c;
c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
#if ARCH_X86 && HAVE_X86ASM
- ff_huffyuvencdsp_init_x86(c, pix_fmt);
+ ff_huffyuvencdsp_init_x86(c, bpp);
#endif
}
diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h
index 779a51ac79..fae182add1 100644
--- a/libavcodec/huffyuvencdsp.h
+++ b/libavcodec/huffyuvencdsp.h
@@ -21,8 +21,6 @@
#include <stdint.h>
-#include "libavutil/pixfmt.h"
-
typedef struct HuffYUVEncDSPContext {
void (*diff_int16)(uint16_t *dst /* align 16 */,
const uint16_t *src1 /* align 16 */,
@@ -34,7 +32,7 @@ typedef struct HuffYUVEncDSPContext {
int w, int *left, int *left_top);
} HuffYUVEncDSPContext;
-void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, enum AVPixelFormat pix_fmt);
-void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, enum AVPixelFormat pix_fmt);
+void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp);
+void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp);
#endif /* AVCODEC_HUFFYUVENCDSP_H */
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
index c9c33b75b4..fd54fdcc00 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -24,7 +24,6 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/pixdesc.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvencdsp.h"
@@ -35,12 +34,11 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w, int *left, int *left_top);
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, enum AVPixelFormat pix_fmt)
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp)
{
av_unused int cpu_flags = av_get_cpu_flags();
- const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(pix_fmt);
- if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+ if (EXTERNAL_MMXEXT(cpu_flags) && bpp < 16) {
c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
}
--
2.52.0
>From 12fbc3f8187da1f3b4a7fa82fa6ef6959757e263 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 25 Feb 2026 21:38:30 +0100
Subject: [PATCH 04/10] tests/checkasm: Add huffyuvencdsp test
Only covers sub_hfyu_median_pred_int16 for now.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 ++
tests/checkasm/checkasm.h | 1 +
tests/checkasm/huffyuvencdsp.c | 89 ++++++++++++++++++++++++++++++++++
tests/fate/checkasm.mak | 1 +
5 files changed, 95 insertions(+)
create mode 100644 tests/checkasm/huffyuvencdsp.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 48de4d22a0..491bde2a7a 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -13,6 +13,7 @@ AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o
AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o
AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o
AVCODECOBJS-$(CONFIG_HPELDSP) += hpeldsp.o
+AVCODECOBJS-$(CONFIG_HUFFYUVENCDSP) += huffyuvencdsp.o
AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o
AVCODECOBJS-$(CONFIG_LLAUDDSP) += llauddsp.o
AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index bdaaa8695d..38bd1edce7 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -198,6 +198,9 @@ static const struct {
#if CONFIG_HUFFYUV_DECODER
{ "huffyuvdsp", checkasm_check_huffyuvdsp },
#endif
+ #if CONFIG_HUFFYUVENCDSP
+ { "huffyuvencdsp", checkasm_check_huffyuvencdsp },
+ #endif
#if CONFIG_IDCTDSP
{ "idctdsp", checkasm_check_idctdsp },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 2a6c7e8ea6..db30ddb863 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -116,6 +116,7 @@ void checkasm_check_hevc_pel(void);
void checkasm_check_hevc_sao(void);
void checkasm_check_hpeldsp(void);
void checkasm_check_huffyuvdsp(void);
+void checkasm_check_huffyuvencdsp(void);
void checkasm_check_idctdsp(void);
void checkasm_check_idet(void);
void checkasm_check_jpeg2000dsp(void);
diff --git a/tests/checkasm/huffyuvencdsp.c b/tests/checkasm/huffyuvencdsp.c
new file mode 100644
index 0000000000..049a7d126b
--- /dev/null
+++ b/tests/checkasm/huffyuvencdsp.c
@@ -0,0 +1,89 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/huffyuvencdsp.h"
+#include "libavutil/cpu.h"
+#include "libavutil/macros.h"
+#include "libavutil/mem_internal.h"
+
+enum {
+ MAX_WIDTH = 4096, ///< maximum test width, must be a power of two no smaller than the maximum alignment
+};
+
+#define randomize_buffers(buf, size, mask) \
+ do { \
+ for (size_t j = 0; j < size; ++j) \
+ buf[j] = rnd() & mask; \
+ } while (0)
+
+
+static void check_sub_hfyu_median_pred_int16(const char *aligned, unsigned width)
+{
+ static const int bpps[] = { 9, 16, };
+ HuffYUVEncDSPContext c;
+
+ declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint16_t *dst, const uint16_t *src1,
+ const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+
+ for (size_t i = 0; i < FF_ARRAY_ELEMS(bpps); ++i) {
+ const int bpp = bpps[i];
+
+ ff_huffyuvencdsp_init(&c, bpp);
+
+ if (check_func(c.sub_hfyu_median_pred_int16, "sub_hfyu_median_pred_int16_%dbpp%s", bpp, aligned)) {
+ DECLARE_ALIGNED(32, uint16_t, dst0)[MAX_WIDTH];
+ DECLARE_ALIGNED(32, uint16_t, dst1)[MAX_WIDTH];
+ uint16_t src1[MAX_WIDTH];
+ uint16_t src2[MAX_WIDTH];
+ const unsigned mask = (1 << bpp) - 1;
+ int l1 = rnd() & mask, lt1 = rnd() & mask, l2 = l1, lt2 = lt1;
+
+ randomize_buffers(src1, width, mask);
+ randomize_buffers(src2, width, mask);
+
+ call_ref(dst0, src1, src2, mask, width, &l1, &lt1);
+ call_new(dst1, src1, src2, mask, width, &l2, &lt2);
+ if (l1 != l2 || lt1 != lt2 || memcmp(dst0, dst1, width * sizeof(dst0[0])))
+ fail();
+ bench_new(dst1, src1, src2, mask, width, &l2, &lt2);
+ }
+ }
+}
+
+void checkasm_check_huffyuvencdsp(void)
+{
+ static unsigned width = 0;
+
+ if (!width) {
+ width = rnd() % MAX_WIDTH;
+ width = width ? width : 1;
+ }
+
+ const size_t align = av_cpu_max_align();
+
+ check_sub_hfyu_median_pred_int16("_aligned", FFALIGN(width, align / sizeof(uint16_t)));
+ report("sub_hfyu_median_pred_int16_aligned");
+
+ check_sub_hfyu_median_pred_int16("", width);
+ report("sub_hfyu_median_pred_int16");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 16c6f1f775..b05dc61f67 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -33,6 +33,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \
fate-checkasm-hevc_sao \
fate-checkasm-hpeldsp \
fate-checkasm-huffyuvdsp \
+ fate-checkasm-huffyuvencdsp \
fate-checkasm-idctdsp \
fate-checkasm-jpeg2000dsp \
fate-checkasm-llauddsp \
--
2.52.0
>From 42c44aae631e58f4c456e760fd36fd92ec62a449 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 25 Feb 2026 21:49:31 +0100
Subject: [PATCH 05/10] tests/checkasm: Fix huffyuvdsp test criterion
Use CONFIG_HUFFYUVDSP, not CONFIG_HUFFYUV_DECODER
(although they are equivalent).
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/Makefile | 2 +-
tests/checkasm/checkasm.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 491bde2a7a..a9b58f5d1d 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -13,6 +13,7 @@ AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o
AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o
AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o
AVCODECOBJS-$(CONFIG_HPELDSP) += hpeldsp.o
+AVCODECOBJS-$(CONFIG_HUFFYUVDSP) += huffyuvdsp.o
AVCODECOBJS-$(CONFIG_HUFFYUVENCDSP) += huffyuvencdsp.o
AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o
AVCODECOBJS-$(CONFIG_LLAUDDSP) += llauddsp.o
@@ -39,7 +40,6 @@ AVCODECOBJS-$(CONFIG_DCA_DECODER) += dcadsp.o synth_filter.o
AVCODECOBJS-$(CONFIG_DIRAC_DECODER) += diracdsp.o
AVCODECOBJS-$(CONFIG_EXR_DECODER) += exrdsp.o
AVCODECOBJS-$(CONFIG_FLAC_DECODER) += flacdsp.o
-AVCODECOBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuvdsp.o
AVCODECOBJS-$(CONFIG_JPEG2000_DECODER) += jpeg2000dsp.o
AVCODECOBJS-$(CONFIG_OPUS_DECODER) += opusdsp.o
AVCODECOBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 38bd1edce7..407267a4c3 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -195,7 +195,7 @@ static const struct {
#if CONFIG_HPELDSP
{ "hpeldsp", checkasm_check_hpeldsp },
#endif
- #if CONFIG_HUFFYUV_DECODER
+ #if CONFIG_HUFFYUVDSP
{ "huffyuvdsp", checkasm_check_huffyuvdsp },
#endif
#if CONFIG_HUFFYUVENCDSP
--
2.52.0
>From f59de7aeb552df07979d51d9a53714e968639bb9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 25 Feb 2026 21:50:46 +0100
Subject: [PATCH 06/10] avcodec/huffyuvencdsp: Add width parameter to init
This allows selecting functions that use wide registers only
if there is enough work to do and if a whole register's worth
of data can be read without overreading.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/huffyuvenc.c | 2 +-
libavcodec/huffyuvencdsp.c | 4 ++--
libavcodec/huffyuvencdsp.h | 4 ++--
libavcodec/x86/huffyuvencdsp_init.c | 2 +-
tests/checkasm/huffyuvencdsp.c | 2 +-
5 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index 0f2cf1791d..f7211d1ba0 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -277,7 +277,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
s->mask = (1 << s->bps) - 1;
s->vlc_n = FFMIN(1 << s->bps, MAX_VLC_N);
- ff_huffyuvencdsp_init(&s->hencdsp, s->bps);
+ ff_huffyuvencdsp_init(&s->hencdsp, s->bps, avctx->width >> s->chroma_h_shift);
switch (avctx->pix_fmt) {
case AV_PIX_FMT_YUV420P:
diff --git a/libavcodec/huffyuvencdsp.c b/libavcodec/huffyuvencdsp.c
index dcae51f4f8..9dd84dbafe 100644
--- a/libavcodec/huffyuvencdsp.c
+++ b/libavcodec/huffyuvencdsp.c
@@ -84,12 +84,12 @@ static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, co
*left_top = lt;
}
-av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp)
+av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp, int width)
{
c->diff_int16 = diff_int16_c;
c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
#if ARCH_X86 && HAVE_X86ASM
- ff_huffyuvencdsp_init_x86(c, bpp);
+ ff_huffyuvencdsp_init_x86(c, bpp, width);
#endif
}
diff --git a/libavcodec/huffyuvencdsp.h b/libavcodec/huffyuvencdsp.h
index fae182add1..173fbca08f 100644
--- a/libavcodec/huffyuvencdsp.h
+++ b/libavcodec/huffyuvencdsp.h
@@ -32,7 +32,7 @@ typedef struct HuffYUVEncDSPContext {
int w, int *left, int *left_top);
} HuffYUVEncDSPContext;
-void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp);
-void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp);
+void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, int bpp, int width);
+void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width);
#endif /* AVCODEC_HUFFYUVENCDSP_H */
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
index fd54fdcc00..153edabf02 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -34,7 +34,7 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w, int *left, int *left_top);
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp)
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width)
{
av_unused int cpu_flags = av_get_cpu_flags();
diff --git a/tests/checkasm/huffyuvencdsp.c b/tests/checkasm/huffyuvencdsp.c
index 049a7d126b..a74b4295d6 100644
--- a/tests/checkasm/huffyuvencdsp.c
+++ b/tests/checkasm/huffyuvencdsp.c
@@ -48,7 +48,7 @@ static void check_sub_hfyu_median_pred_int16(const char *aligned, unsigned width
for (size_t i = 0; i < FF_ARRAY_ELEMS(bpps); ++i) {
const int bpp = bpps[i];
- ff_huffyuvencdsp_init(&c, bpp);
+ ff_huffyuvencdsp_init(&c, bpp, width);
if (check_func(c.sub_hfyu_median_pred_int16, "sub_hfyu_median_pred_int16_%dbpp%s", bpp, aligned)) {
DECLARE_ALIGNED(32, uint16_t, dst0)[MAX_WIDTH];
--
2.52.0
>From 0bbc5641732210564dd836166f7dabe56a1f8b2f Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 26 Feb 2026 00:43:09 +0100
Subject: [PATCH 07/10] avcodec/x86/huffyuvencdsp: Add SSE2
sub_hfyu_median_pred_int16
Contrary to the MMXEXT version, this version does not overread at all
(the MMXEXT version processes the 2*w bytes of input in eight-byte
chunks and overreads by a further six bytes, because it loads
the next left and left top values at the end of the loop,
i.e. it reads FFALIGN(2*w,8)+6 bytes instead of 2*w).
Benchmarks:
sub_hfyu_median_pred_int16_9bpp_c: 12673.6 ( 1.00x)
sub_hfyu_median_pred_int16_9bpp_mmxext: 1947.7 ( 6.51x)
sub_hfyu_median_pred_int16_9bpp_sse2: 993.9 (12.75x)
sub_hfyu_median_pred_int16_9bpp_aligned_c: 12596.1 ( 1.00x)
sub_hfyu_median_pred_int16_9bpp_aligned_mmxext: 1956.1 ( 6.44x)
sub_hfyu_median_pred_int16_9bpp_aligned_sse2: 989.4 (12.73x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/huffyuvencdsp.asm | 50 +++++++++++++++++++++++++++++
libavcodec/x86/huffyuvencdsp_init.c | 4 +++
2 files changed, 54 insertions(+)
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 8bfd0face0..3d38931893 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -94,3 +94,53 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_
movzx maskd, word [src2q + wq - 2]
mov [leftq], maskd
RET
+
+INIT_XMM sse2
+cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top
+ movd m5, maskd
+ lea wd, [wd+wd-(mmsize-1)]
+ movu m0, [src1q]
+ movu m2, [src2q]
+ SPLATW m5, m5
+ add dstq, wq
+ movd m1, [left_topq]
+ neg wq
+ movd m3, [leftq]
+ sub src1q, wq
+ sub src2q, wq
+ pslldq m0, 2
+ pslldq m2, 2
+ por m0, m1
+ por m2, m3
+ jmp .init
+
+.loop:
+ movu m0, [src1q + wq - 2] ; lt
+ movu m2, [src2q + wq - 2] ; l
+.init:
+ movu m1, [src1q + wq] ; t
+ movu m3, [src2q + wq]
+ psubw m4, m2, m0 ; l - lt
+ pmaxsw m0, m1, m2
+ paddw m4, m1 ; l - lt + t
+ pminsw m2, m1
+ pand m4, m5 ; (l - lt + t)&mask
+ pminsw m4, m0
+ pmaxsw m4, m2 ; pred
+ psubw m3, m4 ; l - pred
+ pand m3, m5
+ movu [dstq + wq], m3
+ add wq, 16
+ js .loop
+
+ cmp wd, mmsize-1
+ jne .tail
+
+ movzx src1d, word [src1q + (mmsize-1) - 2]
+ movzx src2d, word [src2q + (mmsize-1) - 2]
+ mov [left_topq], src1d
+ mov [leftq], src2d
+ RET
+.tail:
+ mov wq, -1
+ jmp .loop
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
index 153edabf02..e32b7ea19d 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -33,6 +33,8 @@ void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src
unsigned mask, int w);
void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w, int *left, int *left_top);
av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width)
{
@@ -44,6 +46,8 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid
if (EXTERNAL_SSE2(cpu_flags)) {
c->diff_int16 = ff_diff_int16_sse2;
+ if (bpp < 16 && width >= 8)
+ c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_sse2;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
--
2.52.0
>From 8851c0b73444fa979ba49d27b430cdd476f50207 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 26 Feb 2026 02:37:48 +0100
Subject: [PATCH 08/10] avcodec/x86/huffyuvencdsp: Add AVX2
sub_hfyu_median_pred_int16
This version can also process 16bpp.
Benchmarks:
sub_hfyu_median_pred_int16_9bpp_c: 12667.7 ( 1.00x)
sub_hfyu_median_pred_int16_9bpp_mmxext: 1966.5 ( 6.44x)
sub_hfyu_median_pred_int16_9bpp_sse2: 997.6 (12.70x)
sub_hfyu_median_pred_int16_9bpp_avx2: 474.8 (26.68x)
sub_hfyu_median_pred_int16_9bpp_aligned_c: 12604.6 ( 1.00x)
sub_hfyu_median_pred_int16_9bpp_aligned_mmxext: 1964.6 ( 6.42x)
sub_hfyu_median_pred_int16_9bpp_aligned_sse2: 981.9 (12.84x)
sub_hfyu_median_pred_int16_9bpp_aligned_avx2: 462.6 (27.25x)
sub_hfyu_median_pred_int16_16bpp_c: 12592.5 ( 1.00x)
sub_hfyu_median_pred_int16_16bpp_avx2: 465.6 (27.04x)
sub_hfyu_median_pred_int16_16bpp_aligned_c: 12587.5 ( 1.00x)
sub_hfyu_median_pred_int16_16bpp_aligned_avx2: 462.5 (27.22x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/huffyuvencdsp.asm | 50 ++++++++++++++++++++---------
libavcodec/x86/huffyuvencdsp_init.c | 4 +++
2 files changed, 38 insertions(+), 16 deletions(-)
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 3d38931893..11f4b8c01f 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -95,23 +95,32 @@ cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_
mov [leftq], maskd
RET
-INIT_XMM sse2
+%macro SUB_HFYU_MEDIAN_PRED_INT16 1 ; u,s for pmaxuw vs pmaxsw
cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top
- movd m5, maskd
+ movd xm5, maskd
lea wd, [wd+wd-(mmsize-1)]
- movu m0, [src1q]
- movu m2, [src2q]
- SPLATW m5, m5
+ movu xm0, [src1q]
+ movu xm2, [src2q]
+ SPLATW m5, xm5
add dstq, wq
- movd m1, [left_topq]
+ movd xm1, [left_topq]
neg wq
- movd m3, [leftq]
+ movd xm3, [leftq]
+%if mmsize >= 32
+ movu xm4, [src1q+14]
+%endif
sub src1q, wq
+ pslldq xm0, 2
+ pslldq xm2, 2
+ por xm0, xm1
+%if mmsize >= 32
+ vinserti128 m0, xm4, 1
+%endif
+ por xm2, xm3
+%if mmsize >= 32
+ vinserti128 m2, [src2q+14], 1
+%endif
sub src2q, wq
- pslldq m0, 2
- pslldq m2, 2
- por m0, m1
- por m2, m3
jmp .init
.loop:
@@ -121,16 +130,16 @@ cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_
movu m1, [src1q + wq] ; t
movu m3, [src2q + wq]
psubw m4, m2, m0 ; l - lt
- pmaxsw m0, m1, m2
+ pmax%1w m0, m1, m2
paddw m4, m1 ; l - lt + t
- pminsw m2, m1
+ pmin%1w m2, m1
pand m4, m5 ; (l - lt + t)&mask
- pminsw m4, m0
- pmaxsw m4, m2 ; pred
+ pmin%1w m4, m0
+ pmax%1w m4, m2 ; pred
psubw m3, m4 ; l - pred
pand m3, m5
movu [dstq + wq], m3
- add wq, 16
+ add wq, mmsize
js .loop
cmp wd, mmsize-1
@@ -144,3 +153,12 @@ cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_
.tail:
mov wq, -1
jmp .loop
+%endmacro
+
+INIT_XMM sse2
+SUB_HFYU_MEDIAN_PRED_INT16 s
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+SUB_HFYU_MEDIAN_PRED_INT16 u
+%endif
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
index e32b7ea19d..7289e94bc7 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -35,6 +35,8 @@ void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, c
unsigned mask, int w, int *left, int *left_top);
void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+ unsigned mask, int w, int *left, int *left_top);
av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width)
{
@@ -52,5 +54,7 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->diff_int16 = ff_diff_int16_avx2;
+ if (width >= 16)
+ c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_avx2;
}
}
--
2.52.0
>From af610f25b25214bab2ba2bb5b0679b93b9b68323 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 26 Feb 2026 02:44:37 +0100
Subject: [PATCH 09/10] avcodec/x86/huffyuvencdsp: Remove MMXEXT
sub_hfyu_median_pred_int16
Superseded by SSE2 and AVX2.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/huffyuvenc.c | 2 --
libavcodec/x86/huffyuvencdsp.asm | 40 -----------------------------
libavcodec/x86/huffyuvencdsp_init.c | 6 -----
tests/checkasm/huffyuvencdsp.c | 4 +--
4 files changed, 2 insertions(+), 50 deletions(-)
diff --git a/libavcodec/huffyuvenc.c b/libavcodec/huffyuvenc.c
index f7211d1ba0..706d65597a 100644
--- a/libavcodec/huffyuvenc.c
+++ b/libavcodec/huffyuvenc.c
@@ -39,7 +39,6 @@
#include "huffyuvencdsp.h"
#include "lossless_videoencdsp.h"
#include "put_bits.h"
-#include "libavutil/emms.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
@@ -939,7 +938,6 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
} else {
av_log(avctx, AV_LOG_ERROR, "Format not supported!\n");
}
- emms_c();
size += (put_bits_count(&s->pb) + 31) / 8;
put_bits(&s->pb, 16, 0);
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 11f4b8c01f..e8e7a6469d 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -55,46 +55,6 @@ INIT_YMM avx2
DIFF_INT16
%endif
-INIT_MMX mmxext
-cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
- add wd, wd
- movd mm7, maskd
- SPLATW mm7, mm7
- movq mm0, [src1q]
- movq mm2, [src2q]
- psllq mm0, 16
- psllq mm2, 16
- movd mm6, [left_topq]
- por mm0, mm6
- movd mm6, [leftq]
- por mm2, mm6
- xor maskq, maskq
-.loop:
- movq mm1, [src1q + maskq]
- movq mm3, [src2q + maskq]
- movq mm4, mm2
- psubw mm2, mm0
- paddw mm2, mm1
- pand mm2, mm7
- movq mm5, mm4
- pmaxsw mm4, mm1
- pminsw mm1, mm5
- pminsw mm4, mm2
- pmaxsw mm4, mm1
- psubw mm3, mm4
- pand mm3, mm7
- movq [dstq + maskq], mm3
- add maskq, 8
- movq mm0, [src1q + maskq - 2]
- movq mm2, [src2q + maskq - 2]
- cmp maskq, wq
- jb .loop
- movzx maskd, word [src1q + wq - 2]
- mov [left_topq], maskd
- movzx maskd, word [src2q + wq - 2]
- mov [leftq], maskd
- RET
-
%macro SUB_HFYU_MEDIAN_PRED_INT16 1 ; u,s for pmaxuw vs pmaxsw
cglobal sub_hfyu_median_pred_int16, 7,7,6, dst, src1, src2, mask, w, left, left_top
movd xm5, maskd
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
index 7289e94bc7..c46be95cb9 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -31,8 +31,6 @@ void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src
unsigned mask, int w);
void ff_diff_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w);
-void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
- unsigned mask, int w, int *left, int *left_top);
void ff_sub_hfyu_median_pred_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
unsigned mask, int w, int *left, int *left_top);
void ff_sub_hfyu_median_pred_int16_avx2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
@@ -42,10 +40,6 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int wid
{
av_unused int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_MMXEXT(cpu_flags) && bpp < 16) {
- c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
- }
-
if (EXTERNAL_SSE2(cpu_flags)) {
c->diff_int16 = ff_diff_int16_sse2;
if (bpp < 16 && width >= 8)
diff --git a/tests/checkasm/huffyuvencdsp.c b/tests/checkasm/huffyuvencdsp.c
index a74b4295d6..b5d02cda6d 100644
--- a/tests/checkasm/huffyuvencdsp.c
+++ b/tests/checkasm/huffyuvencdsp.c
@@ -42,8 +42,8 @@ static void check_sub_hfyu_median_pred_int16(const char *aligned, unsigned width
static const int bpps[] = { 9, 16, };
HuffYUVEncDSPContext c;
- declare_func_emms(AV_CPU_FLAG_MMXEXT, void, uint16_t *dst, const uint16_t *src1,
- const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+ declare_func(void, uint16_t *dst, const uint16_t *src1,
+ const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
for (size_t i = 0; i < FF_ARRAY_ELEMS(bpps); ++i) {
const int bpp = bpps[i];
--
2.52.0
>From e7fd30a2dbe801b29fa7a9395f588870e98d2556 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 26 Feb 2026 02:46:40 +0100
Subject: [PATCH 10/10] avcodec/x86/huffyuvencdsp_init: Remove pointless
av_unused
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavcodec/x86/huffyuvencdsp_init.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c
index c46be95cb9..b4dd69bd28 100644
--- a/libavcodec/x86/huffyuvencdsp_init.c
+++ b/libavcodec/x86/huffyuvencdsp_init.c
@@ -38,7 +38,7 @@ void ff_sub_hfyu_median_pred_int16_avx2(uint16_t *dst, const uint16_t *src1, con
av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, int bpp, int width)
{
- av_unused int cpu_flags = av_get_cpu_flags();
+ int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
c->diff_int16 = ff_diff_int16_sse2;
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2026-02-26 3:02 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-26 2:59 [FFmpeg-devel] [PR] x86/huffyuvencdsp: Remove mmxext sub_hfyu_median_pred_int16, add sse2, avx2 (PR #22288) mkver via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git