From: mkver via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: mkver <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] avfilter/vf_fspp: Add checkasm, port to SSE2, fix big-endian (PR #20909)
Date: Thu, 13 Nov 2025 11:55:11 -0000
Message-ID: <176303491254.25.8423688227525973225@2cb04c0e5124> (raw)
PR #20909 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20909
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20909.patch
>From 92fe3d96e6f9a3b169a3edcdb48ecdc543ba862e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 9 Nov 2025 17:06:46 +0100
Subject: [PATCH 01/23] avfilter/vf_fspp: Add DSPCtx, move DSP functions to
file of their own
This is in preparation for adding checkasm tests; without it,
checkasm would pull all of libavfilter in.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/Makefile | 2 +-
libavfilter/vf_fspp.c | 399 +++---------------------
libavfilter/vf_fsppdsp.c | 369 ++++++++++++++++++++++
libavfilter/{vf_fspp.h => vf_fsppdsp.h} | 85 +++--
libavfilter/x86/vf_fspp_init.c | 4 +-
5 files changed, 455 insertions(+), 404 deletions(-)
create mode 100644 libavfilter/vf_fsppdsp.c
rename libavfilter/{vf_fspp.h => vf_fsppdsp.h} (52%)
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 69d74183b2..d56a458e45 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -329,7 +329,7 @@ OBJS-$(CONFIG_FRAMESTEP_FILTER) += vf_framestep.o
OBJS-$(CONFIG_FREEZEDETECT_FILTER) += vf_freezedetect.o
OBJS-$(CONFIG_FREEZEFRAMES_FILTER) += vf_freezeframes.o
OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o
-OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o qp_table.o
+OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o vf_fsppdsp.o qp_table.o
OBJS-$(CONFIG_FSYNC_FILTER) += vf_fsync.o
OBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
OBJS-$(CONFIG_GBLUR_VULKAN_FILTER) += vf_gblur_vulkan.o vulkan.o vulkan_filter.o
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 6b4a715367..9371c63e77 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -41,12 +41,40 @@
#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
+#include "libavutil/video_enc_params.h"
+#include "avfilter.h"
#include "filters.h"
#include "qp_table.h"
-#include "vf_fspp.h"
+#include "vf_fsppdsp.h"
#include "video.h"
+#define BLOCKSZ 12
+#define MAX_LEVEL 5
+
+typedef struct FSPPContext { // filter context of fspp; the option table addresses its fields through OFFSET()
+ const struct AVClass *class;
+ uint64_t threshold_mtx_noq[8 * 2]; // unscaled thresholds; accessed as 64 int16_t, input of dsp.mul_thrmat()
+ uint64_t threshold_mtx[8 * 2]; // thresholds scaled by the current quantizer (output of dsp.mul_thrmat(), input of dsp.column_fidct()); used in both C & MMX (& later SSE2) versions
+
+ int log2_count; // the store_slice functions are called with log2_scale = 5 - log2_count
+ int strength;
+ int hsub;
+ int vsub;
+ int temp_stride;
+ int qp; // when nonzero, one constant quantizer is used instead of the per-block QP table
+ enum AVVideoEncParamsType qscale_type; // passed to ff_norm_qscale() with every QP-table entry
+ int prev_q; // quantizer that threshold_mtx was last computed for
+ uint8_t *src; // pixel buffer read by dsp.row_fdct()
+ int16_t *temp; // 16-bit accumulation buffer: written by dsp.row_idct(), drained by dsp.store_slice()/store_slice2()
+ int8_t *non_b_qp_table;
+ int non_b_qp_stride;
+ int use_bframe_qp;
+
+ FSPPDSPContext dsp; // DSP function pointers, filled in by ff_fsppdsp_init() in config_input()
+} FSPPContext;
+
+
#define OFFSET(x) offsetof(FSPPContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
static const AVOption fspp_options[] = {
@@ -59,17 +87,6 @@ static const AVOption fspp_options[] = {
AVFILTER_DEFINE_CLASS(fspp);
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
- { 0, 48, 12, 60, 3, 51, 15, 63, },
- { 32, 16, 44, 28, 35, 19, 47, 31, },
- { 8, 56, 4, 52, 11, 59, 7, 55, },
- { 40, 24, 36, 20, 43, 27, 39, 23, },
- { 2, 50, 14, 62, 1, 49, 13, 61, },
- { 34, 18, 46, 30, 33, 17, 45, 29, },
- { 10, 58, 6, 54, 9, 57, 5, 53, },
- { 42, 26, 38, 22, 41, 25, 37, 21, },
-};
-
static const short custom_threshold[64] = {
// values (296) can't be too high
// -it causes too big quant dependence
@@ -84,73 +101,6 @@ static const short custom_threshold[64] = {
20, 27, 26, 23, 20, 15, 11, 5
};
-//This func reads from 1 slice, 1 and clears 0 & 1
-static void store_slice_c(uint8_t *dst, int16_t *src,
- ptrdiff_t dst_stride, ptrdiff_t src_stride,
- ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
-{
- int y, x;
-#define STORE(pos) \
- temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
- src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
- if (temp & 0x100) temp = ~(temp >> 31); \
- dst[x + pos] = temp;
-
- for (y = 0; y < height; y++) {
- const uint8_t *d = dither[y];
- for (x = 0; x < width; x += 8) {
- int temp;
- STORE(0);
- STORE(1);
- STORE(2);
- STORE(3);
- STORE(4);
- STORE(5);
- STORE(6);
- STORE(7);
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-//This func reads from 2 slices, 0 & 2 and clears 2-nd
-static void store_slice2_c(uint8_t *dst, int16_t *src,
- ptrdiff_t dst_stride, ptrdiff_t src_stride,
- ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
-{
- int y, x;
-#define STORE2(pos) \
- temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
- src[x + pos + 16 * src_stride] = 0; \
- if (temp & 0x100) temp = ~(temp >> 31); \
- dst[x + pos] = temp;
-
- for (y = 0; y < height; y++) {
- const uint8_t *d = dither[y];
- for (x = 0; x < width; x += 8) {
- int temp;
- STORE2(0);
- STORE2(1);
- STORE2(2);
- STORE2(3);
- STORE2(4);
- STORE2(5);
- STORE2(6);
- STORE2(7);
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
-{
- int a;
- for (a = 0; a < 64; a++)
- thr_adr[a] = q * thr_adr_noq[a];
-}
-
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
int dst_stride, int src_stride,
int width, int height,
@@ -197,13 +147,13 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
if (qy < 0) qy = 0;
qy = (qy >> qpsv) * qp_stride;
- p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
+ p->dsp.row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
- p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
+ p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
if (p->qp)
- p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+ p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
else
for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same
@@ -213,288 +163,42 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
t = qp_store[qy + (t >> qpsh)];
t = ff_norm_qscale(t, p->qscale_type);
- if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
- p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+ if (t != p->prev_q) p->prev_q = t, p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
+ p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
}
- p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
+ p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
}
es = width + 8 - x0; // 8, ...
if (es > 8)
- p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
+ p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
- p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
+ p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
if (es > 3)
- p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
+ p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
if (!(y1 & 7) && y1) {
if (y1 & 8)
- p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
- dst_stride, stride, width, 8, 5 - p->log2_count);
+ p->dsp.store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
+ dst_stride, stride, width, 8, 5 - p->log2_count);
else
- p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
- dst_stride, stride, width, 8, 5 - p->log2_count);
+ p->dsp.store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
+ dst_stride, stride, width, 8, 5 - p->log2_count);
}
}
if (y & 7) { // height % 8 != 0
if (y & 8)
- p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
- dst_stride, stride, width, y&7, 5 - p->log2_count);
+ p->dsp.store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
+ dst_stride, stride, width, y&7, 5 - p->log2_count);
else
- p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
+ p->dsp.store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
dst_stride, stride, width, y&7, 5 - p->log2_count);
}
}
-static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
-{
- int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int_simd16_t tmp10, tmp11, tmp12, tmp13;
- int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
- int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
-
- int16_t *dataptr;
- int16_t *wsptr;
- int16_t *threshold;
- int ctr;
-
- dataptr = data;
- wsptr = output;
-
- for (; cnt > 0; cnt -= 2) { //start positions
- threshold = (int16_t *)thr_adr;//threshold_mtx
- for (ctr = DCTSIZE; ctr > 0; ctr--) {
- // Process columns from input, add to output.
- tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
- tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
-
- tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
- tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
-
- tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
- tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
-
- tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
- tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
-
- // Even part of FDCT
-
- tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
- tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
-
- d0 = tmp10 + tmp11;
- d4 = tmp10 - tmp11;
-
- z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
- d2 = tmp13 + z1;
- d6 = tmp13 - z1;
-
- // Even part of IDCT
-
- THRESHOLD(tmp0, d0, threshold[0 * 8]);
- THRESHOLD(tmp1, d2, threshold[2 * 8]);
- THRESHOLD(tmp2, d4, threshold[4 * 8]);
- THRESHOLD(tmp3, d6, threshold[6 * 8]);
- tmp0 += 2;
- tmp10 = (tmp0 + tmp2) >> 2;
- tmp11 = (tmp0 - tmp2) >> 2;
-
- tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
- tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
-
- tmp0 = tmp10 + tmp13; //->temps
- tmp3 = tmp10 - tmp13; //->temps
- tmp1 = tmp11 + tmp12; //->temps
- tmp2 = tmp11 - tmp12; //->temps
-
- // Odd part of FDCT
-
- tmp10 = tmp4 + tmp5;
- tmp11 = tmp5 + tmp6;
- tmp12 = tmp6 + tmp7;
-
- z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
- z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
- z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
- z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
-
- z11 = tmp7 + z3;
- z13 = tmp7 - z3;
-
- d5 = z13 + z2;
- d3 = z13 - z2;
- d1 = z11 + z4;
- d7 = z11 - z4;
-
- // Odd part of IDCT
-
- THRESHOLD(tmp4, d1, threshold[1 * 8]);
- THRESHOLD(tmp5, d3, threshold[3 * 8]);
- THRESHOLD(tmp6, d5, threshold[5 * 8]);
- THRESHOLD(tmp7, d7, threshold[7 * 8]);
-
- //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
- z13 = tmp6 + tmp5;
- z10 = (tmp6 - tmp5) << 1;
- z11 = tmp4 + tmp7;
- z12 = (tmp4 - tmp7) << 1;
-
- tmp7 = (z11 + z13) >> 2; //+2 !
- tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
- z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
- tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
- tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
-
- tmp6 = tmp12 - tmp7;
- tmp5 = tmp11 - tmp6;
- tmp4 = tmp10 + tmp5;
-
- wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
- wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
- wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
- wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
- wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
- wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
- wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
- wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
- //
- dataptr++; //next column
- wsptr++;
- threshold++;
- }
- dataptr += 8; //skip each second start pos
- wsptr += 8;
- }
-}
-
-static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
-{
- int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int_simd16_t tmp10, tmp11, tmp12, tmp13;
- int_simd16_t z5, z10, z11, z12, z13;
- int16_t *outptr;
- int16_t *wsptr;
-
- cnt *= 4;
- wsptr = workspace;
- outptr = output_adr;
- for (; cnt > 0; cnt--) {
- // Even part
- //Simd version reads 4x4 block and transposes it
- tmp10 = wsptr[2] + wsptr[3];
- tmp11 = wsptr[2] - wsptr[3];
-
- tmp13 = wsptr[0] + wsptr[1];
- tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
-
- tmp0 = tmp10 + tmp13; //->temps
- tmp3 = tmp10 - tmp13; //->temps
- tmp1 = tmp11 + tmp12;
- tmp2 = tmp11 - tmp12;
-
- // Odd part
- //Also transpose, with previous:
- // ---- ---- ||||
- // ---- ---- idct ||||
- // ---- ---- ---> ||||
- // ---- ---- ||||
- z13 = wsptr[4] + wsptr[5];
- z10 = wsptr[4] - wsptr[5];
- z11 = wsptr[6] + wsptr[7];
- z12 = wsptr[6] - wsptr[7];
-
- tmp7 = z11 + z13;
- tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
-
- z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
- tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
- tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
-
- tmp6 = (tmp12 << 3) - tmp7;
- tmp5 = (tmp11 << 3) - tmp6;
- tmp4 = (tmp10 << 3) + tmp5;
-
- // Final output stage: descale and write column
- outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
- outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
- outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
- outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
- outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
- outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
- outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
- outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
- outptr++;
-
- wsptr += DCTSIZE; // advance pointer to next row
- }
-}
-
-static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
-{
- int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int_simd16_t tmp10, tmp11, tmp12, tmp13;
- int_simd16_t z1, z2, z3, z4, z5, z11, z13;
- int16_t *dataptr;
-
- cnt *= 4;
- // Pass 1: process rows.
-
- dataptr = data;
- for (; cnt > 0; cnt--) {
- tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
- tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
- tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
- tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
- tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
- tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
- tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
- tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
-
- // Even part
-
- tmp10 = tmp0 + tmp3;
- tmp13 = tmp0 - tmp3;
- tmp11 = tmp1 + tmp2;
- tmp12 = tmp1 - tmp2;
- //Even columns are written first, this leads to different order of columns
- //in column_fidct(), but they are processed independently, so all ok.
- //Later in the row_idct() columns are read in the same order.
- dataptr[2] = tmp10 + tmp11;
- dataptr[3] = tmp10 - tmp11;
-
- z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
- dataptr[0] = tmp13 + z1;
- dataptr[1] = tmp13 - z1;
-
- // Odd part
-
- tmp10 = (tmp4 + tmp5) << 2;
- tmp11 = (tmp5 + tmp6) << 2;
- tmp12 = (tmp6 + tmp7) << 2;
-
- z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
- z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
- z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
- z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
-
- z11 = tmp7 + z3;
- z13 = tmp7 - z3;
-
- dataptr[4] = z13 + z2;
- dataptr[5] = z13 - z2;
- dataptr[6] = z11 + z4;
- dataptr[7] = z11 - z4;
-
- pixels++; // advance pointer to next column
- dataptr += DCTSIZE;
- }
-}
-
static const enum AVPixelFormat pix_fmts[] = {
AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P,
AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV411P,
@@ -522,16 +226,7 @@ static int config_input(AVFilterLink *inlink)
if (!fspp->temp || !fspp->src)
return AVERROR(ENOMEM);
- fspp->store_slice = store_slice_c;
- fspp->store_slice2 = store_slice2_c;
- fspp->mul_thrmat = mul_thrmat_c;
- fspp->column_fidct = column_fidct_c;
- fspp->row_idct = row_idct_c;
- fspp->row_fdct = row_fdct_c;
-
-#if ARCH_X86
- ff_fspp_init_x86(fspp);
-#endif
+ ff_fsppdsp_init(&fspp->dsp);
return 0;
}
@@ -567,7 +262,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
}
if (fspp->qp)
- fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
+ fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
/* if we are not in a constant user quantizer mode and we don't want to use
* the quantizers from the B-frames (B-frames often have a higher QP), we
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
new file mode 100644
index 0000000000..ab31c77203
--- /dev/null
+++ b/libavfilter/vf_fsppdsp.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+ * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+
+#include "vf_fsppdsp.h"
+
+#include "libavutil/mathematics.h"
+#include "libavutil/mem_internal.h"
+
+#define DCTSIZE 8 // samples per transform; also the element stride between rows of the int16_t work blocks
+// FIX(x,s): real constant x as fixed point with s fractional bits; 0.5 is added before the value is converted to an integer type.
+#define FIX(x,s) ((x) * (1 << s) + 0.5)
+/* MULTIPLY16H(x,k): product of x and k, shifted right by 16 bits.
+ * THRESHOLD(r,x,t): for non-negative t, r = x when x lies outside [-t, t],
+ * otherwise r = 0; both bounds are tested with one unsigned comparison.
+ * DESCALE(x,n): right shift by n after adding half of (1 << n), i.e. with rounding.
+ * NOTE(review): s, t and n are not parenthesized in the expansions, and
+ * THRESHOLD expands to a bare if/else with its own trailing semicolon, so it
+ * is only safe with simple operands and as a complete statement. */
+#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
+#define THRESHOLD(r,x,t) \
+ if(((unsigned)((x) + t)) > t * 2) r = (x); \
+ else r = 0;
+#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n)
+/* int_simd16_t is the type of every intermediate in the transforms below.
+ * The FIX_* multipliers use 14 fractional bits, except the last four, which
+ * use 13; FIX_2_613125930 is built from the negated value -2.613125930. */
+typedef int32_t int_simd16_t;
+static const int16_t FIX_0_382683433 = FIX(0.382683433, 14);
+static const int16_t FIX_0_541196100 = FIX(0.541196100, 14);
+static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14);
+static const int16_t FIX_1_306562965 = FIX(1.306562965, 14);
+static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14);
+static const int16_t FIX_1_847759065 = FIX(1.847759065, 13);
+static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13);
+static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13);
+static const int16_t FIX_1_082392200 = FIX(1.082392200, 13);
+/* 8x8 table holding each value 0..63 once. The store_slice functions pick
+ * row y, shift each entry right by log2_scale and add it before the final
+ * down-shift to 8 bits. */
+DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
+ { 0, 48, 12, 60, 3, 51, 15, 63, },
+ { 32, 16, 44, 28, 35, 19, 47, 31, },
+ { 8, 56, 4, 52, 11, 59, 7, 55, },
+ { 40, 24, 36, 20, 43, 27, 39, 23, },
+ { 2, 50, 14, 62, 1, 49, 13, 61, },
+ { 34, 18, 46, 30, 33, 17, 45, 29, },
+ { 10, 58, 6, 54, 9, 57, 5, 53, },
+ { 42, 26, 38, 22, 41, 25, 37, 21, },
+};
+
+/* Convert `height` rows of 16-bit accumulated sums in src to 8-bit pixels in dst.
+ * Each sample becomes (src + (dither >> log2_scale)) >> (6 - log2_scale);
+ * a result with bit 8 set is replaced by 0 when negative and by all-ones
+ * (0xFF once stored to dst) otherwise.
+ * Every sample that was read is then zeroed, together with the sample
+ * 8 * src_stride elements before it.
+ * In the author's terms: reads from one slice (slice 1) and clears slices 0 & 1.
+ * NOTE(review): the bit-8 test only clamps values in [-256, 511] - presumably
+ * the accumulators never leave that range after the shift; confirm. */
+void ff_store_slice_c(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{
+#define STORE(pos) \
+ temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
+ src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
+ if (temp & 0x100) temp = ~(temp >> 31); \
+ dst[x + pos] = temp;
+
+ for (int y = 0; y < height; y++) { // y also indexes dither[], so height must not exceed 8
+ const uint8_t *d = dither[y];
+ for (int x = 0; x < width; x += 8) { // always handles 8 samples, so a width that is not a multiple of 8 is rounded up
+ int temp;
+ STORE(0);
+ STORE(1);
+ STORE(2);
+ STORE(3);
+ STORE(4);
+ STORE(5);
+ STORE(6);
+ STORE(7);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+/* Like ff_store_slice_c(), but each output pixel is formed from the sum of
+ * two accumulators: the sample at src and the sample 16 * src_stride
+ * elements after it. Only the second of the two is zeroed afterwards; the
+ * sample at src itself is left untouched.
+ * In the author's terms: reads from two slices (0 & 2) and clears slice 2. */
+void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{
+#define STORE2(pos) \
+ temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
+ src[x + pos + 16 * src_stride] = 0; \
+ if (temp & 0x100) temp = ~(temp >> 31); \
+ dst[x + pos] = temp;
+
+ for (int y = 0; y < height; y++) { // y also indexes dither[], so height must not exceed 8
+ const uint8_t *d = dither[y];
+ for (int x = 0; x < width; x += 8) { // always handles 8 samples, so a width that is not a multiple of 8 is rounded up
+ int temp;
+ STORE2(0);
+ STORE2(1);
+ STORE2(2);
+ STORE2(3);
+ STORE2(4);
+ STORE2(5);
+ STORE2(6);
+ STORE2(7);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+/* Scale the threshold matrix by the quantizer:
+ * thr_adr[i] = q * thr_adr_noq[i] for all 64 entries.
+ * The product is stored to int16_t without any range check. */
+void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
+{
+ for (int a = 0; a < 64; a++)
+ thr_adr[a] = q * thr_adr_noq[a];
+}
+/* Column pass: for every column, run an 8-point forward DCT, zero the
+ * coefficients that do not exceed their threshold, run the inverse DCT and
+ * combine the result with output.
+ * thr_adr: 64 thresholds, read as threshold[k * 8] for coefficient k while
+ * the pointer advances by one per column.
+ * data: input samples, the 8 samples of a column are DCTSIZE elements apart.
+ * output: same layout as data; rows 0-5 are accumulated (+=), rows 6 and 7
+ * are overwritten (=).
+ * cnt: decremented by 2 per outer iteration; each outer iteration
+ * handles 8 columns and then skips 8 further elements in both
+ * data and output. */
+void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
+{
+ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int_simd16_t tmp10, tmp11, tmp12, tmp13;
+ int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
+ int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
+
+ int16_t *dataptr;
+ int16_t *wsptr;
+ int16_t *threshold;
+
+ dataptr = data;
+ wsptr = output;
+
+ for (; cnt > 0; cnt -= 2) { //start positions
+ threshold = (int16_t *)thr_adr;//threshold_mtx, restarted for every group of 8 columns
+ for (int ctr = DCTSIZE; ctr > 0; ctr--) {
+ // Process columns from input, add to output.
+ tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+ tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+
+ tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+ tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+
+ tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+ tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+
+ tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+ tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+ // Even part of FDCT
+
+ tmp10 = tmp0 + tmp3;
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ d0 = tmp10 + tmp11;
+ d4 = tmp10 - tmp11;
+
+ z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781); // << 2 makes up for the 14-bit constant under the 16-bit shift
+ d2 = tmp13 + z1;
+ d6 = tmp13 - z1;
+
+ // Even part of IDCT
+
+ THRESHOLD(tmp0, d0, threshold[0 * 8]); // coefficients within +-threshold become 0
+ THRESHOLD(tmp1, d2, threshold[2 * 8]);
+ THRESHOLD(tmp2, d4, threshold[4 * 8]);
+ THRESHOLD(tmp3, d6, threshold[6 * 8]);
+ tmp0 += 2; // rounding offset for the >> 2 on the next two lines
+ tmp10 = (tmp0 + tmp2) >> 2;
+ tmp11 = (tmp0 - tmp2) >> 2;
+
+ tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
+ tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
+
+ tmp0 = tmp10 + tmp13; //->temps
+ tmp3 = tmp10 - tmp13; //->temps
+ tmp1 = tmp11 + tmp12; //->temps
+ tmp2 = tmp11 - tmp12; //->temps
+
+ // Odd part of FDCT
+
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
+ z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
+ z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
+ z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
+
+ z11 = tmp7 + z3;
+ z13 = tmp7 - z3;
+
+ d5 = z13 + z2;
+ d3 = z13 - z2;
+ d1 = z11 + z4;
+ d7 = z11 - z4;
+
+ // Odd part of IDCT
+
+ THRESHOLD(tmp4, d1, threshold[1 * 8]);
+ THRESHOLD(tmp5, d3, threshold[3 * 8]);
+ THRESHOLD(tmp6, d5, threshold[5 * 8]);
+ THRESHOLD(tmp7, d7, threshold[7 * 8]);
+
+ //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
+ z13 = tmp6 + tmp5;
+ z10 = (tmp6 - tmp5) << 1;
+ z11 = tmp4 + tmp7;
+ z12 = (tmp4 - tmp7) << 1;
+
+ tmp7 = (z11 + z13) >> 2; //+2 !
+ tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
+ z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+ tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
+ tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !! (the constant itself is negative)
+
+ tmp6 = tmp12 - tmp7;
+ tmp5 = tmp11 - tmp6;
+ tmp4 = tmp10 + tmp5;
+
+ wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
+ wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
+ wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
+ wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
+ wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
+ wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
+ wsptr[DCTSIZE * 6] = (tmp1 - tmp6); // plain store: rows 6 and 7 are overwritten, not accumulated
+ wsptr[DCTSIZE * 7] = (tmp0 - tmp7); // NOTE(review): presumably no earlier block has contributed to these rows yet - confirm against filter()
+ // advance all three pointers to the next column
+ dataptr++; //next column
+ wsptr++;
+ threshold++;
+ }
+ dataptr += 8; //skip each second start pos
+ wsptr += 8;
+ }
+}
+/* Second stage of the inverse transform.
+ * Processes 4 * cnt groups of DCTSIZE consecutive coefficients from
+ * workspace. Each group yields 8 values that are added (+=) to output_adr
+ * along one column, output_stride elements apart; successive groups go to
+ * successive columns.
+ * Within a group the even-part inputs are read from indices 2, 3, 0, 1 and
+ * the odd-part inputs from indices 4..7. */
+void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
+{
+ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int_simd16_t tmp10, tmp11, tmp12, tmp13;
+ int_simd16_t z5, z10, z11, z12, z13;
+ int16_t *outptr;
+ int16_t *wsptr;
+
+ cnt *= 4; // cnt is given in units of 4 coefficient groups
+ wsptr = workspace;
+ outptr = output_adr;
+ for (; cnt > 0; cnt--) {
+ // Even part
+ //Simd version reads 4x4 block and transposes it
+ tmp10 = wsptr[2] + wsptr[3];
+ tmp11 = wsptr[2] - wsptr[3];
+
+ tmp13 = wsptr[0] + wsptr[1];
+ tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
+
+ tmp0 = tmp10 + tmp13; //->temps
+ tmp3 = tmp10 - tmp13; //->temps
+ tmp1 = tmp11 + tmp12;
+ tmp2 = tmp11 - tmp12;
+
+ // Odd part
+ //Also transpose, with previous:
+ // ---- ---- ||||
+ // ---- ---- idct ||||
+ // ---- ---- ---> ||||
+ // ---- ---- ||||
+ z13 = wsptr[4] + wsptr[5];
+ z10 = wsptr[4] - wsptr[5];
+ z11 = wsptr[6] + wsptr[7];
+ z12 = wsptr[6] - wsptr[7];
+
+ tmp7 = z11 + z13;
+ tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
+
+ z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+ tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
+ tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_ (the constant itself is negative)
+
+ tmp6 = (tmp12 << 3) - tmp7; // << 3 makes up for the 13-bit constants under the 16-bit shift
+ tmp5 = (tmp11 << 3) - tmp6;
+ tmp4 = (tmp10 << 3) + tmp5;
+
+ // Final output stage: descale and write column
+ outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
+ outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
+ outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
+ outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
+ outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
+ outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
+ outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
+ outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
+ outptr++;
+
+ wsptr += DCTSIZE; // advance pointer to next row
+ }
+}
+/* First stage of the forward transform.
+ * Processes 4 * cnt pixel columns: for each one, reads 8 samples that are
+ * line_size bytes apart, transforms them and writes DCTSIZE consecutive
+ * coefficients to data, then moves one pixel to the right.
+ * The even-part results are stored at indices 2, 3, 0, 1 and the odd-part
+ * results at indices 4..7. */
+void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
+{
+ int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int_simd16_t tmp10, tmp11, tmp12, tmp13;
+ int_simd16_t z1, z2, z3, z4, z5, z11, z13;
+ int16_t *dataptr;
+
+ cnt *= 4; // cnt is given in units of 4 pixel columns
+ // Pass 1: process rows.
+
+ dataptr = data;
+ for (; cnt > 0; cnt--) {
+ tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
+ tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
+ tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
+ tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
+ tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
+ tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
+ tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
+ tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
+
+ // Even part
+
+ tmp10 = tmp0 + tmp3;
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+ //Even columns are written first, this leads to different order of columns
+ //in column_fidct(), but they are processed independently, so all ok.
+ //Later in the row_idct() columns are read in the same order.
+ dataptr[2] = tmp10 + tmp11;
+ dataptr[3] = tmp10 - tmp11;
+
+ z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781); // << 2 makes up for the 14-bit constant under the 16-bit shift
+ dataptr[0] = tmp13 + z1;
+ dataptr[1] = tmp13 - z1;
+
+ // Odd part
+
+ tmp10 = (tmp4 + tmp5) << 2;
+ tmp11 = (tmp5 + tmp6) << 2;
+ tmp12 = (tmp6 + tmp7) << 2;
+
+ z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
+ z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
+ z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
+ z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
+
+ z11 = tmp7 + z3;
+ z13 = tmp7 - z3;
+
+ dataptr[4] = z13 + z2;
+ dataptr[5] = z13 - z2;
+ dataptr[6] = z11 + z4;
+ dataptr[7] = z11 - z4;
+
+ pixels++; // advance pointer to next column
+ dataptr += DCTSIZE;
+ }
+}
diff --git a/libavfilter/vf_fspp.h b/libavfilter/vf_fsppdsp.h
similarity index 52%
rename from libavfilter/vf_fspp.h
rename to libavfilter/vf_fsppdsp.h
index ee7de3ffef..c441b75094 100644
--- a/libavfilter/vf_fspp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -20,56 +20,17 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
-#ifndef AVFILTER_FSPP_H
-#define AVFILTER_FSPP_H
+#ifndef AVFILTER_FSPPDSP_H
+#define AVFILTER_FSPPDSP_H
-#include "libavutil/video_enc_params.h"
-#include "avfilter.h"
+#include <stddef.h>
+#include <stdint.h>
-#define BLOCKSZ 12
-#define MAX_LEVEL 5
+#include "config.h"
-#define DCTSIZE 8
-#define DCTSIZE_S "8"
-
-#define FIX(x,s) ((x) * (1 << s) + 0.5)
-
-#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
-#define THRESHOLD(r,x,t) \
- if(((unsigned)((x) + t)) > t * 2) r = (x); \
- else r = 0;
-#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n)
-
-typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433 = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100 = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14);
-static const int16_t FIX_1_306562965 = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14);
-static const int16_t FIX_1_847759065 = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13);
-static const int16_t FIX_1_082392200 = FIX(1.082392200, 13);
-
-typedef struct FSPPContext {
- AVClass *class;
- uint64_t threshold_mtx_noq[8 * 2];
- uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions
-
- int log2_count;
- int strength;
- int hsub;
- int vsub;
- int temp_stride;
- int qp;
- enum AVVideoEncParamsType qscale_type;
- int prev_q;
- uint8_t *src;
- int16_t *temp;
- int8_t *non_b_qp_table;
- int non_b_qp_stride;
- int use_bframe_qp;
+#include "libavutil/attributes_internal.h"
+typedef struct FSPPDSPContext {
void (*store_slice)(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
@@ -88,9 +49,35 @@ typedef struct FSPPContext {
void (*row_fdct)(int16_t *data, const uint8_t *pixels,
ptrdiff_t line_size, int cnt);
+} FSPPDSPContext;
-} FSPPContext;
+FF_VISIBILITY_PUSH_HIDDEN
+void ff_store_slice_c(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
-void ff_fspp_init_x86(FSPPContext *fspp);
+void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
+FF_VISIBILITY_POP_HIDDEN
-#endif /* AVFILTER_FSPP_H */
+static inline void ff_fsppdsp_init(FSPPDSPContext *fspp)
+{ // Fill every function pointer with its C implementation first.
+ fspp->store_slice = ff_store_slice_c;
+ fspp->store_slice2 = ff_store_slice2_c;
+ fspp->mul_thrmat = ff_mul_thrmat_c;
+ fspp->column_fidct = ff_column_fidct_c;
+ fspp->row_idct = ff_row_idct_c;
+ fspp->row_fdct = ff_row_fdct_c;
+
+#if ARCH_X86
+ ff_fsppdsp_init_x86(fspp); // x86 builds only; presumably replaces some pointers depending on CPU flags - see x86/vf_fspp_init.c
+#endif
+}
+
+#endif /* AVFILTER_FSPPDSP_H */
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 8e00317cb7..2aadb50967 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -21,7 +21,7 @@
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
-#include "libavfilter/vf_fspp.h"
+#include "libavfilter/vf_fsppdsp.h"
void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
@@ -34,7 +34,7 @@ void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int c
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
-av_cold void ff_fspp_init_x86(FSPPContext *s)
+av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
--
2.49.1
>From 4f3d8ea9d11842357998cca26f502831d5d5c9c0 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 9 Nov 2025 17:22:21 +0100
Subject: [PATCH 02/23] avfilter/vf_fsppdsp: Use enum for constants
It means that the compiler does not have to optimize the static const
objects away.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.c | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index ab31c77203..d2d04463b4 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -29,7 +29,7 @@
#define DCTSIZE 8
-#define FIX(x,s) ((x) * (1 << s) + 0.5)
+#define FIX(x,s) (int)((x) * (1 << s) + 0.5)
#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
#define THRESHOLD(r,x,t) \
@@ -38,15 +38,18 @@
#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n)
typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433 = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100 = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14);
-static const int16_t FIX_1_306562965 = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14);
-static const int16_t FIX_1_847759065 = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13);
-static const int16_t FIX_1_082392200 = FIX(1.082392200, 13);
+
+enum {
+ FIX_0_382683433 = FIX(0.382683433, 14),
+ FIX_0_541196100 = FIX(0.541196100, 14),
+ FIX_0_707106781 = FIX(M_SQRT1_2 , 14),
+ FIX_1_306562965 = FIX(1.306562965, 14),
+ FIX_1_414213562_A = FIX(M_SQRT2 , 14),
+ FIX_1_847759065 = FIX(1.847759065, 13),
+ FIX_2_613125930 = FIX(-2.613125930, 13),
+ FIX_1_414213562 = FIX(M_SQRT2 , 13),
+ FIX_1_082392200 = FIX(1.082392200, 13),
+};
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
{ 0, 48, 12, 60, 3, 51, 15, 63, },
--
2.49.1
>From 787c89a3ac68fa1d023e2f06c653b55ba26f0917 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 9 Nov 2025 17:27:16 +0100
Subject: [PATCH 03/23] avfilter/x86/vf_fspp: Don't duplicate dither table
Reuse the one from vf_fsppdsp.c; also don't overalign said table too
much.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.c | 6 +++---
libavfilter/vf_fsppdsp.h | 2 ++
libavfilter/x86/vf_fspp.asm | 9 +++------
3 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index d2d04463b4..b84d7b57bb 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -51,7 +51,7 @@ enum {
FIX_1_082392200 = FIX(1.082392200, 13),
};
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
+DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
{ 0, 48, 12, 60, 3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{ 8, 56, 4, 52, 11, 59, 7, 55, },
@@ -74,7 +74,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
dst[x + pos] = temp;
for (int y = 0; y < height; y++) {
- const uint8_t *d = dither[y];
+ const uint8_t *d = ff_fspp_dither[y];
for (int x = 0; x < width; x += 8) {
int temp;
STORE(0);
@@ -103,7 +103,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
dst[x + pos] = temp;
for (int y = 0; y < height; y++) {
- const uint8_t *d = dither[y];
+ const uint8_t *d = ff_fspp_dither[y];
for (int x = 0; x < width; x += 8) {
int temp;
STORE2(0);
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index c441b75094..0dbd628abf 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -52,6 +52,8 @@ typedef struct FSPPDSPContext {
} FSPPDSPContext;
FF_VISIBILITY_PUSH_HIDDEN
+extern const uint8_t ff_fspp_dither[8][8];
+
void ff_store_slice_c(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index c7f8f64f1b..0ea6216193 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -25,10 +25,7 @@
SECTION_RODATA
-pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
- 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \
- 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
- 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21
+cextern fspp_dither
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
@@ -73,7 +70,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
sub tmp2q, widthq
movd m2, ditherd ; log2_scale
add tmp2q, tmp2q
- lea ditherq, [pb_dither]
+ lea ditherq, [fspp_dither]
mov src_strideq, tmp2q
shl tmpq, 4
lea dither_heightq, [ditherq+dither_heightq*8]
@@ -139,7 +136,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
sub tmp2q, widthq
movd m2, ditherd ; log2_scale
add tmp2q, tmp2q
- lea ditherq, [pb_dither]
+ lea ditherq, [fspp_dither]
mov src_strideq, tmp2q
shl tmpq, 5
lea dither_heightq, [ditherq+dither_heightq*8]
--
2.49.1
>From 659b75505b3b0e03a20701f7f8ebf77dd954205b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 9 Nov 2025 18:50:48 +0100
Subject: [PATCH 04/23] tests/checkasm: Add vf_fspp mul_thrmat test
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 +++
tests/checkasm/checkasm.h | 1 +
tests/checkasm/vf_fspp.c | 52 +++++++++++++++++++++++++++++++++++++++
tests/fate/checkasm.mak | 1 +
5 files changed, 58 insertions(+)
create mode 100644 tests/checkasm/vf_fspp.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index e47070d90f..6636bc7774 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -64,6 +64,7 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o
AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o
+AVFILTEROBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o
AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o
AVFILTEROBJS-$(CONFIG_IDET_FILTER) += vf_idet.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 4469e043f5..20d8f19757 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -297,6 +297,9 @@ static const struct {
#if CONFIG_EQ_FILTER
{ "vf_eq", checkasm_check_vf_eq },
#endif
+ #if CONFIG_FSPP_FILTER
+ { "vf_fspp", checkasm_check_vf_fspp },
+ #endif
#if CONFIG_GBLUR_FILTER
{ "vf_gblur", checkasm_check_vf_gblur },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index e1ccd4011b..45cd23cac4 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -148,6 +148,7 @@ void checkasm_check_v210enc(void);
void checkasm_check_vc1dsp(void);
void checkasm_check_vf_bwdif(void);
void checkasm_check_vf_eq(void);
+void checkasm_check_vf_fspp(void);
void checkasm_check_vf_gblur(void);
void checkasm_check_vf_hflip(void);
void checkasm_check_vf_threshold(void);
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
new file mode 100644
index 0000000000..a84ae8d5af
--- /dev/null
+++ b/tests/checkasm/vf_fspp.c
@@ -0,0 +1,52 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "checkasm.h"
+#include "libavfilter/vf_fsppdsp.h"
+
+#define randomize_buffers(buf) \
+ do { \
+ for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \
+ buf[j] = rnd(); \
+ } while (0)
+
+
+static void check_mul_thrmat(void)
+{
+ FSPPDSPContext fspp;
+ int16_t src[64];
+ int16_t dst_ref[64], dst_new[64];
+ const int q = (uint8_t)rnd();
+ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+
+ ff_fsppdsp_init(&fspp);
+
+ if (check_func(fspp.mul_thrmat, "mul_thrmat")) {
+ randomize_buffers(src);
+ call_ref(src, dst_ref, q);
+ call_new(src, dst_new, q);
+ if (memcmp(dst_ref, dst_new, sizeof(dst_ref)))
+ fail();
+ bench_new(src, dst_new, q);
+ }
+}
+
+void checkasm_check_vf_fspp(void)
+{
+ check_mul_thrmat();
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index ca1cd0dea3..2be880c8db 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -67,6 +67,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \
fate-checkasm-vf_colordetect \
fate-checkasm-vf_colorspace \
fate-checkasm-vf_eq \
+ fate-checkasm-vf_fspp \
fate-checkasm-vf_gblur \
fate-checkasm-vf_hflip \
fate-checkasm-vf_nlmeans \
--
2.49.1
>From bd0b98cc10caea569331eae8fd1af13d4d546ddb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Sun, 9 Nov 2025 19:10:30 +0100
Subject: [PATCH 05/23] avfilter/x86/vf_fspp: Port mul_thrmat to SSE2
This fixes an ABI violation, as mul_thrmat did not issue emms.
It seems that this ABI violation could reach the user, namely
if ff_get_video_buffer() fails. Notice that ff_get_video_buffer()
itself could fail because of this, namely if the allocator uses
floating point registers.
On x64 (where GCC already used SSE2 in the C version):
mul_thrmat_c: 4.4 ( 1.00x)
mul_thrmat_mmx: 8.6 ( 0.52x)
mul_thrmat_sse2: 4.4 ( 1.00x)
On 32bit (where SSE2 is not known to be available):
mul_thrmat_c: 56.0 ( 1.00x)
mul_thrmat_sse2: 6.0 ( 9.40x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fspp.c | 5 +-
libavfilter/vf_fsppdsp.h | 3 +-
libavfilter/x86/vf_fspp.asm | 84 +++++++++++++---------------------
libavfilter/x86/vf_fspp_init.c | 6 ++-
tests/checkasm/vf_fspp.c | 8 ++--
5 files changed, 45 insertions(+), 61 deletions(-)
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9371c63e77..fa562cbd45 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -54,8 +54,6 @@
typedef struct FSPPContext {
const struct AVClass *class;
- uint64_t threshold_mtx_noq[8 * 2];
- uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions
int log2_count;
int strength;
@@ -72,6 +70,9 @@ typedef struct FSPPContext {
int use_bframe_qp;
FSPPDSPContext dsp;
+
+ DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
+ DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
} FSPPContext;
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 0dbd628abf..e87fa6861c 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,7 +39,8 @@ typedef struct FSPPDSPContext {
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
- void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+ void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
+ int16_t *thr_adr /* align 16 */, int q);
void (*column_fidct)(int16_t *thr_adr, int16_t *data,
int16_t *output, int cnt);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 0ea6216193..c9408978d8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -177,59 +177,36 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
jl .loop_height
RET
-;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
- movd m7, qd
- movq m0, [thrnq]
- punpcklwd m7, m7
- movq m1, [thrnq+8]
- punpckldq m7, m7
- pmullw m0, m7
- movq m2, [thrnq+8*2]
- pmullw m1, m7
- movq m3, [thrnq+8*3]
- pmullw m2, m7
- movq [thrq], m0
- movq m4, [thrnq+8*4]
- pmullw m3, m7
- movq [thrq+8], m1
- movq m5, [thrnq+8*5]
- pmullw m4, m7
- movq [thrq+8*2], m2
- movq m6, [thrnq+8*6]
- pmullw m5, m7
- movq [thrq+8*3], m3
- movq m0, [thrnq+8*7]
- pmullw m6, m7
- movq [thrq+8*4], m4
- movq m1, [thrnq+8*7+8]
- pmullw m0, m7
- movq [thrq+8*5], m5
- movq m2, [thrnq+8*7+8*2]
- pmullw m1, m7
- movq [thrq+8*6], m6
- movq m3, [thrnq+8*7+8*3]
- pmullw m2, m7
- movq [thrq+8*7], m0
- movq m4, [thrnq+8*7+8*4]
- pmullw m3, m7
- movq [thrq+8*7+8], m1
- movq m5, [thrnq+8*7+8*5]
- pmullw m4, m7
- movq [thrq+8*7+8*2], m2
- movq m6, [thrnq+8*7+8*6]
- pmullw m5, m7
- movq [thrq+8*7+8*3], m3
- movq m0, [thrnq+14*8]
- pmullw m6, m7
- movq [thrq+8*7+8*4], m4
- movq m1, [thrnq+14*8+8]
- pmullw m0, m7
- movq [thrq+8*7+8*5], m5
- pmullw m1, m7
- movq [thrq+8*7+8*6], m6
- movq [thrq+14*8], m0
- movq [thrq+14*8+8], m1
+;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+INIT_XMM sse2
+cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
+ movd m4, qd
+ mova m0, [thrnq]
+ punpcklwd m4, m4
+ mova m1, [thrnq+16]
+ pshufd m4, m4, 0
+ pmullw m0, m4
+ mova m2, [thrnq+16*2]
+ pmullw m1, m4
+ mova m3, [thrnq+16*3]
+ pmullw m2, m4
+ mova [thrq], m0
+ mova m0, [thrnq+16*4]
+ pmullw m3, m4
+ mova [thrq+16], m1
+ mova m1, [thrnq+16*5]
+ pmullw m0, m4
+ mova [thrq+16*2], m2
+ mova m2, [thrnq+16*6]
+ pmullw m1, m4
+ mova [thrq+16*3], m3
+ mova m3, [thrnq+16*7]
+ pmullw m2, m4
+ mova [thrq+16*4], m0
+ pmullw m3, m4
+ mova [thrq+16*5], m1
+ mova [thrq+16*6], m2
+ mova [thrq+16*7], m3
RET
%macro COLUMN_FDCT 1-3 0, 0
@@ -457,6 +434,7 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
add outq, 8+%1
%endmacro
+INIT_MMX mmx
;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
.fdct1:
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 2aadb50967..9f6095ce24 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,7 +29,7 @@ void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
@@ -41,9 +41,11 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
if (EXTERNAL_MMX(cpu_flags)) {
s->store_slice = ff_store_slice_mmx;
s->store_slice2 = ff_store_slice2_mmx;
- s->mul_thrmat = ff_mul_thrmat_mmx;
s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ s->mul_thrmat = ff_mul_thrmat_sse2;
+ }
}
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index a84ae8d5af..117e1c670e 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -18,6 +18,7 @@
#include "checkasm.h"
#include "libavfilter/vf_fsppdsp.h"
+#include "libavutil/mem_internal.h"
#define randomize_buffers(buf) \
do { \
@@ -29,10 +30,11 @@
static void check_mul_thrmat(void)
{
FSPPDSPContext fspp;
- int16_t src[64];
- int16_t dst_ref[64], dst_new[64];
+ DECLARE_ALIGNED(16, int16_t, src)[64];
+ DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
+ DECLARE_ALIGNED(16, int16_t, dst_new)[64];
const int q = (uint8_t)rnd();
- declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+ declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
ff_fsppdsp_init(&fspp);
--
2.49.1
>From cd9e9ca3c1126d820bf6108c939b9911f2e72bd9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 10 Nov 2025 12:54:31 +0100
Subject: [PATCH 06/23] avfilter/vf_fsppdsp: Use standard clamping
This is obviously what is intended and what the MMX code does;
yet I cannot rule out that it changes the output for some inputs:
I have observed individual src values which would lead to temp
values just above 512 if they came in pairs (i.e. if both inputs
were simultaneously huge).
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index b84d7b57bb..f3f7c87174 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -24,6 +24,7 @@
#include "vf_fsppdsp.h"
+#include "libavutil/common.h"
#include "libavutil/mathematics.h"
#include "libavutil/mem_internal.h"
@@ -70,7 +71,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
#define STORE(pos) \
temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
- if (temp & 0x100) temp = ~(temp >> 31); \
+ temp = av_clip_uint8(temp); \
dst[x + pos] = temp;
for (int y = 0; y < height; y++) {
@@ -99,7 +100,7 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
#define STORE2(pos) \
temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
src[x + pos + 16 * src_stride] = 0; \
- if (temp & 0x100) temp = ~(temp >> 31); \
+ temp = av_clip_uint8(temp); \
dst[x + pos] = temp;
for (int y = 0; y < height; y++) {
--
2.49.1
>From c90066ba04c4f8ff8471f99d80c8cda68a491b63 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 10 Nov 2025 21:57:45 +0100
Subject: [PATCH 07/23] tests/checkasm/vf_fspp: Test store_slice
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/vf_fspp.c | 77 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 77 insertions(+)
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index 117e1c670e..eab62c9450 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -16,8 +16,12 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
+#include <stddef.h>
+#include <stdint.h>
+
#include "checkasm.h"
#include "libavfilter/vf_fsppdsp.h"
+#include "libavcodec/mathops.h"
#include "libavutil/mem_internal.h"
#define randomize_buffers(buf) \
@@ -26,6 +30,78 @@
buf[j] = rnd(); \
} while (0)
+#define randomize_mask_buffers(buf, buf2, nb_elems, nb_bits)\
+ do { \
+ for (size_t j = 0; j < nb_elems; ++j) \
+ buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
+ } while (0)
+
+static void check_store_slice(void)
+{
+ enum {
+ MAX_WIDTH = 256,
+ /// in elements, not in bytes; 32 is arbitrary
+ MAX_STRIDE = MAX_WIDTH + 32,
+ MAX_HEIGHT = 8,
+ };
+ FSPPDSPContext fspp;
+ ff_fsppdsp_init(&fspp);
+ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+
+ for (int i = 0; i < 2; ++i) {
+ if (check_func(i ? fspp.store_slice2 : fspp.store_slice, "store_slice%s", i ? "2" : "")) {
+ // store_slice resets the row eight lines above the current one
+ DECLARE_ALIGNED(16, int16_t, src_ref1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
+ DECLARE_ALIGNED(16, int16_t, src_new1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
+ // store_slice2 resets the row 16 lines below the current one
+ DECLARE_ALIGNED(16, int16_t, src_ref2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
+ DECLARE_ALIGNED(16, int16_t, src_new2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
+ uint8_t dstbuf_new[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH], dstbuf_ref[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH];
+ uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
+ int16_t *src_ref, *src_new, *or_src_ref, *or_src_new;
+ ptrdiff_t width = 1 + rnd() % MAX_WIDTH;
+ ptrdiff_t src_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
+ ptrdiff_t dst_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
+ ptrdiff_t height = 1 + rnd() % 8;
+ size_t nb_elems;
+
+ if (i) {
+ src_ref = src_ref2;
+ src_new = src_new2;
+ or_src_ref = src_ref2;
+ or_src_new = src_new2;
+ nb_elems = FF_ARRAY_ELEMS(src_ref2);
+ } else {
+ src_ref = src_ref1 + 8 * src_stride;
+ src_new = src_new1 + 8 * src_stride;
+ or_src_ref = src_ref1;
+ or_src_new = src_new1;
+ nb_elems = FF_ARRAY_ELEMS(src_ref1);
+ }
+ if (rnd() & 1) {
+ dst_ref += dst_stride * (height - 1);
+ dst_new += dst_stride * (height - 1);
+ dst_stride *= -1;
+ }
+ randomize_buffers(dstbuf_new);
+ memcpy(dstbuf_ref, dstbuf_new, sizeof(dstbuf_ref));
+ randomize_mask_buffers(or_src_ref, or_src_new, nb_elems, 14);
+
+ ptrdiff_t log2_scale = rnd() & 1;
+ call_ref(dst_ref, src_ref, dst_stride, src_stride, width, height, log2_scale);
+ call_new(dst_new, src_new, dst_stride, src_stride, width, height, log2_scale);
+ if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)) ||
+ memcmp(or_src_ref, or_src_new, sizeof(*or_src_new) * nb_elems))
+ fail();
+ // don't use random parameters for benchmarks
+ src_ref = or_src_ref + !i * 8 * MAX_STRIDE;
+ bench_new(dstbuf_new, src_ref,
+ MAX_STRIDE, MAX_STRIDE, MAX_WIDTH, 8, 1);
+ }
+ }
+}
static void check_mul_thrmat(void)
{
@@ -50,5 +126,6 @@ static void check_mul_thrmat(void)
void checkasm_check_vf_fspp(void)
{
+ check_store_slice();
check_mul_thrmat();
}
--
2.49.1
>From e67ee1a479f984274016ffacc71aae7ac636417c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 10 Nov 2025 22:06:34 +0100
Subject: [PATCH 08/23] avfilter/x86/vf_fspp: Port store_slice to SSE2
Old benchmarks:
store_slice_c: 2798.3 ( 1.00x)
store_slice_mmx: 950.2 ( 2.94x)
store_slice2_c: 3811.7 ( 1.00x)
store_slice2_mmx: 682.3 ( 5.59x)
New benchmarks:
store_slice_c: 2797.2 ( 1.00x)
store_slice_sse2: 543.5 ( 5.15x)
store_slice2_c: 3817.0 ( 1.00x)
store_slice2_sse2: 408.2 ( 9.35x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.h | 4 +-
libavfilter/x86/vf_fspp.asm | 70 +++++++++++++---------------------
libavfilter/x86/vf_fspp_init.c | 12 +++---
3 files changed, 34 insertions(+), 52 deletions(-)
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index e87fa6861c..b440809f02 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -31,11 +31,11 @@
#include "libavutil/attributes_internal.h"
typedef struct FSPPDSPContext {
- void (*store_slice)(uint8_t *dst, int16_t *src,
+ void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
- void (*store_slice2)(uint8_t *dst, int16_t *src,
+ void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index c9408978d8..489e69f8ce 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -43,15 +43,15 @@ SECTION .text
%define DCTSIZE 8
-INIT_MMX mmx
+INIT_XMM sse2
-;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
-; ptrdiff_t dst_stride, ptrdiff_t src_stride,
-; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
+; ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
%if ARCH_X86_64
-cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
-cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
mov widthq, r4m
@@ -62,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
mov tmpq, src_strideq
and widthq, ~7
sub dst_strideq, widthq
- movd m5, ditherd ; log2_scale
+ movd m4, ditherd ; log2_scale
xor ditherq, -1 ; log2_scale
mov tmp2q, tmpq
add ditherq, 7 ; log2_scale
@@ -74,29 +74,21 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
mov src_strideq, tmp2q
shl tmpq, 4
lea dither_heightq, [ditherq+dither_heightq*8]
- pxor m7, m7
+ pxor m1, m1
.loop_height:
movq m3, [ditherq]
- movq m4, m3
- punpcklbw m3, m7
- punpckhbw m4, m7
+ punpcklbw m3, m1
mov tmp2q, widthq
- psraw m3, m5
- psraw m4, m5
+ psraw m3, m4
.loop_width:
- movq [srcq+tmpq], m7
- movq m0, [srcq]
- movq m1, [srcq+8]
- movq [srcq+tmpq+8], m7
+ mova m0, [srcq]
+ mova [srcq+tmpq], m1
paddw m0, m3
- paddw m1, m4
- movq [srcq], m7
+ mova [srcq], m1
psraw m0, m2
- psraw m1, m2
- movq [srcq+8], m7
- packuswb m0, m1
+ packuswb m0, m0
add srcq, 16
movq [dstq], m0
add dstq, 8
@@ -110,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
jl .loop_height
RET
-;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
-; ptrdiff_t dst_stride, ptrdiff_t src_stride,
-; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+; ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
%if ARCH_X86_64
-cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
-cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
mov dstq, dstm
@@ -129,7 +121,7 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
mov tmpq, src_strideq
and widthq, ~7
sub dst_strideq, widthq
- movd m5, ditherd ; log2_scale
+ movd m4, ditherd ; log2_scale
xor ditherq, -1 ; log2_scale
mov tmp2q, tmpq
add ditherq, 7 ; log2_scale
@@ -140,30 +132,21 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
mov src_strideq, tmp2q
shl tmpq, 5
lea dither_heightq, [ditherq+dither_heightq*8]
- pxor m7, m7
+ pxor m1, m1
.loop_height:
movq m3, [ditherq]
- movq m4, m3
- punpcklbw m3, m7
- punpckhbw m4, m7
+ punpcklbw m3, m1
mov tmp2q,widthq
- psraw m3, m5
- psraw m4, m5
+ psraw m3, m4
.loop_width:
- movq m0, [srcq]
- movq m1, [srcq+8]
+ mova m0, [srcq]
paddw m0, m3
paddw m0, [srcq+tmpq]
- paddw m1, m4
- movq m6, [srcq+tmpq+8]
- movq [srcq+tmpq], m7
+ mova [srcq+tmpq], m1
psraw m0, m2
- paddw m1, m6
- movq [srcq+tmpq+8], m7
- psraw m1, m2
- packuswb m0, m1
+ packuswb m0, m0
movq [dstq], m0
add srcq, 16
add dstq, 8
@@ -178,7 +161,6 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
RET
;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-INIT_XMM sse2
cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
movd m4, qd
mova m0, [thrnq]
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index 9f6095ce24..ee875547d2 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -23,12 +23,12 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_fsppdsp.h"
-void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
- ptrdiff_t dst_stride, ptrdiff_t src_stride,
- ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
@@ -39,13 +39,13 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
- s->store_slice = ff_store_slice_mmx;
- s->store_slice2 = ff_store_slice2_mmx;
s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ s->store_slice = ff_store_slice_sse2;
+ s->store_slice2 = ff_store_slice2_sse2;
s->mul_thrmat = ff_mul_thrmat_sse2;
}
}
--
2.49.1
>From d1b45c85cef16657579aafe61eebaf23c816c75d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Mon, 10 Nov 2025 23:03:23 +0100
Subject: [PATCH 09/23] avfilter/vf_fsppdsp: Use restrict
It is possible because the requirements are fulfilled;
it is also beneficial performance- and code-size-wise.
For GCC 14 (with -O3), this reduced codesize by 26750B
here; for Clang 20, it was 432B.
Old benchmarks:
mul_thrmat_c: 4.3 ( 1.00x)
mul_thrmat_sse2: 4.3 ( 1.00x)
store_slice_c: 2810.8 ( 1.00x)
store_slice_sse2: 542.5 ( 5.18x)
store_slice2_c: 3817.0 ( 1.00x)
store_slice2_sse2: 410.4 ( 9.30x)
New benchmarks:
mul_thrmat_c: 4.3 ( 1.00x)
mul_thrmat_sse2: 4.3 ( 1.00x)
store_slice_c: 1510.1 ( 1.00x)
store_slice_sse2: 545.2 ( 2.77x)
store_slice2_c: 1763.5 ( 1.00x)
store_slice2_sse2: 408.3 ( 4.32x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.c | 15 +++++++++------
libavfilter/vf_fsppdsp.h | 31 +++++++++++++++++--------------
2 files changed, 26 insertions(+), 20 deletions(-)
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index f3f7c87174..583571bf94 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -64,7 +64,7 @@ DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
};
//This func reads from 1 slice, 1 and clears 0 & 1
-void ff_store_slice_c(uint8_t *dst, int16_t *src,
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
@@ -93,7 +93,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
}
//This func reads from 2 slices, 0 & 2 and clears 2-nd
-void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
@@ -121,13 +121,14 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
}
}
-void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
+void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
{
for (int a = 0; a < 64; a++)
thr_adr[a] = q * thr_adr_noq[a];
}
-void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
+void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+ int16_t *restrict output, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -249,7 +250,8 @@ void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt
}
}
-void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
+void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+ ptrdiff_t output_stride, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -311,7 +313,8 @@ void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_str
}
}
-void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+ ptrdiff_t line_size, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index b440809f02..66030da4b1 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -31,40 +31,43 @@
#include "libavutil/attributes_internal.h"
typedef struct FSPPDSPContext {
- void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
+ void (*store_slice)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
- void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
+ void (*store_slice2)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
- void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
- int16_t *thr_adr /* align 16 */, int q);
+ void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
+ int16_t *restrict thr_adr /* align 16 */, int q);
- void (*column_fidct)(int16_t *thr_adr, int16_t *data,
- int16_t *output, int cnt);
+ void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
+ int16_t *restrict output, int cnt);
- void (*row_idct)(int16_t *workspace, int16_t *output_adr,
+ void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt);
- void (*row_fdct)(int16_t *data, const uint8_t *pixels,
+ void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt);
} FSPPDSPContext;
FF_VISIBILITY_PUSH_HIDDEN
extern const uint8_t ff_fspp_dither[8][8];
-void ff_store_slice_c(uint8_t *dst, int16_t *src,
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_store_slice2_c(uint8_t *dst, int16_t *src,
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
-void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q);
+void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+ int16_t *restrict output, int cnt);
+void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+ ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+ ptrdiff_t line_size, int cnt);
void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
FF_VISIBILITY_POP_HIDDEN
--
2.49.1
>From 50665134560dd4f7bc70dcde7b6c0c64af53a14b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 14:21:09 +0100
Subject: [PATCH 10/23] avfilter/vf_fsppdsp: Reduce discrepancies between C
code and x86 asm
The x86 assembly uses the following pattern to zero all
the values with abs<threshold:
x -= threshold;
x satu+= threshold (unsigned saturated addition)
x += threshold
x satu-= threshold (unsigned saturated subtraction)
The reference C code meanwhile zeroed everything
with abs <= threshold. This commit makes the C code behave
like the x86 assembly to reduce discrepancies between the two.
An alternative would be to require SSSE3, so that
one can use pabsw, pcmpgtw for abs>threshold, followed by
a pand with the original data. Or one could modify the thresholds
to make both equal.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 583571bf94..e530bcd06b 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -34,7 +34,7 @@
#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
#define THRESHOLD(r,x,t) \
- if(((unsigned)((x) + t)) > t * 2) r = (x); \
+ if (((unsigned)((x) + t)) >= t * 2) r = (x); \
else r = 0;
#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> n)
--
2.49.1
>From ab13abb61eae8bec9f90f26020c30824e4ef175e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 18:44:49 +0100
Subject: [PATCH 11/23] avfilter/x86/vf_fspp: Make ff_column_fidct_mmx()
bitexact
It currently is not, because the shortcut mode uses different rounding
than the C code (as well as the non-shortcut code).
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/x86/vf_fspp.asm | 36 ++++++++++++++++++------------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 489e69f8ce..2f49945c13 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -33,9 +33,6 @@ pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
-pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
-pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
-pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
pw_4: times 4 dw 4
pw_2: times 4 dw 2
@@ -315,31 +312,34 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
or tmpd, tmpd
jnz %1
movq m4, [rsp]
- movq m1, m0
- pmulhw m0, [pw_3642]
- movq m2, m1
- movq m5, [outq+DCTSIZE*0*2]
- movq m3, m2
- pmulhw m1, [pw_2441]
+ psraw m3, m0, 2
+ psllw m0, 1
+ mova m5, [outq+DCTSIZE*0*2]
+ pmulhw m1, m0, [pw_3B21]
+ pmulhw m2, m0, [pw_22A3]
+ pmulhw m0, [pw_2D41]
paddw m5, m4
movq m6, [rsp+8]
- psraw m3, 2
- pmulhw m2, [pw_0CBB]
+ psubw m2, m1
psubw m4, m3
movq m7, [outq+DCTSIZE*1*2]
paddw m5, m3
- movq [outq+DCTSIZE*7*2], m4
+ psubw m1, m3
+ mova [outq+DCTSIZE*7*2], m4
+ psubw m0, m1
+ paddw m2, m0
+ mova [outq+DCTSIZE*0*2], m5
paddw m7, m6
movq m3, [rsp+8*2]
- psubw m6, m0
+ psubw m6, m1
movq m4, [outq+DCTSIZE*2*2]
- paddw m7, m0
+ paddw m7, m1
movq [outq], m5
paddw m4, m3
movq [outq+DCTSIZE*6*2], m6
- psubw m3, m1
+ psubw m3, m0
movq m5, [outq+DCTSIZE*5*2]
- paddw m4, m1
+ paddw m4, m0
movq m6, [outq+DCTSIZE*3*2]
paddw m5, m3
movq m0, [rsp+8*3]
@@ -347,9 +347,9 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
movq [outq+DCTSIZE*1*2], m7
paddw m6, m0
movq [outq+DCTSIZE*2*2], m4
- psubw m0, m2
+ paddw m0, m2
movq m7, [outq+DCTSIZE*4*2]
- paddw m6, m2
+ psubw m6, m2
movq [outq+DCTSIZE*5*2], m5
paddw m7, m0
movq [outq+DCTSIZE*3*2], m6
--
2.49.1
>From 7b5a0acf7916e700e59a0e54401e0c4eb5f5e672 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 19:39:35 +0100
Subject: [PATCH 12/23] avfilter/x86/vf_fspp: Put shifts into constants
This avoids some shift instructions and also gives us more headroom
in the registers. In fact, I have proven to myself that everything
that is supposed to fit into 16 bits now actually does so.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/x86/vf_fspp.asm | 36 +++++++++++++++++-------------------
1 file changed, 17 insertions(+), 19 deletions(-)
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 2f49945c13..f61efc99f8 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -27,10 +27,13 @@ SECTION_RODATA
cextern fspp_dither
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
+pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
+pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
+pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_4: times 4 dw 4
@@ -211,12 +214,12 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psubw m2, m6
paddw m7, m1
movq m6, [thrq+4*16+%2]
- psllw m7, 2
+ psllw m7, 1
psubw m5, [thrq+%2]
psubw m2, m6
paddusw m5, [thrq+%2]
paddusw m2, m6
- pmulhw m7, [pw_2D41]
+ pmulhw m7, [pw_5A82]
paddw m5, [thrq+%2]
paddw m2, m6
psubusw m5, [thrq+%2]
@@ -261,15 +264,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m4, m0
movq m7, m3
psubw m3, m4
- psllw m3, 2
- psllw m7, 2
- pmulhw m3, [pw_187E]
+ psllw m7, 1
+ pmulhw m3, [pw_61F8]
psllw m4, 2
- pmulhw m7, [pw_22A3]
- psllw m2, 2
+ pmulhw m7, [pw_4546]
+ psllw m2, 1
pmulhw m4, [pw_539F]
paddw m5, m1
- pmulhw m2, [pw_2D41]
+ pmulhw m2, [pw_5A82]
psubw m6, m1
paddw m7, m3
movq [rsp+8], m5
@@ -313,11 +315,10 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
jnz %1
movq m4, [rsp]
psraw m3, m0, 2
- psllw m0, 1
mova m5, [outq+DCTSIZE*0*2]
- pmulhw m1, m0, [pw_3B21]
- pmulhw m2, m0, [pw_22A3]
- pmulhw m0, [pw_2D41]
+ pmulhw m1, m0, [pw_7642]
+ pmulhw m2, m0, [pw_4546]
+ pmulhw m0, [pw_5A82]
paddw m5, m4
movq m6, [rsp+8]
psubw m2, m1
@@ -360,23 +361,20 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
%macro COLUMN_IDCT 0-1 0
movq m3, m5
psubw m5, m1
- psllw m5, 1
paddw m3, m1
movq m2, m0
psubw m0, m6
- movq m1, m5
- psllw m0, 1
+ psllw m1, m5, 1
pmulhw m1, [pw_AC62]
paddw m5, m0
- pmulhw m5, [pw_3B21]
+ pmulhw m5, [pw_7642]
paddw m2, m6
- pmulhw m0, [pw_22A3]
+ pmulhw m0, [pw_4546]
movq m7, m2
movq m4, [rsp]
psubw m2, m3
- psllw m2, 1
paddw m7, m3
- pmulhw m2, [pw_2D41]
+ pmulhw m2, [pw_5A82]
movq m6, m4
psraw m7, 2
paddw m4, [outq]
--
2.49.1
>From 55342723889a54740920fb24d8ef2f83a7ec5b80 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 21:03:06 +0100
Subject: [PATCH 13/23] tests/checkasm/vf_fspp: Add test for column_fidct
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
tests/checkasm/vf_fspp.c | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index eab62c9450..f9e7b35e88 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -36,6 +36,12 @@
buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
} while (0)
+#define randomize_buffer_range(buf, min, max) \
+ do { \
+ for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \
+ buf[j] = min + rnd() % (max - min + 1); \
+ } while (0)
+
static void check_store_slice(void)
{
enum {
@@ -124,8 +130,41 @@ static void check_mul_thrmat(void)
}
}
+static void check_column_fidct(void)
+{
+ enum {
+ NB_BLOCKS = 8, ///< arbitrary
+ };
+ FSPPDSPContext fspp;
+ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
+ int16_t *output, int cnt);
+
+ ff_fsppdsp_init(&fspp);
+
+ if (check_func(fspp.column_fidct, "column_fidct")) {
+ DECLARE_ALIGNED(16, int16_t, threshold)[64];
+ DECLARE_ALIGNED(16, int16_t, src)[8*(8*NB_BLOCKS + 6)];
+ DECLARE_ALIGNED(16, int16_t, dst_new)[8*(8*NB_BLOCKS + 6)];
+ DECLARE_ALIGNED(16, int16_t, dst_ref)[8*(8*NB_BLOCKS + 6)];
+
+ randomize_buffer_range(threshold, 0, INT16_MAX);
+ randomize_buffer_range(src, -1284, 1284);
+ randomize_buffers(dst_new);
+ memcpy(dst_ref, dst_new, sizeof(dst_ref));
+
+ call_ref(threshold, src, dst_ref, NB_BLOCKS * 8);
+ call_new(threshold, src, dst_new, NB_BLOCKS * 8);
+
+ if (memcmp(dst_new, dst_ref, sizeof(dst_new)))
+ fail();
+
+ bench_new(threshold, src, dst_new, NB_BLOCKS * 8);
+ }
+}
+
void checkasm_check_vf_fspp(void)
{
check_store_slice();
check_mul_thrmat();
+ check_column_fidct();
}
--
2.49.1
>From 1c3f7376e10a466c018a51e589cbd9d46a1d3792 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 21:42:32 +0100
Subject: [PATCH 14/23] avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to
SSE2
It gains a lot because it has to operate on eight words, which fit
into a single XMM register; it also saves 608B of .text here.
Old benchmarks:
column_fidct_c: 3365.7 ( 1.00x)
column_fidct_mmx: 1784.6 ( 1.89x)
New benchmarks:
column_fidct_c: 3361.5 ( 1.00x)
column_fidct_sse2: 801.1 ( 4.20x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/x86/vf_fspp.asm | 209 ++++++++++++++++-----------------
libavfilter/x86/vf_fspp_init.c | 4 +-
tests/checkasm/vf_fspp.c | 4 +-
3 files changed, 107 insertions(+), 110 deletions(-)
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index f61efc99f8..3f37911722 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -26,18 +26,18 @@
SECTION_RODATA
cextern fspp_dither
+pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
+pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
+pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
+pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
+pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
+pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
+pw_2: times 8 dw 2
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
-pw_61F8: times 4 dw 0x61F8 ; 4*FIX(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
-pw_4546: times 4 dw 0x4546 ; 2*FIX(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
-pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
-pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
-pw_7642: times 4 dw 0x7642 ; 2*FIX(1.847759065, 13)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
-pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_4: times 4 dw 4
-pw_2: times 4 dw 2
SECTION .text
@@ -191,82 +191,83 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
mova [thrq+16*7], m3
RET
-%macro COLUMN_FDCT 1-3 0, 0
- movq m1, [srcq+DCTSIZE*0*2]
- movq m7, [srcq+DCTSIZE*3*2]
- movq m0, m1
+%macro COLUMN_FDCT 1
+ mova m1, [srcq+DCTSIZE*0*2]
+ mova m7, [srcq+DCTSIZE*3*2]
+ mova m0, m1
paddw m1, [srcq+DCTSIZE*7*2]
- movq m3, m7
+ mova m3, m7
paddw m7, [srcq+DCTSIZE*4*2]
- movq m5, m1
- movq m6, [srcq+DCTSIZE*1*2]
+ mova m5, m1
+ mova m6, [srcq+DCTSIZE*1*2]
psubw m1, m7
- movq m2, [srcq+DCTSIZE*2*2]
- movq m4, m6
+ mova m2, [srcq+DCTSIZE*2*2]
+ mova m4, m6
paddw m6, [srcq+DCTSIZE*6*2]
paddw m5, m7
paddw m2, [srcq+DCTSIZE*5*2]
- movq m7, m6
+ mova m7, m6
paddw m6, m2
psubw m7, m2
- movq m2, m5
+ mova m2, m5
paddw m5, m6
psubw m2, m6
paddw m7, m1
- movq m6, [thrq+4*16+%2]
+ mova m6, [thrq+4*16]
psllw m7, 1
- psubw m5, [thrq+%2]
+ psubw m5, [thrq]
psubw m2, m6
- paddusw m5, [thrq+%2]
+ paddusw m5, [thrq]
paddusw m2, m6
pmulhw m7, [pw_5A82]
- paddw m5, [thrq+%2]
+ paddw m5, [thrq]
paddw m2, m6
- psubusw m5, [thrq+%2]
+ psubusw m5, [thrq]
psubusw m2, m6
paddw m5, [pw_2]
- movq m6, m2
+ mova m6, m2
paddw m2, m5
psubw m5, m6
- movq m6, m1
+ mova m6, m1
paddw m1, m7
- psubw m1, [thrq+2*16+%2]
+ psubw m1, [thrq+2*16]
psubw m6, m7
- movq m7, [thrq+6*16+%2]
+ mova m7, [thrq+6*16]
psraw m5, 2
- paddusw m1, [thrq+2*16+%2]
+ paddusw m1, [thrq+2*16]
psubw m6, m7
- paddw m1, [thrq+2*16+%2]
+ paddw m1, [thrq+2*16]
paddusw m6, m7
- psubusw m1, [thrq+2*16+%2]
+ psubusw m1, [thrq+2*16]
paddw m6, m7
psubw m3, [srcq+DCTSIZE*4*2]
psubusw m6, m7
- movq m7, m1
+ mova m7, m1
psraw m2, 2
psubw m4, [srcq+DCTSIZE*6*2]
psubw m1, m6
psubw m0, [srcq+DCTSIZE*7*2]
paddw m6, m7
psraw m6, 2
- movq m7, m2
+ mova m7, m2
pmulhw m1, [pw_5A82]
paddw m2, m6
- movq [rsp], m2
+ mova [rsp], m2
psubw m7, m6
- movq m2, [srcq+DCTSIZE*2*2]
+ mova m2, [srcq+DCTSIZE*2*2]
psubw m1, m6
psubw m2, [srcq+DCTSIZE*5*2]
- movq m6, m5
- movq [rsp+8*3], m7
+ mova m6, m5
+ mova [rsp+16*3], m7
paddw m3, m2
paddw m2, m4
paddw m4, m0
- movq m7, m3
+ mova m7, m3
psubw m3, m4
psllw m7, 1
pmulhw m3, [pw_61F8]
psllw m4, 2
+ add srcq, 32
pmulhw m7, [pw_4546]
psllw m2, 1
pmulhw m4, [pw_539F]
@@ -274,25 +275,25 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
pmulhw m2, [pw_5A82]
psubw m6, m1
paddw m7, m3
- movq [rsp+8], m5
+ mova [rsp+16], m5
paddw m4, m3
- movq m3, [thrq+3*16+%2]
- movq m1, m0
- movq [rsp+8*2], m6
+ mova m3, [thrq+3*16]
+ mova m1, m0
+ mova [rsp+16*2], m6
psubw m1, m2
paddw m0, m2
- movq m5, m1
- movq m2, [thrq+5*16+%2]
+ mova m5, m1
+ mova m2, [thrq+5*16]
psubw m1, m7
paddw m5, m7
psubw m1, m3
- movq m7, [thrq+16+%2]
+ mova m7, [thrq+16]
psubw m5, m2
- movq m6, m0
+ mova m6, m0
paddw m0, m4
paddusw m1, m3
psubw m6, m4
- movq m4, [thrq+7*16+%2]
+ mova m4, [thrq+7*16]
psubw m0, m7
psubw m6, m4
paddusw m5, m2
@@ -303,27 +304,32 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psubusw m1, m3
psubusw m5, m2
psubusw m6, m4
- movq m4, m1
+ mova m4, m1
por m4, m5
paddusw m0, m7
por m4, m6
paddw m0, m7
packssdw m4, m4
psubusw m0, m7
- movd tmpd, m4
- or tmpd, tmpd
+%if ARCH_X86_64
+ movq tmpq, m4
+%else
+ packssdw m4, m4
+ movd tmpd, m4
+%endif
+ or tmpq, tmpq
jnz %1
- movq m4, [rsp]
+ mova m4, [rsp]
psraw m3, m0, 2
mova m5, [outq+DCTSIZE*0*2]
pmulhw m1, m0, [pw_7642]
pmulhw m2, m0, [pw_4546]
pmulhw m0, [pw_5A82]
paddw m5, m4
- movq m6, [rsp+8]
+ mova m6, [rsp+16]
psubw m2, m1
psubw m4, m3
- movq m7, [outq+DCTSIZE*1*2]
+ mova m7, [outq+DCTSIZE*1*2]
paddw m5, m3
psubw m1, m3
mova [outq+DCTSIZE*7*2], m4
@@ -331,38 +337,37 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m2, m0
mova [outq+DCTSIZE*0*2], m5
paddw m7, m6
- movq m3, [rsp+8*2]
+ mova m3, [rsp+16*2]
psubw m6, m1
- movq m4, [outq+DCTSIZE*2*2]
+ mova m4, [outq+DCTSIZE*2*2]
paddw m7, m1
- movq [outq], m5
+ mova [outq], m5
paddw m4, m3
- movq [outq+DCTSIZE*6*2], m6
+ mova [outq+DCTSIZE*6*2], m6
psubw m3, m0
- movq m5, [outq+DCTSIZE*5*2]
+ mova m5, [outq+DCTSIZE*5*2]
paddw m4, m0
- movq m6, [outq+DCTSIZE*3*2]
+ mova m6, [outq+DCTSIZE*3*2]
paddw m5, m3
- movq m0, [rsp+8*3]
- add srcq, 8+%3
- movq [outq+DCTSIZE*1*2], m7
+ mova m0, [rsp+16*3]
+ mova [outq+DCTSIZE*1*2], m7
paddw m6, m0
- movq [outq+DCTSIZE*2*2], m4
+ mova [outq+DCTSIZE*2*2], m4
paddw m0, m2
- movq m7, [outq+DCTSIZE*4*2]
+ mova m7, [outq+DCTSIZE*4*2]
psubw m6, m2
- movq [outq+DCTSIZE*5*2], m5
+ mova [outq+DCTSIZE*5*2], m5
paddw m7, m0
- movq [outq+DCTSIZE*3*2], m6
- movq [outq+DCTSIZE*4*2], m7
- add outq, 8+%3
+ mova [outq+DCTSIZE*3*2], m6
+ mova [outq+DCTSIZE*4*2], m7
+ add outq, 32
%endmacro
-%macro COLUMN_IDCT 0-1 0
- movq m3, m5
+%macro COLUMN_IDCT 0
+ mova m3, m5
psubw m5, m1
paddw m3, m1
- movq m2, m0
+ mova m2, m0
psubw m0, m6
psllw m1, m5, 1
pmulhw m1, [pw_AC62]
@@ -370,72 +375,64 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
pmulhw m5, [pw_7642]
paddw m2, m6
pmulhw m0, [pw_4546]
- movq m7, m2
- movq m4, [rsp]
+ mova m7, m2
+ mova m4, [rsp]
psubw m2, m3
paddw m7, m3
pmulhw m2, [pw_5A82]
- movq m6, m4
+ mova m6, m4
psraw m7, 2
paddw m4, [outq]
psubw m6, m7
- movq m3, [rsp+8]
+ mova m3, [rsp+16]
paddw m4, m7
- movq [outq+DCTSIZE*7*2], m6
+ mova [outq+DCTSIZE*7*2], m6
paddw m1, m5
- movq [outq], m4
+ mova [outq], m4
psubw m1, m7
- movq m7, [rsp+8*2]
+ mova m7, [rsp+16*2]
psubw m0, m5
- movq m6, [rsp+8*3]
- movq m5, m3
+ mova m6, [rsp+16*3]
+ mova m5, m3
paddw m3, [outq+DCTSIZE*1*2]
psubw m5, m1
psubw m2, m1
paddw m3, m1
- movq [outq+DCTSIZE*6*2], m5
- movq m4, m7
+ mova [outq+DCTSIZE*6*2], m5
+ mova m4, m7
paddw m7, [outq+DCTSIZE*2*2]
psubw m4, m2
paddw m4, [outq+DCTSIZE*5*2]
paddw m7, m2
- movq [outq+DCTSIZE*1*2], m3
+ mova [outq+DCTSIZE*1*2], m3
paddw m0, m2
- movq [outq+DCTSIZE*2*2], m7
- movq m1, m6
+ mova [outq+DCTSIZE*2*2], m7
+ mova m1, m6
paddw m6, [outq+DCTSIZE*4*2]
psubw m1, m0
paddw m1, [outq+DCTSIZE*3*2]
paddw m6, m0
- movq [outq+DCTSIZE*5*2], m4
- add srcq, 8+%1
- movq [outq+DCTSIZE*4*2], m6
- movq [outq+DCTSIZE*3*2], m1
- add outq, 8+%1
+ mova [outq+DCTSIZE*5*2], m4
+ mova [outq+DCTSIZE*4*2], m6
+ mova [outq+DCTSIZE*3*2], m1
+ add outq, 32
%endmacro
-INIT_MMX mmx
-;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
-.fdct1:
- COLUMN_FDCT .idct1
- jmp .fdct2
+;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
+.fdct:
+ COLUMN_FDCT .idct
+ sub cntd, 2
+ jg .fdct
+ RET
-.idct1:
+.idct:
COLUMN_IDCT
-
-.fdct2:
- COLUMN_FDCT .idct2, 8, 16
sub cntd, 2
- jg .fdct1
- RET
-
-.idct2:
- COLUMN_IDCT 16
- sub cntd, 2
- jg .fdct1
+ jg .fdct
RET
+INIT_MMX mmx
;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
add strideq, strideq
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index ee875547d2..c7a9b1799e 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -30,7 +30,7 @@ void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
@@ -39,7 +39,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
- s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
@@ -47,5 +46,6 @@ av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
s->store_slice = ff_store_slice_sse2;
s->store_slice2 = ff_store_slice2_sse2;
s->mul_thrmat = ff_mul_thrmat_sse2;
+ s->column_fidct = ff_column_fidct_sse2;
}
}
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index f9e7b35e88..b65a46247d 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -136,8 +136,8 @@ static void check_column_fidct(void)
NB_BLOCKS = 8, ///< arbitrary
};
FSPPDSPContext fspp;
- declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *thr_adr, int16_t *data,
- int16_t *output, int cnt);
+ declare_func(void, int16_t *thr_adr, int16_t *data,
+ int16_t *output, int cnt);
ff_fsppdsp_init(&fspp);
--
2.49.1
>From ce16476ecb7cbf7496a4fe8ece6c8d77f5bc3f31 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 22:44:28 +0100
Subject: [PATCH 15/23] avfilter/x86/vf_fspp: Avoid stack on x64
Possible due to the number of registers available on x64.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/x86/vf_fspp.asm | 78 ++++++++++++++++++++++++-------------
1 file changed, 52 insertions(+), 26 deletions(-)
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
index 3f37911722..cad44ed0bf 100644
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@@ -210,35 +210,47 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m6, m2
psubw m7, m2
mova m2, m5
+%if ARCH_X86_64
+ mova m8, [thrq]
+%define THRQ m8
+%else
+%define THRQ [thrq]
+%endif
paddw m5, m6
psubw m2, m6
paddw m7, m1
mova m6, [thrq+4*16]
psllw m7, 1
- psubw m5, [thrq]
+ psubw m5, THRQ
psubw m2, m6
- paddusw m5, [thrq]
+ paddusw m5, THRQ
paddusw m2, m6
- pmulhw m7, [pw_5A82]
- paddw m5, [thrq]
+ pmulhw m7, SQRT2
+ paddw m5, THRQ
paddw m2, m6
- psubusw m5, [thrq]
+ psubusw m5, THRQ
psubusw m2, m6
paddw m5, [pw_2]
mova m6, m2
paddw m2, m5
+%if ARCH_X86_64
+ mova m8, [thrq+2*16]
+%define THRQ m8
+%else
+%define THRQ [thrq+2*16]
+%endif
psubw m5, m6
mova m6, m1
paddw m1, m7
- psubw m1, [thrq+2*16]
+ psubw m1, THRQ
psubw m6, m7
mova m7, [thrq+6*16]
psraw m5, 2
- paddusw m1, [thrq+2*16]
+ paddusw m1, THRQ
psubw m6, m7
- paddw m1, [thrq+2*16]
+ paddw m1, THRQ
paddusw m6, m7
- psubusw m1, [thrq+2*16]
+ psubusw m1, THRQ
paddw m6, m7
psubw m3, [srcq+DCTSIZE*4*2]
psubusw m6, m7
@@ -250,15 +262,15 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m6, m7
psraw m6, 2
mova m7, m2
- pmulhw m1, [pw_5A82]
+ pmulhw m1, SQRT2
paddw m2, m6
- mova [rsp], m2
+ mova tmp0, m2
psubw m7, m6
mova m2, [srcq+DCTSIZE*2*2]
psubw m1, m6
psubw m2, [srcq+DCTSIZE*5*2]
mova m6, m5
- mova [rsp+16*3], m7
+ mova tmp3, m7
paddw m3, m2
paddw m2, m4
paddw m4, m0
@@ -272,14 +284,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psllw m2, 1
pmulhw m4, [pw_539F]
paddw m5, m1
- pmulhw m2, [pw_5A82]
+ pmulhw m2, SQRT2
psubw m6, m1
paddw m7, m3
- mova [rsp+16], m5
+ mova tmp1, m5
paddw m4, m3
mova m3, [thrq+3*16]
mova m1, m0
- mova [rsp+16*2], m6
+ mova tmp2, m6
psubw m1, m2
paddw m0, m2
mova m5, m1
@@ -319,14 +331,14 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
%endif
or tmpq, tmpq
jnz %1
- mova m4, [rsp]
+ mova m4, tmp0
psraw m3, m0, 2
mova m5, [outq+DCTSIZE*0*2]
pmulhw m1, m0, [pw_7642]
pmulhw m2, m0, [pw_4546]
- pmulhw m0, [pw_5A82]
+ pmulhw m0, SQRT2
paddw m5, m4
- mova m6, [rsp+16]
+ mova m6, tmp1
psubw m2, m1
psubw m4, m3
mova m7, [outq+DCTSIZE*1*2]
@@ -337,7 +349,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m2, m0
mova [outq+DCTSIZE*0*2], m5
paddw m7, m6
- mova m3, [rsp+16*2]
+ mova m3, tmp2
psubw m6, m1
mova m4, [outq+DCTSIZE*2*2]
paddw m7, m1
@@ -349,7 +361,7 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m4, m0
mova m6, [outq+DCTSIZE*3*2]
paddw m5, m3
- mova m0, [rsp+16*3]
+ mova m0, tmp3
mova [outq+DCTSIZE*1*2], m7
paddw m6, m0
mova [outq+DCTSIZE*2*2], m4
@@ -376,23 +388,23 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
paddw m2, m6
pmulhw m0, [pw_4546]
mova m7, m2
- mova m4, [rsp]
+ mova m4, tmp0
psubw m2, m3
paddw m7, m3
- pmulhw m2, [pw_5A82]
+ pmulhw m2, SQRT2
mova m6, m4
psraw m7, 2
paddw m4, [outq]
psubw m6, m7
- mova m3, [rsp+16]
+ mova m3, tmp1
paddw m4, m7
mova [outq+DCTSIZE*7*2], m6
paddw m1, m5
mova [outq], m4
psubw m1, m7
- mova m7, [rsp+16*2]
+ mova m7, tmp2
psubw m0, m5
- mova m6, [rsp+16*3]
+ mova m6, tmp3
mova m5, m3
paddw m3, [outq+DCTSIZE*1*2]
psubw m5, m1
@@ -419,7 +431,21 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
%endmacro
;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-cglobal column_fidct, 4, 5, 8, 64, thr, src, out, cnt, tmp
+cglobal column_fidct, 4, 5, 8+5*ARCH_X86_64, 64*!ARCH_X86_64, thr, src, out, cnt, tmp
+%if ARCH_X86_64
+ %define tmp0 m8
+ %define tmp1 m9
+ %define tmp2 m10
+ %define tmp3 m11
+ %define SQRT2 m12
+ mova m12, [pw_5A82]
+%else
+ %define tmp0 [rsp]
+ %define tmp1 [rsp+16]
+ %define tmp2 [rsp+2*16]
+ %define tmp3 [rsp+3*16]
+ %define SQRT2 [pw_5A82]
+%endif
.fdct:
COLUMN_FDCT .idct
sub cntd, 2
--
2.49.1
>From cfe9edb8bd267e1bcadad15a8fba244c866cc6bc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 23:05:30 +0100
Subject: [PATCH 16/23] avfilter/vf_fspp: Fix effective type violation
Also don't use unnecessarily large alignment; this avoids having to align
the stack.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fspp.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index fa562cbd45..3db7fe114e 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -114,9 +114,9 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
const int qpsh = 4 - p->hsub * !is_luma;
const int qpsv = 4 - p->vsub * !is_luma;
- DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
- int16_t *block = (int16_t *)block_align;
- int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
+ DECLARE_ALIGNED(16, int16_t, block_align)[8 * 8 * BLOCKSZ + 8 * 8 * BLOCKSZ];
+ int16_t *block = block_align;
+ int16_t *block3 = block_align + 8 * 8 * BLOCKSZ;
memset(block3, 0, 4 * 8 * BLOCKSZ);
--
2.49.1
>From 24019cd51376f55e5477b7f038dfffb779b9a21c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 23:15:24 +0100
Subject: [PATCH 17/23] avfilter/vf_fsppdsp: Constify
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.c | 30 +++++++++++++-----------------
libavfilter/vf_fsppdsp.h | 12 ++++++------
libavfilter/x86/vf_fspp_init.c | 6 +++---
tests/checkasm/vf_fspp.c | 4 ++--
4 files changed, 24 insertions(+), 28 deletions(-)
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index e530bcd06b..7fdc5ece25 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -121,13 +121,13 @@ void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
}
}
-void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
{
for (int a = 0; a < 64; a++)
thr_adr[a] = q * thr_adr_noq[a];
}
-void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
int16_t *restrict output, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
@@ -135,28 +135,26 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16_t *dataptr;
int16_t *wsptr;
int16_t *threshold;
- dataptr = data;
wsptr = output;
for (; cnt > 0; cnt -= 2) { //start positions
threshold = (int16_t *)thr_adr;//threshold_mtx
for (int ctr = DCTSIZE; ctr > 0; ctr--) {
// Process columns from input, add to output.
- tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
- tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+ tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
+ tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7];
- tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
- tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+ tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6];
+ tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6];
- tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
- tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+ tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5];
+ tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5];
- tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
- tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+ tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4];
+ tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4];
// Even part of FDCT
@@ -241,26 +239,24 @@ void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
//
- dataptr++; //next column
+ data++; //next column
wsptr++;
threshold++;
}
- dataptr += 8; //skip each second start pos
+ data += 8; //skip each second start pos
wsptr += 8;
}
}
-void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
int_simd16_t z5, z10, z11, z12, z13;
int16_t *outptr;
- int16_t *wsptr;
cnt *= 4;
- wsptr = workspace;
outptr = output_adr;
for (; cnt > 0; cnt--) {
// Even part
diff --git a/libavfilter/vf_fsppdsp.h b/libavfilter/vf_fsppdsp.h
index 66030da4b1..5a2f1af030 100644
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@@ -39,13 +39,13 @@ typedef struct FSPPDSPContext {
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
- void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
+ void (*mul_thrmat)(const int16_t *restrict thr_adr_noq /* align 16 */,
int16_t *restrict thr_adr /* align 16 */, int q);
- void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
+ void (*column_fidct)(const int16_t *restrict thr_adr, const int16_t *restrict data,
int16_t *restrict output, int cnt);
- void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
+ void (*row_idct)(const int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt);
void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
@@ -61,10 +61,10 @@ void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q);
-void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q);
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
int16_t *restrict output, int cnt);
-void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
+void ff_row_idct_c(const int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt);
void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt);
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
index c7a9b1799e..caf94b30d6 100644
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -29,9 +29,9 @@ void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_mmx(const int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
diff --git a/tests/checkasm/vf_fspp.c b/tests/checkasm/vf_fspp.c
index b65a46247d..341ce0fd37 100644
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@@ -116,7 +116,7 @@ static void check_mul_thrmat(void)
DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
DECLARE_ALIGNED(16, int16_t, dst_new)[64];
const int q = (uint8_t)rnd();
- declare_func(void, int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+ declare_func(void, const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
ff_fsppdsp_init(&fspp);
@@ -136,7 +136,7 @@ static void check_column_fidct(void)
NB_BLOCKS = 8, ///< arbitrary
};
FSPPDSPContext fspp;
- declare_func(void, int16_t *thr_adr, int16_t *data,
+ declare_func(void, const int16_t *thr_adr, const int16_t *data,
int16_t *output, int cnt);
ff_fsppdsp_init(&fspp);
--
2.49.1
>From c794b6db8dae32f228de7a123e5c79cc880868ca Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Wed, 12 Nov 2025 23:26:04 +0100
Subject: [PATCH 18/23] avfilter/x86/vf_spp: Fix comment
Forgotten in dcb28ed860166c9715afb7c71c70889e6b9b8c8d.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/x86/vf_spp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
index 48c3d25d7c..7dcf18ec7d 100644
--- a/libavfilter/x86/vf_spp.c
+++ b/libavfilter/x86/vf_spp.c
@@ -64,7 +64,7 @@ static void store_slice_sse2(uint8_t *dst, const int16_t *src,
}
}
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
av_cold void ff_spp_init_x86(SPPContext *s)
{
--
2.49.1
>From 4b047d8788cee8ff6ca8190c88d24937b5e7783c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 13 Nov 2025 10:48:23 +0100
Subject: [PATCH 19/23] avfilter/vf_fspp: Avoid casts, effective-type
violations
Maybe uint64_t has been used as a poor man's alignment specifier?
Anyway, reading a uint64_t via an lvalue of type int16_t (as happens
in the C versions of the dsp functions) is undefined behavior.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fspp.c | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 3db7fe114e..670e9288d9 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -37,6 +37,7 @@
#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
+#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
@@ -71,8 +72,8 @@ typedef struct FSPPContext {
FSPPDSPContext dsp;
- DECLARE_ALIGNED(16, uint64_t, threshold_mtx_noq)[8 * 2];
- DECLARE_ALIGNED(16, uint64_t, threshold_mtx)[8 * 2];
+ DECLARE_ALIGNED(16, int16_t, threshold_mtx_noq)[8 * 8];
+ DECLARE_ALIGNED(16, int16_t, threshold_mtx)[8 * 8];
} FSPPContext;
@@ -154,7 +155,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
if (p->qp)
- p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+ p->dsp.column_fidct(p->threshold_mtx, block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
else
for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same
@@ -164,8 +165,11 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
t = qp_store[qy + (t >> qpsh)];
t = ff_norm_qscale(t, p->qscale_type);
- if (t != p->prev_q) p->prev_q = t, p->dsp.mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
- p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+ if (t != p->prev_q) {
+ p->prev_q = t;
+ p->dsp.mul_thrmat(p->threshold_mtx_noq, p->threshold_mtx, t);
+ }
+ p->dsp.column_fidct(p->threshold_mtx, block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
}
p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
@@ -176,7 +180,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
if (es > 8)
p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
- p->dsp.column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
+ p->dsp.column_fidct(p->threshold_mtx, block, block3, es&(~1));
if (es > 3)
p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
@@ -251,19 +255,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
for (i = 0; i < 8; i++) {
- fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
+ AV_WN64A(&fspp->threshold_mtx_noq[8 * i], (uint64_t)custom_threshold_m[i * 8 + 2]
|(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
|(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
- |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
+ |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48));
- fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
+ AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4], (uint64_t)custom_threshold_m[i * 8 + 5]
|(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
|(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
- |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
+ |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48));
}
- if (fspp->qp)
- fspp->prev_q = fspp->qp, fspp->dsp.mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
+ if (fspp->qp) {
+ fspp->prev_q = fspp->qp;
+ fspp->dsp.mul_thrmat(fspp->threshold_mtx_noq, fspp->threshold_mtx, fspp->qp);
+ }
/* if we are not in a constant user quantizer mode and we don't want to use
* the quantizers from the B-frames (B-frames often have a higher QP), we
--
2.49.1
>From 6d4b85dc3c10f0e41410928db16af912e6945dc0 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 13 Nov 2025 11:02:56 +0100
Subject: [PATCH 20/23] avfilter/vf_fspp: Make output endian-independent
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fspp.c | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 670e9288d9..9e5c688fb2 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -37,7 +37,6 @@
#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
-#include "libavutil/intreadwrite.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
@@ -254,16 +253,15 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
- for (i = 0; i < 8; i++) {
- AV_WN64A(&fspp->threshold_mtx_noq[8 * i], (uint64_t)custom_threshold_m[i * 8 + 2]
- |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
- |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
- |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48));
-
- AV_WN64A(&fspp->threshold_mtx_noq[8 * i + 4], (uint64_t)custom_threshold_m[i * 8 + 5]
- |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
- |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
- |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48));
+ for (int i = 0; i < 64; i += 8) {
+ fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2];
+ fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6];
+ fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0];
+ fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4];
+ fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5];
+ fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3];
+ fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1];
+ fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7];
}
if (fspp->qp) {
--
2.49.1
>From c362c3e167b7a26c95e4cb6fe24f7eaf486bcb40 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 13 Nov 2025 11:18:28 +0100
Subject: [PATCH 21/23] avfilter/vf_fspp: Pre-reorder threshold table
Avoids reordering at runtime.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fspp.c | 38 +++++++++++++-------------------------
1 file changed, 13 insertions(+), 25 deletions(-)
diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 9e5c688fb2..cbf2e06d67 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -92,14 +92,16 @@ static const short custom_threshold[64] = {
// values (296) can't be too high
// -it causes too big quant dependence
// or maybe overflow(check), which results in some flashing
- 71, 296, 295, 237, 71, 40, 38, 19,
- 245, 193, 185, 121, 102, 73, 53, 27,
- 158, 129, 141, 107, 97, 73, 50, 26,
- 102, 116, 109, 98, 82, 66, 45, 23,
- 71, 94, 95, 81, 70, 56, 38, 20,
- 56, 77, 74, 66, 56, 44, 30, 15,
- 38, 53, 50, 45, 38, 30, 21, 11,
- 20, 27, 26, 23, 20, 15, 11, 5
+// reorder coefficients to the order in which columns are processed
+#define REORDER(a,b,c,d,e,f,g,h) c, g, a, e, f, d, b, h
+ REORDER( 71, 296, 295, 237, 71, 40, 38, 19),
+ REORDER(245, 193, 185, 121, 102, 73, 53, 27),
+ REORDER(158, 129, 141, 107, 97, 73, 50, 26),
+ REORDER(102, 116, 109, 98, 82, 66, 45, 23),
+ REORDER( 71, 94, 95, 81, 70, 56, 38, 20),
+ REORDER( 56, 77, 74, 66, 56, 44, 30, 15),
+ REORDER( 38, 53, 50, 45, 38, 30, 21, 11),
+ REORDER( 20, 27, 26, 23, 20, 15, 11, 5)
};
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
@@ -244,25 +246,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
int qp_stride = 0;
int8_t *qp_table = NULL;
- int i, bias;
int ret = 0;
- int custom_threshold_m[64];
- bias = (1 << 4) + fspp->strength;
-
- for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
- custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
-
- for (int i = 0; i < 64; i += 8) {
- fspp->threshold_mtx_noq[i + 0] = custom_threshold_m[i + 2];
- fspp->threshold_mtx_noq[i + 1] = custom_threshold_m[i + 6];
- fspp->threshold_mtx_noq[i + 2] = custom_threshold_m[i + 0];
- fspp->threshold_mtx_noq[i + 3] = custom_threshold_m[i + 4];
- fspp->threshold_mtx_noq[i + 4] = custom_threshold_m[i + 5];
- fspp->threshold_mtx_noq[i + 5] = custom_threshold_m[i + 3];
- fspp->threshold_mtx_noq[i + 6] = custom_threshold_m[i + 1];
- fspp->threshold_mtx_noq[i + 7] = custom_threshold_m[i + 7];
- }
+ //FIXME: tune custom_threshold[] and remove this !
+ for (int i = 0, bias = (1 << 4) + fspp->strength; i < 64; ++i)
+ fspp->threshold_mtx_noq[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
if (fspp->qp) {
fspp->prev_q = fspp->qp;
--
2.49.1
>From a86a9361989d99ad8db46eabf39f664ed3f89072 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 13 Nov 2025 12:04:15 +0100
Subject: [PATCH 22/23] avfilter/vf_fsppdsp: Remove pointless cast
Also avoid casting const away, and declare the pointer in a smaller scope.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 7fdc5ece25..3230376a19 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -136,12 +136,11 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict
int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
int16_t *wsptr;
- int16_t *threshold;
wsptr = output;
for (; cnt > 0; cnt -= 2) { //start positions
- threshold = (int16_t *)thr_adr;//threshold_mtx
+ const int16_t *threshold = thr_adr;//threshold_mtx
for (int ctr = DCTSIZE; ctr > 0; ctr--) {
// Process columns from input, add to output.
tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
--
2.49.1
>From 57ca0480e6dcb64f1b4f948b6a79bcb8aaa97723 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Date: Thu, 13 Nov 2025 11:57:02 +0100
Subject: [PATCH 23/23] avfilter/vf_fsppdsp: Fix left shifts of negative
numbers
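They are undefined behavior and UBSan warns about them
(in the checkasm test). Put the shifts in the constants
instead; where the shifted value is not a constant multiplicand,
use a multiplication in place of the shift.
This even gives a tiny speedup here: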
Old benchmarks:
column_fidct_c: 3369.9 ( 1.00x)
column_fidct_sse2: 829.1 ( 4.06x)
New benchmarks:
column_fidct_c: 3304.2 ( 1.00x)
column_fidct_sse2: 827.9 ( 3.99x)
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
libavfilter/vf_fsppdsp.c | 46 ++++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 23 deletions(-)
diff --git a/libavfilter/vf_fsppdsp.c b/libavfilter/vf_fsppdsp.c
index 3230376a19..8025e87366 100644
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@@ -165,7 +165,7 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict
d0 = tmp10 + tmp11;
d4 = tmp10 - tmp11;
- z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+ z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
d2 = tmp13 + z1;
d6 = tmp13 - z1;
@@ -193,10 +193,10 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
- z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
- z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
- z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
- z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
+ z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+ z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
+ z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
+ z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
z11 = tmp7 + z3;
z13 = tmp7 - z3;
@@ -215,15 +215,15 @@ void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict
//Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
z13 = tmp6 + tmp5;
- z10 = (tmp6 - tmp5) << 1;
+ z10 = (tmp6 - tmp5) * 2;
z11 = tmp4 + tmp7;
- z12 = (tmp4 - tmp7) << 1;
+ z12 = (tmp4 - tmp7) * 2;
tmp7 = (z11 + z13) >> 2; //+2 !
- tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
- z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
- tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
- tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
+ tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1);
+ z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+ tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
+ tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
tmp6 = tmp12 - tmp7;
tmp5 = tmp11 - tmp6;
@@ -264,7 +264,7 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
tmp11 = wsptr[2] - wsptr[3];
tmp13 = wsptr[0] + wsptr[1];
- tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
+ tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;//this shift order to avoid overflow
tmp0 = tmp10 + tmp13; //->temps
tmp3 = tmp10 - tmp13; //->temps
@@ -289,9 +289,9 @@ void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
- tmp6 = (tmp12 << 3) - tmp7;
- tmp5 = (tmp11 << 3) - tmp6;
- tmp4 = (tmp10 << 3) + tmp5;
+ tmp6 = tmp12 * 8 - tmp7;
+ tmp5 = tmp11 * 8 - tmp6;
+ tmp4 = tmp10 * 8 + tmp5;
// Final output stage: descale and write column
outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
@@ -342,20 +342,20 @@ void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
dataptr[2] = tmp10 + tmp11;
dataptr[3] = tmp10 - tmp11;
- z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
+ z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
dataptr[0] = tmp13 + z1;
dataptr[1] = tmp13 - z1;
// Odd part
- tmp10 = (tmp4 + tmp5) << 2;
- tmp11 = (tmp5 + tmp6) << 2;
- tmp12 = (tmp6 + tmp7) << 2;
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
- z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
- z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
- z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
- z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
+ z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+ z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
+ z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
+ z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
z11 = tmp7 + z3;
z13 = tmp7 - z3;
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-11-13 11:57 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=176303491254.25.8423688227525973225@2cb04c0e5124 \
--to=ffmpeg-devel@ffmpeg.org \
--cc=code@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git