* [FFmpeg-devel] [PATCH v3 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template
[not found] <20240122152527.601122-1-toqsxw@outlook.com>
@ 2024-01-22 15:25 ` toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm toqsxw
` (5 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: toqsxw @ 2024-01-22 15:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Wu Jianhua
From: Wu Jianhua <toqsxw@outlook.com>
Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
---
libavcodec/hevcdsp_template.c | 594 +++-------------------------------
1 file changed, 46 insertions(+), 548 deletions(-)
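For reference, the forwarding macros introduced below replace each hand-written HEVC put function
with a thin wrapper around the shared h26x template. A hand-expanded sketch of what
FW_PUT(qpel, qpel_h, luma_h) produces (just the macro expansion written out, not literal compiler
output):

    static void FUNC(put_hevc_qpel_h)(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,
                                      int height, intptr_t mx, intptr_t my, int width)
    {
        /* HEVC-specific: map the motion fraction to coefficient tables */
        const uint8_t *hf = ff_hevc_qpel_filters[mx - 1];
        const uint8_t *vf = ff_hevc_qpel_filters[my - 1];
        /* forward to the shared filtering loop in h2656_inter_template.c */
        FUNC(put_luma_h)(dst, src, srcstride, height, hf, vf, width);
    }

The mx/my-to-filter lookup stays on the HEVC side, while the filtering loops themselves live once
in the h2656 template and can be shared with the VVC decoder.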
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 0de14e9dcf..9b48bdf08e 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -26,6 +26,7 @@
#include "bit_depth_template.c"
#include "hevcdsp.h"
#include "h26x/h2656_sao_template.c"
+#include "h26x/h2656_inter_template.c"
static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
GetBitContext *gb, int pcm_bit_depth)
@@ -299,37 +300,51 @@ IDCT_DC(32)
////////////////////////////////////////////////////////////////////////////////
//
////////////////////////////////////////////////////////////////////////////////
-static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = src[x] << (14 - BIT_DEPTH);
- src += srcstride;
- dst += MAX_PB_SIZE;
- }
-}
-
-static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
-
- for (y = 0; y < height; y++) {
- memcpy(dst, src, width * sizeof(pixel));
- src += srcstride;
- dst += dststride;
- }
-}
+#define ff_hevc_pel_filters ff_hevc_qpel_filters
+#define DECL_HV_FILTER(f) \
+ const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \
+ const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1];
+
+#define FW_PUT(p, f, t) \
+static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ DECL_HV_FILTER(p) \
+ FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width); \
+}
+
+#define FW_PUT_UNI(p, f, t) \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
+ ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ DECL_HV_FILTER(p) \
+ FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width); \
+}
+
+#define FW_PUT_UNI_W(p, f, t) \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
+ ptrdiff_t srcstride,int height, int denom, int wx, int ox, \
+ intptr_t mx, intptr_t my, int width) \
+{ \
+ DECL_HV_FILTER(p) \
+ FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, vf, width); \
+}
+
+#define FW_PUT_FUNCS(f, t, dir) \
+ FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \
+ FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir) \
+ FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir)
+
+FW_PUT(pel, pel_pixels, pixels)
+FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels)
+FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels)
+
+FW_PUT_FUNCS(qpel, luma, h )
+FW_PUT_FUNCS(qpel, luma, v )
+FW_PUT_FUNCS(qpel, luma, hv )
+FW_PUT_FUNCS(epel, chroma, h )
+FW_PUT_FUNCS(epel, chroma, v )
+FW_PUT_FUNCS(epel, chroma, hv )
static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
const int16_t *src2,
@@ -357,30 +372,6 @@ static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, co
}
}
-static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
- int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- int shift = denom + 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- ox = ox * (1 << (BIT_DEPTH - 8));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
- src += srcstride;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
const int16_t *src2,
int height, int denom, int wx0, int wx1,
@@ -420,96 +411,6 @@ static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
filter[6] * src[x + 3 * stride] + \
filter[7] * src[x + 4 * stride])
-static void FUNC(put_hevc_qpel_h)(int16_t *dst,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
- src += srcstride;
- dst += MAX_PB_SIZE;
- }
-}
-
-static void FUNC(put_hevc_qpel_v)(int16_t *dst,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- const int8_t *filter = ff_hevc_qpel_filters[my - 1];
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
- src += srcstride;
- dst += MAX_PB_SIZE;
- }
-}
-
-static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
- const uint8_t *_src,
- ptrdiff_t _srcstride,
- int height, intptr_t mx,
- intptr_t my, int width)
-{
- int x, y;
- const int8_t *filter;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
- int16_t *tmp = tmp_array;
-
- src -= QPEL_EXTRA_BEFORE * srcstride;
- filter = ff_hevc_qpel_filters[mx - 1];
- for (y = 0; y < height + QPEL_EXTRA; y++) {
- for (x = 0; x < width; x++)
- tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
- src += srcstride;
- tmp += MAX_PB_SIZE;
- }
-
- tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
- filter = ff_hevc_qpel_filters[my - 1];
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
- tmp += MAX_PB_SIZE;
- dst += MAX_PB_SIZE;
- }
-}
-
-static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
- int shift = 14 - BIT_DEPTH;
-
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
- src += srcstride;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
const int16_t *src2,
int height, intptr_t mx, intptr_t my, int width)
@@ -538,33 +439,6 @@ static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, const
}
}
-static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_qpel_filters[my - 1];
- int shift = 14 - BIT_DEPTH;
-
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
- src += srcstride;
- dst += dststride;
- }
-}
-
-
static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, intptr_t mx, intptr_t my, int width)
@@ -593,46 +467,6 @@ static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const int8_t *filter;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
- int16_t *tmp = tmp_array;
- int shift = 14 - BIT_DEPTH;
-
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- src -= QPEL_EXTRA_BEFORE * srcstride;
- filter = ff_hevc_qpel_filters[mx - 1];
- for (y = 0; y < height + QPEL_EXTRA; y++) {
- for (x = 0; x < width; x++)
- tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
- src += srcstride;
- tmp += MAX_PB_SIZE;
- }
-
- tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
- filter = ff_hevc_qpel_filters[my - 1];
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
- tmp += MAX_PB_SIZE;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, intptr_t mx, intptr_t my, int width)
@@ -673,33 +507,6 @@ static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, int denom, int wx, int ox,
- intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
- int shift = denom + 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- ox = ox * (1 << (BIT_DEPTH - 8));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
- src += srcstride;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, int denom, int wx0, int wx1,
@@ -728,33 +535,6 @@ static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, int denom, int wx, int ox,
- intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_qpel_filters[my - 1];
- int shift = denom + 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- ox = ox * (1 << (BIT_DEPTH - 8));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
- src += srcstride;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, int denom, int wx0, int wx1,
@@ -783,47 +563,6 @@ static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, int denom, int wx, int ox,
- intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const int8_t *filter;
- const pixel *src = (const pixel*)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
- int16_t *tmp = tmp_array;
- int shift = denom + 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- src -= QPEL_EXTRA_BEFORE * srcstride;
- filter = ff_hevc_qpel_filters[mx - 1];
- for (y = 0; y < height + QPEL_EXTRA; y++) {
- for (x = 0; x < width; x++)
- tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
- src += srcstride;
- tmp += MAX_PB_SIZE;
- }
-
- tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
- filter = ff_hevc_qpel_filters[my - 1];
-
- ox = ox * (1 << (BIT_DEPTH - 8));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
- tmp += MAX_PB_SIZE;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, int denom, int wx0, int wx1,
@@ -873,94 +612,6 @@ static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
filter[2] * src[x + stride] + \
filter[3] * src[x + 2 * stride])
-static void FUNC(put_hevc_epel_h)(int16_t *dst,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[mx - 1];
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
- src += srcstride;
- dst += MAX_PB_SIZE;
- }
-}
-
-static void FUNC(put_hevc_epel_v)(int16_t *dst,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[my - 1];
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
- src += srcstride;
- dst += MAX_PB_SIZE;
- }
-}
-
-static void FUNC(put_hevc_epel_hv)(int16_t *dst,
- const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[mx - 1];
- int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
- int16_t *tmp = tmp_array;
-
- src -= EPEL_EXTRA_BEFORE * srcstride;
-
- for (y = 0; y < height + EPEL_EXTRA; y++) {
- for (x = 0; x < width; x++)
- tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
- src += srcstride;
- tmp += MAX_PB_SIZE;
- }
-
- tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
- filter = ff_hevc_epel_filters[my - 1];
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
- tmp += MAX_PB_SIZE;
- dst += MAX_PB_SIZE;
- }
-}
-
-static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[mx - 1];
- int shift = 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
- src += srcstride;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, intptr_t mx, intptr_t my, int width)
@@ -988,30 +639,6 @@ static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[my - 1];
- int shift = 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
- src += srcstride;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, intptr_t mx, intptr_t my, int width)
@@ -1038,44 +665,6 @@ static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
- int height, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[mx - 1];
- int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
- int16_t *tmp = tmp_array;
- int shift = 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- src -= EPEL_EXTRA_BEFORE * srcstride;
-
- for (y = 0; y < height + EPEL_EXTRA; y++) {
- for (x = 0; x < width; x++)
- tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
- src += srcstride;
- tmp += MAX_PB_SIZE;
- }
-
- tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
- filter = ff_hevc_epel_filters[my - 1];
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
- tmp += MAX_PB_SIZE;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, intptr_t mx, intptr_t my, int width)
@@ -1116,32 +705,6 @@ static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
- int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[mx - 1];
- int shift = denom + 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- ox = ox * (1 << (BIT_DEPTH - 8));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) {
- dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
- }
- dst += dststride;
- src += srcstride;
- }
-}
-
static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, int denom, int wx0, int wx1,
@@ -1168,32 +731,6 @@ static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
- int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[my - 1];
- int shift = denom + 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- ox = ox * (1 << (BIT_DEPTH - 8));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) {
- dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
- }
- dst += dststride;
- src += srcstride;
- }
-}
-
static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, int denom, int wx0, int wx1,
@@ -1220,45 +757,6 @@ static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
}
}
-static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride,
- int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-{
- int x, y;
- const pixel *src = (const pixel *)_src;
- ptrdiff_t srcstride = _srcstride / sizeof(pixel);
- pixel *dst = (pixel *)_dst;
- ptrdiff_t dststride = _dststride / sizeof(pixel);
- const int8_t *filter = ff_hevc_epel_filters[mx - 1];
- int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
- int16_t *tmp = tmp_array;
- int shift = denom + 14 - BIT_DEPTH;
-#if BIT_DEPTH < 14
- int offset = 1 << (shift - 1);
-#else
- int offset = 0;
-#endif
-
- src -= EPEL_EXTRA_BEFORE * srcstride;
-
- for (y = 0; y < height + EPEL_EXTRA; y++) {
- for (x = 0; x < width; x++)
- tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
- src += srcstride;
- tmp += MAX_PB_SIZE;
- }
-
- tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
- filter = ff_hevc_epel_filters[my - 1];
-
- ox = ox * (1 << (BIT_DEPTH - 8));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
- tmp += MAX_PB_SIZE;
- dst += dststride;
- }
-}
-
static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride, const int16_t *src2,
int height, int denom, int wx0, int wx1,
--
2.34.1
* [FFmpeg-devel] [PATCH v3 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm
[not found] <20240122152527.601122-1-toqsxw@outlook.com>
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template toqsxw
@ 2024-01-22 15:25 ` toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put toqsxw
` (4 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: toqsxw @ 2024-01-22 15:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Wu Jianhua
From: Wu Jianhua <toqsxw@outlook.com>
This enables the asm optimizations to be reused by the VVC decoder.
Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
---
libavcodec/x86/Makefile | 1 +
libavcodec/x86/h26x/h2656_inter.asm | 1145 +++++++++++++++++++++++++++
libavcodec/x86/h26x/h2656dsp.c | 98 +++
libavcodec/x86/h26x/h2656dsp.h | 103 +++
libavcodec/x86/hevc_mc.asm | 462 +----------
libavcodec/x86/hevcdsp_init.c | 108 ++-
6 files changed, 1471 insertions(+), 446 deletions(-)
create mode 100644 libavcodec/x86/h26x/h2656_inter.asm
create mode 100644 libavcodec/x86/h26x/h2656dsp.c
create mode 100644 libavcodec/x86/h26x/h2656dsp.h
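The moved asm entry points drop the HEVC-specific mx/my arguments and instead take the horizontal
and vertical coefficient pointers directly (see the ff_h2656_put_* prototypes in h2656dsp.h below),
so any codec can pass in its own filter tables. The HEVC side (hevcdsp_init.c) then wraps them with
small helpers that resolve mx/my to filter pointers, roughly along these lines — a sketch with an
illustrative wrapper name, not the patch's literal code:

    static void hevc_put_qpel_h4_8_sse4(int16_t *dst, const uint8_t *src,
                                        ptrdiff_t srcstride, int height,
                                        intptr_t mx, intptr_t my, int width)
    {
        /* resolve the HEVC motion fraction to coefficient tables ... */
        const int8_t *hf = ff_hevc_qpel_filters[mx - 1];
        const int8_t *vf = ff_hevc_qpel_filters[my - 1];
        /* ... and forward to the shared codec-agnostic SIMD routine */
        ff_h2656_put_8tap_h4_8_sse4(dst, src, srcstride, height, hf, vf, width);
    }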
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index d5fb30645a..8098cd840c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
x86/hevc_mc.o \
+ x86/h26x/h2656_inter.o \
x86/hevc_sao.o \
x86/hevc_sao_10bit.o
X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm
new file mode 100644
index 0000000000..aa296d549c
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -0,0 +1,1145 @@
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 64
+
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+cextern pw_8192
+%define scale_8 pw_512
+%define scale_10 pw_2048
+%define scale_12 pw_8192
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+max_pixels_12: times 16 dw ((1 << 12)-1)
+cextern pb_0
+
+SECTION .text
+%macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+ movd %4, [%3] ; load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+ movq %4, [%3] ; load data from source
+%elif notcpuflag(avx)
+ movu %4, [%3] ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+ movdqu %4, [%3]
+%else
+ movu %4, [%3]
+%endif
+%endmacro
+
+%macro VPBROADCASTW 2
+%if notcpuflag(avx2)
+ movd %1, %2
+ pshuflw %1, %1, 0
+ punpcklwd %1, %1
+%else
+ vpbroadcastw %1, %2
+%endif
+%endmacro
+
+%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b,
+ VPBROADCASTW %3, [%2q + 0 * 2] ; coeff 0, 1
+ VPBROADCASTW %4, [%2q + 1 * 2] ; coeff 2, 3
+%if %1 != 8
+ pmovsxbw %3, xmm%3
+ pmovsxbw %4, xmm%4
+%endif
+%endmacro
+
+%macro MC_4TAP_HV_FILTER 1
+ VPBROADCASTW m12, [vfq + 0 * 2] ; vf 0, 1
+ VPBROADCASTW m13, [vfq + 1 * 2] ; vf 2, 3
+ VPBROADCASTW m14, [hfq + 0 * 2] ; hf 0, 1
+ VPBROADCASTW m15, [hfq + 1 * 2] ; hf 2, 3
+
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+%if %1 != 8
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+%endif
+ lea r3srcq, [srcstrideq*3]
+%endmacro
+
+%macro MC_8TAP_SAVE_FILTER 5 ;offset, mm registers
+ mova [rsp + %1 + 0*mmsize], %2
+ mova [rsp + %1 + 1*mmsize], %3
+ mova [rsp + %1 + 2*mmsize], %4
+ mova [rsp + %1 + 3*mmsize], %5
+%endmacro
+
+%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset
+ VPBROADCASTW m12, [%2q + 0 * 2] ; coeff 0, 1
+ VPBROADCASTW m13, [%2q + 1 * 2] ; coeff 2, 3
+ VPBROADCASTW m14, [%2q + 2 * 2] ; coeff 4, 5
+ VPBROADCASTW m15, [%2q + 3 * 2] ; coeff 6, 7
+%if %0 == 3
+ MC_8TAP_SAVE_FILTER %3, m12, m13, m14, m15
+%endif
+
+%if %1 != 8
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ %if %0 == 3
+ MC_8TAP_SAVE_FILTER %3 + 4*mmsize, m12, m13, m14, m15
+ %endif
+%elif %0 == 3
+ pmovsxbw m8, xm12
+ pmovsxbw m9, xm13
+ pmovsxbw m10, xm14
+ pmovsxbw m11, xm15
+ MC_8TAP_SAVE_FILTER %3 + 4*mmsize, m8, m9, m10, m11
+%endif
+
+%endmacro
+
+%macro MC_4TAP_LOAD 4
+%if (%1 == 8 && %4 <= 4)
+%define %%load movd
+%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
+
+ %%load m0, [%2q ]
+%ifnum %3
+ %%load m1, [%2q+ %3]
+ %%load m2, [%2q+2*%3]
+ %%load m3, [%2q+3*%3]
+%else
+ %%load m1, [%2q+ %3q]
+ %%load m2, [%2q+2*%3q]
+ %%load m3, [%2q+r3srcq]
+%endif
+%if %1 == 8
+%if %4 > 8
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+%endif
+%else
+%if %4 > 4
+ SBUTTERFLY wd, 0, 1, 7
+ SBUTTERFLY wd, 2, 3, 7
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+%endif
+%endif
+%endmacro
+
+%macro MC_8TAP_H_LOAD 4
+%assign %%stride (%1+7)/8
+%if %1 == 8
+%if %3 <= 4
+%define %%load movd
+%elif %3 == 8
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%else
+%if %3 == 2
+%define %%load movd
+%elif %3 == 4
+%define %%load movq
+%else
+%define %%load movu
+%endif
+%endif
+ %%load m0, [%2-3*%%stride] ;load data from source
+ %%load m1, [%2-2*%%stride]
+ %%load m2, [%2-%%stride ]
+ %%load m3, [%2 ]
+ %%load m4, [%2+%%stride ]
+ %%load m5, [%2+2*%%stride]
+ %%load m6, [%2+3*%%stride]
+ %%load m7, [%2+4*%%stride]
+
+%if %1 == 8
+%if %3 > 8
+ SBUTTERFLY wd, 0, 1, %4
+ SBUTTERFLY wd, 2, 3, %4
+ SBUTTERFLY wd, 4, 5, %4
+ SBUTTERFLY wd, 6, 7, %4
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+%endif
+%else
+%if %3 > 4
+ SBUTTERFLY dq, 0, 1, %4
+ SBUTTERFLY dq, 2, 3, %4
+ SBUTTERFLY dq, 4, 5, %4
+ SBUTTERFLY dq, 6, 7, %4
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+%endif
+%endif
+%endmacro
+
+%macro MC_8TAP_V_LOAD 5
+ lea %5q, [%2]
+ sub %5q, r3srcq
+ movu m0, [%5q ] ;load x- 3*srcstride
+ movu m1, [%5q+ %3q ] ;load x- 2*srcstride
+ movu m2, [%5q+ 2*%3q ] ;load x-srcstride
+ movu m3, [%2 ] ;load x
+ movu m4, [%2+ %3q] ;load x+stride
+ movu m5, [%2+ 2*%3q] ;load x+2*stride
+ movu m6, [%2+r3srcq] ;load x+3*stride
+ movu m7, [%2+ 4*%3q] ;load x+4*stride
+%if %1 == 8
+%if %4 > 8
+ SBUTTERFLY bw, 0, 1, 8
+ SBUTTERFLY bw, 2, 3, 8
+ SBUTTERFLY bw, 4, 5, 8
+ SBUTTERFLY bw, 6, 7, 8
+%else
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+%endif
+%else
+%if %4 > 4
+ SBUTTERFLY wd, 0, 1, 8
+ SBUTTERFLY wd, 2, 3, 8
+ SBUTTERFLY wd, 4, 5, 8
+ SBUTTERFLY wd, 6, 7, 8
+%else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+%endif
+%endif
+%endmacro
+
+%macro PEL_12STORE2 3
+ movd [%1], %2
+%endmacro
+%macro PEL_12STORE4 3
+ movq [%1], %2
+%endmacro
+%macro PEL_12STORE6 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_12STORE8 3
+ movdqu [%1], %2
+%endmacro
+%macro PEL_12STORE12 3
+ PEL_12STORE8 %1, %2, %3
+ movq [%1+16], %3
+%endmacro
+%macro PEL_12STORE16 3
+%if cpuflag(avx2)
+ movu [%1], %2
+%else
+ PEL_12STORE8 %1, %2, %3
+ movdqu [%1+16], %3
+%endif
+%endmacro
+
+%macro PEL_10STORE2 3
+ movd [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+ movq [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+ movdqu [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+ PEL_10STORE8 %1, %2, %3
+ movq [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+%if cpuflag(avx2)
+ movu [%1], %2
+%else
+ PEL_10STORE8 %1, %2, %3
+ movdqu [%1+16], %3
+%endif
+%endmacro
+%macro PEL_10STORE32 3
+ PEL_10STORE16 %1, %2, %3
+ movu [%1+32], %3
+%endmacro
+
+%macro PEL_8STORE2 3
+ pextrw [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+ movd [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+ movd [%1], %2
+ pextrw [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+ movq [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+ movq [%1], %2
+ psrldq %2, 8
+ movd [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+%if cpuflag(avx2)
+ movdqu [%1], %2
+%else
+ movu [%1], %2
+%endif ; avx
+%endmacro
+%macro PEL_8STORE32 3
+ movu [%1], %2
+%endmacro
+
+%macro LOOP_END 3
+ add %1q, 2*MAX_PB_SIZE ; dst += dststride
+ add %2q, %3q ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+%endmacro
+
+
+%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && %0 ==3
+%if %1 > 16
+ vextracti128 xm1, m0, 1
+ pmovzxbw m1, xm1
+ psllw m1, 14-%2
+%endif
+ pmovzxbw m0, xm0
+%else ; not avx
+%if %1 > 8
+ punpckhbw m1, m0, m2
+ psllw m1, 14-%2
+%endif
+ punpcklbw m0, m2
+%endif
+%endif ;avx
+ psllw m0, 14-%2
+%endmacro
+
+%macro MC_4TAP_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
+%if %1 == 8
+%if cpuflag(avx2) && (%0 == 5)
+%if %2 > 16
+ vperm2i128 m10, m0, m1, q0301
+%endif
+ vinserti128 m0, m0, xm1, 1
+ mova m1, m10
+%if %2 > 16
+ vperm2i128 m10, m2, m3, q0301
+%endif
+ vinserti128 m2, m2, xm3, 1
+ mova m3, m10
+%endif
+ pmaddubsw %%reg0, %3 ;x1*c1+x2*c2
+ pmaddubsw %%reg2, %4 ;x3*c3+x4*c4
+ paddw %%reg0, %%reg2
+%if %2 > 8
+ pmaddubsw %%reg1, %3
+ pmaddubsw %%reg3, %4
+ paddw %%reg1, %%reg3
+%endif
+%else
+ pmaddwd %%reg0, %3
+ pmaddwd %%reg2, %4
+ paddd %%reg0, %%reg2
+%if %2 > 4
+ pmaddwd %%reg1, %3
+ pmaddwd %%reg3, %4
+ paddd %%reg1, %%reg3
+%if %1 != 8
+ psrad %%reg1, %1-8
+%endif
+%endif
+%if %1 != 8
+ psrad %%reg0, %1-8
+%endif
+ packssdw %%reg0, %%reg1
+%endif
+%endmacro
+
+%macro MC_8TAP_HV_COMPUTE 4 ; width, bitdepth, filter
+
+%if %2 == 8
+ pmaddubsw m0, [%3q+0*mmsize] ;x1*c1+x2*c2
+ pmaddubsw m2, [%3q+1*mmsize] ;x3*c3+x4*c4
+ pmaddubsw m4, [%3q+2*mmsize] ;x5*c5+x6*c6
+ pmaddubsw m6, [%3q+3*mmsize] ;x7*c7+x8*c8
+ paddw m0, m2
+ paddw m4, m6
+ paddw m0, m4
+%else
+ pmaddwd m0, [%3q+4*mmsize]
+ pmaddwd m2, [%3q+5*mmsize]
+ pmaddwd m4, [%3q+6*mmsize]
+ pmaddwd m6, [%3q+7*mmsize]
+ paddd m0, m2
+ paddd m4, m6
+ paddd m0, m4
+%if %2 != 8
+ psrad m0, %2-8
+%endif
+%if %1 > 4
+ pmaddwd m1, [%3q+4*mmsize]
+ pmaddwd m3, [%3q+5*mmsize]
+ pmaddwd m5, [%3q+6*mmsize]
+ pmaddwd m7, [%3q+7*mmsize]
+ paddd m1, m3
+ paddd m5, m7
+ paddd m1, m5
+%if %2 != 8
+ psrad m1, %2-8
+%endif
+%endif
+ p%4 m0, m1
+%endif
+%endmacro
+
+
+%macro MC_8TAP_COMPUTE 2-3 ; width, bitdepth
+%if %2 == 8
+%if cpuflag(avx2) && (%0 == 3)
+
+ vperm2i128 m10, m0, m1, q0301
+ vinserti128 m0, m0, xm1, 1
+ SWAP 1, 10
+
+ vperm2i128 m10, m2, m3, q0301
+ vinserti128 m2, m2, xm3, 1
+ SWAP 3, 10
+
+
+ vperm2i128 m10, m4, m5, q0301
+ vinserti128 m4, m4, xm5, 1
+ SWAP 5, 10
+
+ vperm2i128 m10, m6, m7, q0301
+ vinserti128 m6, m6, xm7, 1
+ SWAP 7, 10
+%endif
+
+ pmaddubsw m0, m12 ;x1*c1+x2*c2
+ pmaddubsw m2, m13 ;x3*c3+x4*c4
+ pmaddubsw m4, m14 ;x5*c5+x6*c6
+ pmaddubsw m6, m15 ;x7*c7+x8*c8
+ paddw m0, m2
+ paddw m4, m6
+ paddw m0, m4
+%if %1 > 8
+ pmaddubsw m1, m12
+ pmaddubsw m3, m13
+ pmaddubsw m5, m14
+ pmaddubsw m7, m15
+ paddw m1, m3
+ paddw m5, m7
+ paddw m1, m5
+%endif
+%else
+ pmaddwd m0, m12
+ pmaddwd m2, m13
+ pmaddwd m4, m14
+ pmaddwd m6, m15
+ paddd m0, m2
+ paddd m4, m6
+ paddd m0, m4
+%if %2 != 8
+ psrad m0, %2-8
+%endif
+%if %1 > 4
+ pmaddwd m1, m12
+ pmaddwd m3, m13
+ pmaddwd m5, m14
+ pmaddwd m7, m15
+ paddd m1, m3
+ paddd m5, m7
+ paddd m1, m5
+%if %2 != 8
+ psrad m1, %2-8
+%endif
+%endif
+%endif
+%endmacro
+%macro UNI_COMPUTE 5
+ pmulhrsw %3, %5
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+ pmulhrsw %4, %5
+%endif
+%if %2 == 8
+ packuswb %3, %4
+%else
+ CLIPW %3, [pb_0], [max_pixels_%2]
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
+ CLIPW %4, [pb_0], [max_pixels_%2]
+%endif
+%endif
+%endmacro
+
+
+; ******************************
+; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; int height, const int8_t *hf, const int8_t *vf, int width)
+; ******************************
+
+%macro PUT_PIXELS 3
+ MC_PIXELS %1, %2, %3
+ MC_UNI_PIXELS %1, %2, %3
+%endmacro
+
+%macro MC_PIXELS 3
+cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height
+ pxor m2, m2
+.loop:
+ SIMPLE_LOAD %2, %3, srcq, m0
+ MC_PIXEL_COMPUTE %2, %3, 1
+ PEL_10STORE%2 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+%endmacro
+
+%macro MC_UNI_PIXELS 3
+cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, src, srcstride, height
+.loop:
+ SIMPLE_LOAD %2, %3, srcq, m0
+ PEL_%3STORE%2 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
+
+%macro PUT_4TAP 3
+%if cpuflag(avx2)
+%define XMM_REGS 11
+%else
+%define XMM_REGS 8
+%endif
+
+; ******************************
+; void %1_put_4tap_hX(int16_t *dst,
+; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width);
+; ******************************
+cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf
+%assign %%stride ((%3 + 7)/8)
+ MC_4TAP_FILTER %3, hf, m4, m5
+.loop:
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m4, m5, 1
+ PEL_10STORE%2 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+
+; ******************************
+; void %1_put_uni_4tap_hX(uint8_t *dst, ptrdiff_t dststride,
+; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width);
+; ******************************
+cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, hf
+%assign %%stride ((%3 + 7)/8)
+ movdqa m6, [scale_%3]
+ MC_4TAP_FILTER %3, hf, m4, m5
+.loop:
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m4, m5
+ UNI_COMPUTE %2, %3, m0, m1, m6
+ PEL_%3STORE%2 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+; ******************************
+; void %1_put_4tap_v(int16_t *dst,
+; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width)
+; ******************************
+cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, r3src, vf
+ sub srcq, srcstrideq
+ MC_4TAP_FILTER %3, vf, m4, m5
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ MC_4TAP_LOAD %3, srcq, srcstride, %2
+ MC_4TAP_COMPUTE %3, %2, m4, m5, 1
+ PEL_10STORE%2 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+
+; ******************************
+; void %1_put_uni_4tap_vX(uint8_t *dst, ptrdiff_t dststride,
+; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width);
+; ******************************
+cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, vf
+ movdqa m6, [scale_%3]
+ sub srcq, srcstrideq
+ MC_4TAP_FILTER %3, vf, m4, m5
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ MC_4TAP_LOAD %3, srcq, srcstride, %2
+ MC_4TAP_COMPUTE %3, %2, m4, m5
+ UNI_COMPUTE %2, %3, m0, m1, m6
+ PEL_%3STORE%2 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
+
+%macro PUT_4TAP_HV 3
+; ******************************
+; void put_4tap_hv(int16_t *dst,
+; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width)
+; ******************************
+cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, r3src
+%assign %%stride ((%3 + 7)/8)
+ sub srcq, srcstrideq
+ MC_4TAP_HV_FILTER %3
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m14, m15
+%if (%2 > 8 && (%3 == 8))
+ SWAP m8, m1
+%endif
+ SWAP m4, m0
+ add srcq, srcstrideq
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m14, m15
+%if (%2 > 8 && (%3 == 8))
+ SWAP m9, m1
+%endif
+ SWAP m5, m0
+ add srcq, srcstrideq
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m14, m15
+%if (%2 > 8 && (%3 == 8))
+ SWAP m10, m1
+%endif
+ SWAP m6, m0
+ add srcq, srcstrideq
+.loop:
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m14, m15
+%if (%2 > 8 && (%3 == 8))
+ SWAP m11, m1
+%endif
+ SWAP m7, m0
+ punpcklwd m0, m4, m5
+ punpcklwd m2, m6, m7
+%if %2 > 4
+ punpckhwd m1, m4, m5
+ punpckhwd m3, m6, m7
+%endif
+ MC_4TAP_COMPUTE 14, %2, m12, m13
+%if (%2 > 8 && (%3 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ MC_4TAP_COMPUTE 14, %2, m12, m13, m4, m2, m8, m3
+%if cpuflag(avx2)
+ vinserti128 m2, m0, xm4, 1
+ vperm2i128 m3, m0, m4, q0301
+ PEL_10STORE%2 dstq, m2, m3
+%else
+ PEL_10STORE%2 dstq, m0, m4
+%endif
+%else
+ PEL_10STORE%2 dstq, m0, m1
+%endif
+ movdqa m4, m5
+ movdqa m5, m6
+ movdqa m6, m7
+%if (%2 > 8 && (%3 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
+ LOOP_END dst, src, srcstride
+ RET
+
+cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, height, hf, vf, r3src
+%assign %%stride ((%3 + 7)/8)
+ sub srcq, srcstrideq
+ MC_4TAP_HV_FILTER %3
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m14, m15
+%if (%2 > 8 && (%3 == 8))
+ SWAP m8, m1
+%endif
+ SWAP m4, m0
+ add srcq, srcstrideq
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m14, m15
+%if (%2 > 8 && (%3 == 8))
+ SWAP m9, m1
+%endif
+ SWAP m5, m0
+ add srcq, srcstrideq
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m14, m15
+%if (%2 > 8 && (%3 == 8))
+ SWAP m10, m1
+%endif
+ SWAP m6, m0
+ add srcq, srcstrideq
+.loop:
+ MC_4TAP_LOAD %3, srcq-%%stride, %%stride, %2
+ MC_4TAP_COMPUTE %3, %2, m14, m15
+%if (%2 > 8 && (%3 == 8))
+ SWAP m11, m1
+%endif
+ mova m7, m0
+ punpcklwd m0, m4, m5
+ punpcklwd m2, m6, m7
+%if %2 > 4
+ punpckhwd m1, m4, m5
+ punpckhwd m3, m6, m7
+%endif
+ MC_4TAP_COMPUTE 14, %2, m12, m13
+%if (%2 > 8 && (%3 == 8))
+ punpcklwd m4, m8, m9
+ punpcklwd m2, m10, m11
+ punpckhwd m8, m8, m9
+ punpckhwd m3, m10, m11
+ MC_4TAP_COMPUTE 14, %2, m12, m13, m4, m2, m8, m3
+ UNI_COMPUTE %2, %3, m0, m4, [scale_%3]
+%else
+ UNI_COMPUTE %2, %3, m0, m1, [scale_%3]
+%endif
+ PEL_%3STORE%2 dstq, m0, m1
+ mova m4, m5
+ mova m5, m6
+ mova m6, m7
+%if (%2 > 8 && (%3 == 8))
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+%endmacro
+
+; ******************************
+; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; int height, const int8_t *hf, const int8_t *vf, int width)
+; ******************************
+
+%macro PUT_8TAP 3
+cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf
+ MC_8TAP_FILTER %3, hf
+.loop:
+ MC_8TAP_H_LOAD %3, srcq, %2, 10
+ MC_8TAP_COMPUTE %2, %3, 1
+%if %3 > 8
+ packssdw m0, m1
+%endif
+ PEL_10STORE%2 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+
+; ******************************
+; void put_uni_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride,
+; int height, const int8_t *hf, const int8_t *vf, int width)
+; ******************************
+cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, dststride, src, srcstride, height, hf
+ mova m9, [scale_%3]
+ MC_8TAP_FILTER %3, hf
+.loop:
+ MC_8TAP_H_LOAD %3, srcq, %2, 10
+ MC_8TAP_COMPUTE %2, %3
+%if %3 > 8
+ packssdw m0, m1
+%endif
+ UNI_COMPUTE %2, %3, m0, m1, m9
+ PEL_%3STORE%2 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+
+; ******************************
+; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; int height, const int8_t *hf, const int8_t *vf, int width)
+; ******************************
+cglobal %1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf
+ MC_8TAP_FILTER %3, vf
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ MC_8TAP_V_LOAD %3, srcq, srcstride, %2, r7
+ MC_8TAP_COMPUTE %2, %3, 1
+%if %3 > 8
+ packssdw m0, m1
+%endif
+ PEL_10STORE%2 dstq, m0, m1
+ LOOP_END dst, src, srcstride
+ RET
+
+; ******************************
+; void put_uni_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride,
+; int height, const int8_t *hf, const int8_t *vf, int width)
+; ******************************
+cglobal %1_put_uni_8tap_v%2_%3, 7, 9, 16, dst, dststride, src, srcstride, height, r3src, vf
+ MC_8TAP_FILTER %3, vf
+ movdqa m9, [scale_%3]
+ lea r3srcq, [srcstrideq*3]
+.loop:
+ MC_8TAP_V_LOAD %3, srcq, srcstride, %2, r8
+ MC_8TAP_COMPUTE %2, %3
+%if %3 > 8
+ packssdw m0, m1
+%endif
+ UNI_COMPUTE %2, %3, m0, m1, m9
+ PEL_%3STORE%2 dstq, m0, m1
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+%endmacro
+
+
+; ******************************
+; void put_8tap_hvX_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; int height, const int8_t *hf, const int8_t *vf, int width)
+; ******************************
+%macro PUT_8TAP_HV 3
+cglobal %1_put_8tap_hv%2_%3, 6, 7, 16, 0 - mmsize*16, dst, src, srcstride, height, hf, vf, r3src
+ MC_8TAP_FILTER %3, hf, 0
+ lea hfq, [rsp]
+ MC_8TAP_FILTER %3, vf, 8*mmsize
+ lea vfq, [rsp + 8*mmsize]
+
+ lea r3srcq, [srcstrideq*3]
+ sub srcq, r3srcq
+
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m8, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m9, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m10, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m11, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m12, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m13, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m14, m0
+ add srcq, srcstrideq
+.loop:
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m15, m0
+ punpcklwd m0, m8, m9
+ punpcklwd m2, m10, m11
+ punpcklwd m4, m12, m13
+ punpcklwd m6, m14, m15
+%if %2 > 4
+ punpckhwd m1, m8, m9
+ punpckhwd m3, m10, m11
+ punpckhwd m5, m12, m13
+ punpckhwd m7, m14, m15
+%endif
+%if %2 <= 4
+ movq m8, m9
+ movq m9, m10
+ movq m10, m11
+ movq m11, m12
+ movq m12, m13
+ movq m13, m14
+ movq m14, m15
+%else
+ movdqa m8, m9
+ movdqa m9, m10
+ movdqa m10, m11
+ movdqa m11, m12
+ movdqa m12, m13
+ movdqa m13, m14
+ movdqa m14, m15
+%endif
+ MC_8TAP_HV_COMPUTE %2, 14, vf, ackssdw
+ PEL_10STORE%2 dstq, m0, m1
+
+ LOOP_END dst, src, srcstride
+ RET
+
+
+cglobal %1_put_uni_8tap_hv%2_%3, 7, 9, 16, 0 - 16*mmsize, dst, dststride, src, srcstride, height, hf, vf, r3src
+ MC_8TAP_FILTER %3, hf, 0
+ lea hfq, [rsp]
+ MC_8TAP_FILTER %3, vf, 8*mmsize
+ lea vfq, [rsp + 8*mmsize]
+ lea r3srcq, [srcstrideq*3]
+ sub srcq, r3srcq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m8, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m9, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m10, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m11, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m12, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m13, m0
+ add srcq, srcstrideq
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m14, m0
+ add srcq, srcstrideq
+.loop:
+ MC_8TAP_H_LOAD %3, srcq, %2, 15
+ MC_8TAP_HV_COMPUTE %2, %3, hf, ackssdw
+ SWAP m15, m0
+ punpcklwd m0, m8, m9
+ punpcklwd m2, m10, m11
+ punpcklwd m4, m12, m13
+ punpcklwd m6, m14, m15
+%if %2 > 4
+ punpckhwd m1, m8, m9
+ punpckhwd m3, m10, m11
+ punpckhwd m5, m12, m13
+ punpckhwd m7, m14, m15
+%endif
+ MC_8TAP_HV_COMPUTE %2, 14, vf, ackusdw
+ UNI_COMPUTE %2, %3, m0, m1, [scale_%3]
+ PEL_%3STORE%2 dstq, m0, m1
+
+%if %2 <= 4
+ movq m8, m9
+ movq m9, m10
+ movq m10, m11
+ movq m11, m12
+ movq m12, m13
+ movq m13, m14
+ movq m14, m15
+%else
+ mova m8, m9
+ mova m9, m10
+ mova m10, m11
+ mova m11, m12
+ mova m12, m13
+ mova m13, m14
+ mova m14, m15
+%endif
+ add dstq, dststrideq ; dst += dststride
+ add srcq, srcstrideq ; src += srcstride
+ dec heightd ; cmp height
+ jnz .loop ; height loop
+ RET
+
+%endmacro
+
+%macro H2656PUT_PIXELS 2
+ PUT_PIXELS h2656, %1, %2
+%endmacro
+
+%macro H2656PUT_4TAP 2
+ PUT_4TAP h2656, %1, %2
+%endmacro
+
+%macro H2656PUT_4TAP_HV 2
+ PUT_4TAP_HV h2656, %1, %2
+%endmacro
+
+%macro H2656PUT_8TAP 2
+ PUT_8TAP h2656, %1, %2
+%endmacro
+
+%macro H2656PUT_8TAP_HV 2
+ PUT_8TAP_HV h2656, %1, %2
+%endmacro
+
+%if ARCH_X86_64
+
+INIT_XMM sse4
+H2656PUT_PIXELS 2, 8
+H2656PUT_PIXELS 4, 8
+H2656PUT_PIXELS 6, 8
+H2656PUT_PIXELS 8, 8
+H2656PUT_PIXELS 12, 8
+H2656PUT_PIXELS 16, 8
+
+H2656PUT_PIXELS 2, 10
+H2656PUT_PIXELS 4, 10
+H2656PUT_PIXELS 6, 10
+H2656PUT_PIXELS 8, 10
+
+H2656PUT_PIXELS 2, 12
+H2656PUT_PIXELS 4, 12
+H2656PUT_PIXELS 6, 12
+H2656PUT_PIXELS 8, 12
+
+H2656PUT_4TAP 2, 8
+H2656PUT_4TAP 4, 8
+H2656PUT_4TAP 6, 8
+H2656PUT_4TAP 8, 8
+
+H2656PUT_4TAP 12, 8
+H2656PUT_4TAP 16, 8
+
+H2656PUT_4TAP 2, 10
+H2656PUT_4TAP 4, 10
+H2656PUT_4TAP 6, 10
+H2656PUT_4TAP 8, 10
+
+H2656PUT_4TAP 2, 12
+H2656PUT_4TAP 4, 12
+H2656PUT_4TAP 6, 12
+H2656PUT_4TAP 8, 12
+
+H2656PUT_4TAP_HV 2, 8
+H2656PUT_4TAP_HV 4, 8
+H2656PUT_4TAP_HV 6, 8
+H2656PUT_4TAP_HV 8, 8
+H2656PUT_4TAP_HV 16, 8
+
+H2656PUT_4TAP_HV 2, 10
+H2656PUT_4TAP_HV 4, 10
+H2656PUT_4TAP_HV 6, 10
+H2656PUT_4TAP_HV 8, 10
+
+H2656PUT_4TAP_HV 2, 12
+H2656PUT_4TAP_HV 4, 12
+H2656PUT_4TAP_HV 6, 12
+H2656PUT_4TAP_HV 8, 12
+
+H2656PUT_8TAP 4, 8
+H2656PUT_8TAP 8, 8
+H2656PUT_8TAP 12, 8
+H2656PUT_8TAP 16, 8
+
+H2656PUT_8TAP 4, 10
+H2656PUT_8TAP 8, 10
+
+H2656PUT_8TAP 4, 12
+H2656PUT_8TAP 8, 12
+
+H2656PUT_8TAP_HV 4, 8
+H2656PUT_8TAP_HV 8, 8
+
+H2656PUT_8TAP_HV 4, 10
+H2656PUT_8TAP_HV 8, 10
+
+H2656PUT_8TAP_HV 4, 12
+H2656PUT_8TAP_HV 8, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+H2656PUT_PIXELS 32, 8
+H2656PUT_PIXELS 16, 10
+H2656PUT_PIXELS 16, 12
+
+H2656PUT_8TAP 32, 8
+H2656PUT_8TAP 16, 10
+H2656PUT_8TAP 16, 12
+
+H2656PUT_8TAP_HV 32, 8
+H2656PUT_8TAP_HV 16, 10
+H2656PUT_8TAP_HV 16, 12
+
+H2656PUT_4TAP 32, 8
+H2656PUT_4TAP 16, 10
+H2656PUT_4TAP 16, 12
+
+H2656PUT_4TAP_HV 32, 8
+H2656PUT_4TAP_HV 16, 10
+H2656PUT_4TAP_HV 16, 12
+
+%endif
+
+%endif
diff --git a/libavcodec/x86/h26x/h2656dsp.c b/libavcodec/x86/h26x/h2656dsp.c
new file mode 100644
index 0000000000..27769f9c55
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656dsp.c
@@ -0,0 +1,98 @@
+/*
+ * DSP for HEVC/VVC
+ *
+ * Copyright (C) 2022-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h2656dsp.h"
+
+#define mc_rep_func(name, bitd, step, W, opt) \
+void ff_h2656_put_##name##W##_##bitd##_##opt(int16_t *_dst, \
+ const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width) \
+{ \
+ int i; \
+ int16_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
+ dst = _dst + i; \
+ ff_h2656_put_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, hf, vf, width); \
+ } \
+}
+
+#define mc_rep_uni_func(name, bitd, step, W, opt) \
+void ff_h2656_put_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
+ const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width) \
+{ \
+ int i; \
+ uint8_t *dst; \
+ for (i = 0; i < W; i += step) { \
+ const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
+ dst = _dst + (i * ((bitd + 7) / 8)); \
+ ff_h2656_put_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \
+ height, hf, vf, width); \
+ } \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt) \
+ mc_rep_func(name, bitd, step, W, opt) \
+ mc_rep_uni_func(name, bitd, step, W, opt)
+
+#define MC_REP_FUNCS_SSE4(fname) \
+ mc_rep_funcs(fname, 8, 16,128, sse4) \
+ mc_rep_funcs(fname, 8, 16, 64, sse4) \
+ mc_rep_funcs(fname, 8, 16, 32, sse4) \
+ mc_rep_funcs(fname, 10, 8,128, sse4) \
+ mc_rep_funcs(fname, 10, 8, 64, sse4) \
+ mc_rep_funcs(fname, 10, 8, 32, sse4) \
+ mc_rep_funcs(fname, 10, 8, 16, sse4) \
+ mc_rep_funcs(fname, 12, 8,128, sse4) \
+ mc_rep_funcs(fname, 12, 8, 64, sse4) \
+ mc_rep_funcs(fname, 12, 8, 32, sse4) \
+ mc_rep_funcs(fname, 12, 8, 16, sse4) \
+
+MC_REP_FUNCS_SSE4(pixels)
+MC_REP_FUNCS_SSE4(4tap_h)
+MC_REP_FUNCS_SSE4(4tap_v)
+MC_REP_FUNCS_SSE4(4tap_hv)
+MC_REP_FUNCS_SSE4(8tap_h)
+MC_REP_FUNCS_SSE4(8tap_v)
+MC_REP_FUNCS_SSE4(8tap_hv)
+mc_rep_funcs(8tap_hv, 8, 8, 16, sse4)
+
+#if HAVE_AVX2_EXTERNAL
+
+#define MC_REP_FUNCS_AVX2(fname) \
+ mc_rep_funcs(fname, 8, 32, 64, avx2) \
+ mc_rep_funcs(fname, 8, 32,128, avx2) \
+ mc_rep_funcs(fname,10, 16, 32, avx2) \
+ mc_rep_funcs(fname,10, 16, 64, avx2) \
+ mc_rep_funcs(fname,10, 16,128, avx2) \
+ mc_rep_funcs(fname,12, 16, 32, avx2) \
+ mc_rep_funcs(fname,12, 16, 64, avx2) \
+ mc_rep_funcs(fname,12, 16,128, avx2) \
+
+MC_REP_FUNCS_AVX2(pixels)
+MC_REP_FUNCS_AVX2(8tap_h)
+MC_REP_FUNCS_AVX2(8tap_v)
+MC_REP_FUNCS_AVX2(8tap_hv)
+MC_REP_FUNCS_AVX2(4tap_h)
+MC_REP_FUNCS_AVX2(4tap_v)
+MC_REP_FUNCS_AVX2(4tap_hv)
+#endif
diff --git a/libavcodec/x86/h26x/h2656dsp.h b/libavcodec/x86/h26x/h2656dsp.h
new file mode 100644
index 0000000000..8a2ab13607
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656dsp.h
@@ -0,0 +1,103 @@
+/*
+ * DSP for HEVC/VVC
+ *
+ * Copyright (C) 2022-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_H26X_H2656DSP_H
+#define AVCODEC_X86_H26X_H2656DSP_H
+
+#include "config.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include <stdlib.h>
+
+#define H2656_PEL_PROTOTYPE(name, D, opt) \
+void ff_h2656_put_ ## name ## _ ## D ## _##opt(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width); \
+void ff_h2656_put_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width); \
+
+#define H2656_MC_8TAP_PROTOTYPES(fname, bitd, opt) \
+ H2656_PEL_PROTOTYPE(fname##4, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##6, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##8, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##12, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##16, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##32, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##64, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##128, bitd, opt)
+
+H2656_MC_8TAP_PROTOTYPES(pixels , 8, sse4);
+H2656_MC_8TAP_PROTOTYPES(pixels , 10, sse4);
+H2656_MC_8TAP_PROTOTYPES(pixels , 12, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_h , 8, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_h , 10, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_h , 12, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_v , 8, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_v , 10, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_v , 12, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_hv , 8, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_hv , 10, sse4);
+H2656_MC_8TAP_PROTOTYPES(8tap_hv , 12, sse4);
+
+#define H2656_MC_4TAP_PROTOTYPES(fname, bitd, opt) \
+ H2656_PEL_PROTOTYPE(fname##2, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##4, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##6, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##8, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##12, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##16, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##32, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##64, bitd, opt); \
+ H2656_PEL_PROTOTYPE(fname##128, bitd, opt)
+
+#define H2656_MC_4TAP_PROTOTYPES_SSE4(bitd) \
+ H2656_PEL_PROTOTYPE(pixels2, bitd, sse4); \
+ H2656_MC_4TAP_PROTOTYPES(4tap_h, bitd, sse4); \
+ H2656_MC_4TAP_PROTOTYPES(4tap_v, bitd, sse4); \
+ H2656_MC_4TAP_PROTOTYPES(4tap_hv, bitd, sse4); \
+
+H2656_MC_4TAP_PROTOTYPES_SSE4(8)
+H2656_MC_4TAP_PROTOTYPES_SSE4(10)
+H2656_MC_4TAP_PROTOTYPES_SSE4(12)
+
+#define H2656_MC_8TAP_PROTOTYPES_AVX2(fname) \
+ H2656_PEL_PROTOTYPE(fname##32 , 8, avx2); \
+ H2656_PEL_PROTOTYPE(fname##64 , 8, avx2); \
+ H2656_PEL_PROTOTYPE(fname##128, 8, avx2); \
+ H2656_PEL_PROTOTYPE(fname##16 ,10, avx2); \
+ H2656_PEL_PROTOTYPE(fname##32 ,10, avx2); \
+ H2656_PEL_PROTOTYPE(fname##64 ,10, avx2); \
+ H2656_PEL_PROTOTYPE(fname##128,10, avx2); \
+ H2656_PEL_PROTOTYPE(fname##16 ,12, avx2); \
+ H2656_PEL_PROTOTYPE(fname##32 ,12, avx2); \
+ H2656_PEL_PROTOTYPE(fname##64 ,12, avx2); \
+ H2656_PEL_PROTOTYPE(fname##128,12, avx2) \
+
+H2656_MC_8TAP_PROTOTYPES_AVX2(pixels);
+H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_h);
+H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_v);
+H2656_MC_8TAP_PROTOTYPES_AVX2(8tap_hv);
+H2656_PEL_PROTOTYPE(8tap_hv16, 8, avx2);
+
+H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_h);
+H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_v);
+H2656_MC_8TAP_PROTOTYPES_AVX2(4tap_hv);
+
+#endif
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index eb267453fe..5489701e44 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -715,35 +715,6 @@ SECTION .text
; int height, int mx, int my)
; ******************************
-%macro HEVC_PUT_HEVC_PEL_PIXELS 2
-HEVC_PEL_PIXELS %1, %2
-HEVC_UNI_PEL_PIXELS %1, %2
-HEVC_BI_PEL_PIXELS %1, %2
-%endmacro
-
-%macro HEVC_PEL_PIXELS 2
-cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
- pxor m2, m2
-.loop:
- SIMPLE_LOAD %1, %2, srcq, m0
- MC_PIXEL_COMPUTE %1, %2, 1
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
- %endmacro
-
-%macro HEVC_UNI_PEL_PIXELS 2
-cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
-.loop:
- SIMPLE_LOAD %1, %2, srcq, m0
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-%endmacro
-
%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
pxor m2, m2
@@ -777,32 +748,8 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
%define XMM_REGS 8
%endif
-cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
-%assign %%stride ((%2 + 7)/8)
- EPEL_FILTER %2, mx, m4, m5, rfilter
-.loop:
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m4, m5, 1
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
-%assign %%stride ((%2 + 7)/8)
- movdqa m6, [pw_%2]
- EPEL_FILTER %2, mx, m4, m5, rfilter
-.loop:
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m4, m5
- UNI_COMPUTE %1, %2, m0, m1, m6
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
movdqa m6, [pw_bi_%2]
EPEL_FILTER %2, mx, m4, m5, rfilter
.loop:
@@ -824,36 +771,6 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcst
; int height, int mx, int my, int width)
; ******************************
-cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
- movifnidn myd, mym
- sub srcq, srcstrideq
- EPEL_FILTER %2, my, m4, m5, r3src
- lea r3srcq, [srcstrideq*3]
-.loop:
- EPEL_LOAD %2, srcq, srcstride, %1
- EPEL_COMPUTE %2, %1, m4, m5, 1
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
- movifnidn myd, mym
- movdqa m6, [pw_%2]
- sub srcq, srcstrideq
- EPEL_FILTER %2, my, m4, m5, r3src
- lea r3srcq, [srcstrideq*3]
-.loop:
- EPEL_LOAD %2, srcq, srcstride, %1
- EPEL_COMPUTE %2, %1, m4, m5
- UNI_COMPUTE %1, %2, m0, m1, m6
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
-
-
cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
movifnidn myd, mym
movdqa m6, [pw_bi_%2]
@@ -882,135 +799,6 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcst
; ******************************
%macro HEVC_PUT_HEVC_EPEL_HV 2
-cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
-%assign %%stride ((%2 + 7)/8)
- sub srcq, srcstrideq
- EPEL_HV_FILTER %2
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m8, m1
-%endif
- SWAP m4, m0
- add srcq, srcstrideq
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m9, m1
-%endif
- SWAP m5, m0
- add srcq, srcstrideq
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m10, m1
-%endif
- SWAP m6, m0
- add srcq, srcstrideq
-.loop:
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m11, m1
-%endif
- SWAP m7, m0
- punpcklwd m0, m4, m5
- punpcklwd m2, m6, m7
-%if %1 > 4
- punpckhwd m1, m4, m5
- punpckhwd m3, m6, m7
-%endif
- EPEL_COMPUTE 14, %1, m12, m13
-%if (%1 > 8 && (%2 == 8))
- punpcklwd m4, m8, m9
- punpcklwd m2, m10, m11
- punpckhwd m8, m8, m9
- punpckhwd m3, m10, m11
- EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
-%if cpuflag(avx2)
- vinserti128 m2, m0, xm4, 1
- vperm2i128 m3, m0, m4, q0301
- PEL_10STORE%1 dstq, m2, m3
-%else
- PEL_10STORE%1 dstq, m0, m4
-%endif
-%else
- PEL_10STORE%1 dstq, m0, m1
-%endif
- movdqa m4, m5
- movdqa m5, m6
- movdqa m6, m7
-%if (%1 > 8 && (%2 == 8))
- mova m8, m9
- mova m9, m10
- mova m10, m11
-%endif
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
-%assign %%stride ((%2 + 7)/8)
- sub srcq, srcstrideq
- EPEL_HV_FILTER %2
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m8, m1
-%endif
- SWAP m4, m0
- add srcq, srcstrideq
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m9, m1
-%endif
- SWAP m5, m0
- add srcq, srcstrideq
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m10, m1
-%endif
- SWAP m6, m0
- add srcq, srcstrideq
-.loop:
- EPEL_LOAD %2, srcq-%%stride, %%stride, %1
- EPEL_COMPUTE %2, %1, m14, m15
-%if (%1 > 8 && (%2 == 8))
- SWAP m11, m1
-%endif
- mova m7, m0
- punpcklwd m0, m4, m5
- punpcklwd m2, m6, m7
-%if %1 > 4
- punpckhwd m1, m4, m5
- punpckhwd m3, m6, m7
-%endif
- EPEL_COMPUTE 14, %1, m12, m13
-%if (%1 > 8 && (%2 == 8))
- punpcklwd m4, m8, m9
- punpcklwd m2, m10, m11
- punpckhwd m8, m8, m9
- punpckhwd m3, m10, m11
- EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
- UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
-%else
- UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
-%endif
- PEL_%2STORE%1 dstq, m0, m1
- mova m4, m5
- mova m5, m6
- mova m6, m7
-%if (%1 > 8 && (%2 == 8))
- mova m8, m9
- mova m9, m10
- mova m10, m11
-%endif
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
@@ -1093,34 +881,6 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride,
; ******************************
%macro HEVC_PUT_HEVC_QPEL 2
-cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
- QPEL_FILTER %2, mx
-.loop:
- QPEL_H_LOAD %2, srcq, %1, 10
- QPEL_COMPUTE %1, %2, 1
-%if %2 > 8
- packssdw m0, m1
-%endif
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
- mova m9, [pw_%2]
- QPEL_FILTER %2, mx
-.loop:
- QPEL_H_LOAD %2, srcq, %1, 10
- QPEL_COMPUTE %1, %2
-%if %2 > 8
- packssdw m0, m1
-%endif
- UNI_COMPUTE %1, %2, m0, m1, m9
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
movdqa m9, [pw_bi_%2]
@@ -1148,38 +908,6 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
; int height, int mx, int my, int width)
; ******************************
-cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
- movifnidn myd, mym
- lea r3srcq, [srcstrideq*3]
- QPEL_FILTER %2, my
-.loop:
- QPEL_V_LOAD %2, srcq, srcstride, %1, r7
- QPEL_COMPUTE %1, %2, 1
-%if %2 > 8
- packssdw m0, m1
-%endif
- PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
- movifnidn myd, mym
- movdqa m9, [pw_%2]
- lea r3srcq, [srcstrideq*3]
- QPEL_FILTER %2, my
-.loop:
- QPEL_V_LOAD %2, srcq, srcstride, %1, r8
- QPEL_COMPUTE %1, %2
-%if %2 > 8
- packssdw m0, m1
-%endif
- UNI_COMPUTE %1, %2, m0, m1, m9
- PEL_%2STORE%1 dstq, m0, m1
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
movifnidn myd, mym
@@ -1210,162 +938,6 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride,
; int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
-cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
-%if cpuflag(avx2)
-%assign %%shift 4
-%else
-%assign %%shift 3
-%endif
- sub mxq, 1
- sub myq, 1
- shl mxq, %%shift ; multiply by 32
- shl myq, %%shift ; multiply by 32
- lea r3srcq, [srcstrideq*3]
- sub srcq, r3srcq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m8, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m9, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m10, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m11, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m12, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m13, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m14, m0
- add srcq, srcstrideq
-.loop:
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m15, m0
- punpcklwd m0, m8, m9
- punpcklwd m2, m10, m11
- punpcklwd m4, m12, m13
- punpcklwd m6, m14, m15
-%if %1 > 4
- punpckhwd m1, m8, m9
- punpckhwd m3, m10, m11
- punpckhwd m5, m12, m13
- punpckhwd m7, m14, m15
-%endif
- QPEL_HV_COMPUTE %1, 14, my, ackssdw
- PEL_10STORE%1 dstq, m0, m1
-%if %1 <= 4
- movq m8, m9
- movq m9, m10
- movq m10, m11
- movq m11, m12
- movq m12, m13
- movq m13, m14
- movq m14, m15
-%else
- movdqa m8, m9
- movdqa m9, m10
- movdqa m10, m11
- movdqa m11, m12
- movdqa m12, m13
- movdqa m13, m14
- movdqa m14, m15
-%endif
- LOOP_END dst, src, srcstride
- RET
-
-cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
-%if cpuflag(avx2)
-%assign %%shift 4
-%else
-%assign %%shift 3
-%endif
- sub mxq, 1
- sub myq, 1
- shl mxq, %%shift ; multiply by 32
- shl myq, %%shift ; multiply by 32
- lea r3srcq, [srcstrideq*3]
- sub srcq, r3srcq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m8, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m9, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m10, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m11, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m12, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m13, m0
- add srcq, srcstrideq
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m14, m0
- add srcq, srcstrideq
-.loop:
- QPEL_H_LOAD %2, srcq, %1, 15
- QPEL_HV_COMPUTE %1, %2, mx, ackssdw
- SWAP m15, m0
- punpcklwd m0, m8, m9
- punpcklwd m2, m10, m11
- punpcklwd m4, m12, m13
- punpcklwd m6, m14, m15
-%if %1 > 4
- punpckhwd m1, m8, m9
- punpckhwd m3, m10, m11
- punpckhwd m5, m12, m13
- punpckhwd m7, m14, m15
-%endif
- QPEL_HV_COMPUTE %1, 14, my, ackusdw
- UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
- PEL_%2STORE%1 dstq, m0, m1
-
-%if %1 <= 4
- movq m8, m9
- movq m9, m10
- movq m10, m11
- movq m11, m12
- movq m12, m13
- movq m13, m14
- movq m14, m15
-%else
- mova m8, m9
- mova m9, m10
- mova m10, m11
- mova m11, m12
- mova m12, m13
- mova m13, m14
- mova m14, m15
-%endif
- add dstq, dststrideq ; dst += dststride
- add srcq, srcstrideq ; src += srcstride
- dec heightd ; cmp height
- jnz .loop ; height loop
- RET
cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
@@ -1613,22 +1185,22 @@ WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12
-HEVC_PUT_HEVC_PEL_PIXELS 2, 8
-HEVC_PUT_HEVC_PEL_PIXELS 4, 8
-HEVC_PUT_HEVC_PEL_PIXELS 6, 8
-HEVC_PUT_HEVC_PEL_PIXELS 8, 8
-HEVC_PUT_HEVC_PEL_PIXELS 12, 8
-HEVC_PUT_HEVC_PEL_PIXELS 16, 8
+HEVC_BI_PEL_PIXELS 2, 8
+HEVC_BI_PEL_PIXELS 4, 8
+HEVC_BI_PEL_PIXELS 6, 8
+HEVC_BI_PEL_PIXELS 8, 8
+HEVC_BI_PEL_PIXELS 12, 8
+HEVC_BI_PEL_PIXELS 16, 8
-HEVC_PUT_HEVC_PEL_PIXELS 2, 10
-HEVC_PUT_HEVC_PEL_PIXELS 4, 10
-HEVC_PUT_HEVC_PEL_PIXELS 6, 10
-HEVC_PUT_HEVC_PEL_PIXELS 8, 10
+HEVC_BI_PEL_PIXELS 2, 10
+HEVC_BI_PEL_PIXELS 4, 10
+HEVC_BI_PEL_PIXELS 6, 10
+HEVC_BI_PEL_PIXELS 8, 10
-HEVC_PUT_HEVC_PEL_PIXELS 2, 12
-HEVC_PUT_HEVC_PEL_PIXELS 4, 12
-HEVC_PUT_HEVC_PEL_PIXELS 6, 12
-HEVC_PUT_HEVC_PEL_PIXELS 8, 12
+HEVC_BI_PEL_PIXELS 2, 12
+HEVC_BI_PEL_PIXELS 4, 12
+HEVC_BI_PEL_PIXELS 6, 12
+HEVC_BI_PEL_PIXELS 8, 12
HEVC_PUT_HEVC_EPEL 2, 8
HEVC_PUT_HEVC_EPEL 4, 8
@@ -1693,8 +1265,8 @@ HEVC_PUT_HEVC_QPEL_HV 8, 12
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
-HEVC_PUT_HEVC_PEL_PIXELS 32, 8
-HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+HEVC_BI_PEL_PIXELS 32, 8
+HEVC_BI_PEL_PIXELS 16, 10
HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 6f45e5e0db..5c19330e19 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2013 Seppo Tomperi
- * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
+ * Copyright (c) 2013-2014 Pierre-Edouard Lepere
+ * Copyright (c) 2023-2024 Wu Jianhua
*
* This file is part of FFmpeg.
*
@@ -27,6 +28,7 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "libavcodec/x86/hevcdsp.h"
+#include "libavcodec/x86/h26x/h2656dsp.h"
#define LFC_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, const int *tc, const uint8_t *no_p, const uint8_t *no_q);
@@ -83,6 +85,110 @@ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
IDCT_FUNCS(sse2)
IDCT_FUNCS(avx)
+
+#define ff_hevc_pel_filters ff_hevc_qpel_filters
+#define DECL_HV_FILTER(f) \
+ const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \
+ const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1];
+
+#define FW_PUT(p, a, b, depth, opt) \
+void ff_hevc_put_hevc_ ## a ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
+ int height, intptr_t mx, intptr_t my,int width) \
+{ \
+ DECL_HV_FILTER(p) \
+ ff_h2656_put_ ## b ## _ ## depth ## _##opt(dst, src, srcstride, height, hf, vf, width); \
+}
+
+#define FW_PUT_UNI(p, a, b, depth, opt) \
+void ff_hevc_put_hevc_uni_ ## a ## _ ## depth ## _##opt(uint8_t *dst, ptrdiff_t dststride, \
+ const uint8_t *src, ptrdiff_t srcstride, \
+ int height, intptr_t mx, intptr_t my, int width) \
+{ \
+ DECL_HV_FILTER(p) \
+ ff_h2656_put_uni_ ## b ## _ ## depth ## _##opt(dst, dststride, src, srcstride, height, hf, vf, width); \
+}
+
+#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+#define FW_PUT_FUNCS(p, a, b, depth, opt) \
+ FW_PUT(p, a, b, depth, opt) \
+ FW_PUT_UNI(p, a, b, depth, opt)
+
+#define FW_PEL(w, depth, opt) FW_PUT_FUNCS(pel, pel_pixels##w, pixels##w, depth, opt)
+
+#define FW_DIR(npel, n, w, depth, opt) \
+ FW_PUT_FUNCS(npel, npel ## _h##w, n ## tap_h##w, depth, opt) \
+ FW_PUT_FUNCS(npel, npel ## _v##w, n ## tap_v##w, depth, opt)
+
+#define FW_DIR_HV(npel, n, w, depth, opt) \
+ FW_PUT_FUNCS(npel, npel ## _hv##w, n ## tap_hv##w, depth, opt)
+
+FW_PEL(4, 8, sse4);
+FW_PEL(6, 8, sse4);
+FW_PEL(8, 8, sse4);
+FW_PEL(12, 8, sse4);
+FW_PEL(16, 8, sse4);
+FW_PEL(4, 10, sse4);
+FW_PEL(6, 10, sse4);
+FW_PEL(8, 10, sse4);
+FW_PEL(4, 12, sse4);
+FW_PEL(6, 12, sse4);
+FW_PEL(8, 12, sse4);
+
+#define FW_EPEL(w, depth, opt) FW_DIR(epel, 4, w, depth, opt)
+#define FW_EPEL_HV(w, depth, opt) FW_DIR_HV(epel, 4, w, depth, opt)
+#define FW_EPEL_FUNCS(w, depth, opt) \
+ FW_EPEL(w, depth, opt) \
+ FW_EPEL_HV(w, depth, opt)
+
+FW_EPEL(12, 8, sse4);
+
+FW_EPEL_FUNCS(4, 8, sse4);
+FW_EPEL_FUNCS(6, 8, sse4);
+FW_EPEL_FUNCS(8, 8, sse4);
+FW_EPEL_FUNCS(16, 8, sse4);
+FW_EPEL_FUNCS(4, 10, sse4);
+FW_EPEL_FUNCS(6, 10, sse4);
+FW_EPEL_FUNCS(8, 10, sse4);
+FW_EPEL_FUNCS(4, 12, sse4);
+FW_EPEL_FUNCS(6, 12, sse4);
+FW_EPEL_FUNCS(8, 12, sse4);
+
+#define FW_QPEL(w, depth, opt) FW_DIR(qpel, 8, w, depth, opt)
+#define FW_QPEL_HV(w, depth, opt) FW_DIR_HV(qpel, 8, w, depth, opt)
+#define FW_QPEL_FUNCS(w, depth, opt) \
+ FW_QPEL(w, depth, opt) \
+ FW_QPEL_HV(w, depth, opt)
+
+FW_QPEL(12, 8, sse4);
+FW_QPEL(16, 8, sse4);
+
+FW_QPEL_FUNCS(4, 8, sse4);
+FW_QPEL_FUNCS(8, 8, sse4);
+FW_QPEL_FUNCS(4, 10, sse4);
+FW_QPEL_FUNCS(8, 10, sse4);
+FW_QPEL_FUNCS(4, 12, sse4);
+FW_QPEL_FUNCS(8, 12, sse4);
+
+#if HAVE_AVX2_EXTERNAL
+
+FW_PEL(32, 8, avx2);
+FW_PUT(pel, pel_pixels16, pixels16, 10, avx2);
+
+FW_EPEL(32, 8, avx2);
+FW_EPEL(16, 10, avx2);
+
+FW_EPEL_HV(32, 8, avx2);
+FW_EPEL_HV(16, 10, avx2);
+
+FW_QPEL(32, 8, avx2);
+FW_QPEL(16, 10, avx2);
+
+FW_QPEL_HV(16, 10, avx2);
+
+#endif
+#endif
+
#define mc_rep_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
const uint8_t *_src, ptrdiff_t _srcstride, int height, \
--
2.34.1
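
For reference, the FW_PUT/FW_PUT_UNI wrappers in the hevcdsp_init.c hunk above only translate the HEVC (mx, my) interface into the shared (hf, vf) filter-pointer interface. A hand-expanded instance of FW_PUT(qpel, qpel_h4, 8tap_h4, 8, sse4), shown purely as an illustration of what the macro generates, looks roughly like:

    void ff_hevc_put_hevc_qpel_h4_8_sse4(int16_t *dst, const uint8_t *src,
                                         ptrdiff_t srcstride, int height,
                                         intptr_t mx, intptr_t my, int width)
    {
        /* DECL_HV_FILTER(qpel): select the filter rows for this fractional position */
        const uint8_t *hf = ff_hevc_qpel_filters[mx - 1];
        const uint8_t *vf = ff_hevc_qpel_filters[my - 1];
        /* forward to the shared H.265/H.266 SIMD kernel */
        ff_h2656_put_8tap_h4_8_sse4(dst, src, srcstride, height, hf, vf, width);
    }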
* [FFmpeg-devel] [PATCH v3 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put
[not found] <20240122152527.601122-1-toqsxw@outlook.com>
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm toqsxw
@ 2024-01-22 15:25 ` toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 5/8] avcodec/vvcdec: reuse h26x/h2656_inter.asm to enable x86 optimizations toqsxw
` (3 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: toqsxw @ 2024-01-22 15:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Wu Jianhua
From: Wu Jianhua <toqsxw@outlook.com>
Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
---
libavcodec/x86/h26x/h2656_inter.asm | 32 ++++++++++++++---------------
libavcodec/x86/h26x/h2656dsp.c | 4 ++--
libavcodec/x86/h26x/h2656dsp.h | 2 +-
libavcodec/x86/hevcdsp_init.c | 2 +-
4 files changed, 19 insertions(+), 21 deletions(-)
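
This patch makes the destination stride an explicit parameter of the shared put kernels instead of leaving it hard-coded in the asm (the %define MAX_PB_SIZE 64 removed below), so callers with different intermediate-buffer layouts can share the same code. On the HEVC side the fixed stride is simply passed through, as in this illustrative expansion of the updated FW_PUT wrapper (names as in the hevcdsp_init.c hunk at the end of this patch):

    /* hf/vf come from DECL_HV_FILTER(); 2 * MAX_PB_SIZE is HEVC's fixed
       16-bit intermediate stride (MAX_PB_SIZE == 64 in hevcdsp.h) */
    ff_h2656_put_8tap_h4_8_sse4(dst, 2 * MAX_PB_SIZE, src, srcstride,
                                height, hf, vf, width);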
diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm
index aa296d549c..cbba0c1ea5 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -22,8 +22,6 @@
; */
%include "libavutil/x86/x86util.asm"
-%define MAX_PB_SIZE 64
-
SECTION_RODATA 32
cextern pw_255
cextern pw_512
@@ -342,7 +340,7 @@ SECTION .text
%endmacro
%macro LOOP_END 3
- add %1q, 2*MAX_PB_SIZE ; dst += dststride
+ add %1q, dststrideq ; dst += dststride
add %2q, %3q ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
@@ -539,7 +537,7 @@ SECTION .text
; ******************************
-; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride,
; int height, const int8_t *hf, const int8_t *vf, int width)
; ******************************
@@ -549,7 +547,7 @@ SECTION .text
%endmacro
%macro MC_PIXELS 3
-cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height
+cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height
pxor m2, m2
.loop:
SIMPLE_LOAD %2, %3, srcq, m0
@@ -579,10 +577,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, src, srcstride, height
%endif
; ******************************
-; void %1_put_4tap_hX(int16_t *dst,
+; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride,
; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width);
; ******************************
-cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf
+cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, height, hf
%assign %%stride ((%3 + 7)/8)
MC_4TAP_FILTER %3, hf, m4, m5
.loop:
@@ -612,10 +610,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, dststride, src, srcstride,
RET
; ******************************
-; void %1_put_4tap_v(int16_t *dst,
+; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride,
; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width)
; ******************************
-cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, r3src, vf
+cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, vf
sub srcq, srcstrideq
MC_4TAP_FILTER %3, vf, m4, m5
lea r3srcq, [srcstrideq*3]
@@ -649,10 +647,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride,
%macro PUT_4TAP_HV 3
; ******************************
-; void put_4tap_hv(int16_t *dst,
+; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride,
; const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, int8_t *vf, int width)
; ******************************
-cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, r3src
+cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, height, hf, vf, r3src
%assign %%stride ((%3 + 7)/8)
sub srcq, srcstrideq
MC_4TAP_HV_FILTER %3
@@ -784,12 +782,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, heig
%endmacro
; ******************************
-; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride,
; int height, const int8_t *hf, const int8_t *vf, int width)
; ******************************
%macro PUT_8TAP 3
-cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf
+cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, hf
MC_8TAP_FILTER %3, hf
.loop:
MC_8TAP_H_LOAD %3, srcq, %2, 10
@@ -824,10 +822,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, dststride, src, srcstride, heigh
; ******************************
-; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride,
; int height, const int8_t *hf, const int8_t *vf, int width)
; ******************************
-cglobal %1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf
+cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height, r3src, vf
MC_8TAP_FILTER %3, vf
lea r3srcq, [srcstrideq*3]
.loop:
@@ -866,11 +864,11 @@ cglobal %1_put_uni_8tap_v%2_%3, 7, 9, 16, dst, dststride, src, srcstride, height
; ******************************
-; void put_8tap_hvX_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_hvX_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t srcstride,
; int height, const int8_t *hf, const int8_t *vf, int width)
; ******************************
%macro PUT_8TAP_HV 3
-cglobal %1_put_8tap_hv%2_%3, 6, 7, 16, 0 - mmsize*16, dst, src, srcstride, height, hf, vf, r3src
+cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*16, dst, dststride, src, srcstride, height, hf, vf, r3src
MC_8TAP_FILTER %3, hf, 0
lea hfq, [rsp]
MC_8TAP_FILTER %3, vf, 8*mmsize
diff --git a/libavcodec/x86/h26x/h2656dsp.c b/libavcodec/x86/h26x/h2656dsp.c
index 27769f9c55..7ef1234936 100644
--- a/libavcodec/x86/h26x/h2656dsp.c
+++ b/libavcodec/x86/h26x/h2656dsp.c
@@ -24,7 +24,7 @@
#include "h2656dsp.h"
#define mc_rep_func(name, bitd, step, W, opt) \
-void ff_h2656_put_##name##W##_##bitd##_##opt(int16_t *_dst, \
+void ff_h2656_put_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride, \
const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width) \
{ \
int i; \
@@ -32,7 +32,7 @@ void ff_h2656_put_##name##W##_##bitd##_##opt(int16_t *_dst,
for (i = 0; i < W; i += step) { \
const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
dst = _dst + i; \
- ff_h2656_put_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, hf, vf, width); \
+ ff_h2656_put_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, height, hf, vf, width); \
} \
}
diff --git a/libavcodec/x86/h26x/h2656dsp.h b/libavcodec/x86/h26x/h2656dsp.h
index 8a2ab13607..e31aae6b0d 100644
--- a/libavcodec/x86/h26x/h2656dsp.h
+++ b/libavcodec/x86/h26x/h2656dsp.h
@@ -30,7 +30,7 @@
#include <stdlib.h>
#define H2656_PEL_PROTOTYPE(name, D, opt) \
-void ff_h2656_put_ ## name ## _ ## D ## _##opt(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width); \
+void ff_h2656_put_ ## name ## _ ## D ## _##opt(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width); \
void ff_h2656_put_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width); \
#define H2656_MC_8TAP_PROTOTYPES(fname, bitd, opt) \
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 5c19330e19..e0dc82eef0 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -96,7 +96,7 @@ void ff_hevc_put_hevc_ ## a ## _ ## depth ## _##opt(int16_t *dst, const uint8_t
int height, intptr_t mx, intptr_t my,int width) \
{ \
DECL_HV_FILTER(p) \
- ff_h2656_put_ ## b ## _ ## depth ## _##opt(dst, src, srcstride, height, hf, vf, width); \
+ ff_h2656_put_ ## b ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
}
#define FW_PUT_UNI(p, a, b, depth, opt) \
--
2.34.1
* [FFmpeg-devel] [PATCH v3 5/8] avcodec/vvcdec: reuse h26x/h2656_inter.asm to enable x86 optimizations
[not found] <20240122152527.601122-1-toqsxw@outlook.com>
` (2 preceding siblings ...)
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put toqsxw
@ 2024-01-22 15:25 ` toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 6/8] tests/checkasm: add checkasm_check_vvc_mc toqsxw
` (2 subsequent siblings)
6 siblings, 0 replies; 10+ messages in thread
From: toqsxw @ 2024-01-22 15:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Wu Jianhua
From: Wu Jianhua <toqsxw@outlook.com>
Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
---
libavcodec/Makefile | 1 +
libavcodec/vvc/vvcdsp.c | 4 +
libavcodec/vvc/vvcdsp.h | 2 +
libavcodec/x86/vvc/Makefile | 6 +
libavcodec/x86/vvc/vvcdsp_init.c | 202 +++++++++++++++++++++++++++++++
5 files changed, 215 insertions(+)
create mode 100644 libavcodec/x86/vvc/Makefile
create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c
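
As a reading aid for the PEL_LINK/MC_*_LINKS macros in vvcdsp_init.c below: each link fills one slot of the inter-prediction function tables, indexed roughly as [LUMA/CHROMA][log2(width) - 1][vertical][horizontal]. For instance, PEL_LINK(c->inter.put, LUMA, 1, 0, 0, pixels4, 8, sse4) expands to approximately:

    c->inter.put[LUMA][1][0][0]     = ff_vvc_put_pixels4_8_sse4;
    c->inter.put_uni[LUMA][1][0][0] = ff_h2656_put_uni_pixels4_8_sse4;

i.e. the copy ("pixels") case for 4-wide luma blocks at 8-bit depth.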
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb42095165..ce33631b60 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -65,6 +65,7 @@ OBJS = ac3_parser.o \
# subsystems
include $(SRC_PATH)/libavcodec/vvc/Makefile
+include $(SRC_PATH)/libavcodec/x86/vvc/Makefile
OBJS-$(CONFIG_AANDCTTABLES) += aandcttab.o
OBJS-$(CONFIG_AC3DSP) += ac3dsp.o ac3.o ac3tab.o
OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o mpeg4audio_sample_rates.o
diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c
index c82ea7be30..c542be5258 100644
--- a/libavcodec/vvc/vvcdsp.c
+++ b/libavcodec/vvc/vvcdsp.c
@@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
VVC_DSP(8);
break;
}
+
+#if ARCH_X86
+ ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
+#endif
}
diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h
index b5a63c5833..6f59e73654 100644
--- a/libavcodec/vvc/vvcdsp.h
+++ b/libavcodec/vvc/vvcdsp.h
@@ -167,4 +167,6 @@ typedef struct VVCDSPContext {
void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
+void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
+
#endif /* AVCODEC_VVC_VVCDSP_H */
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
new file mode 100644
index 0000000000..b4acc22501
--- /dev/null
+++ b/libavcodec/x86/vvc/Makefile
@@ -0,0 +1,6 @@
+clean::
+ $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
+
+OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
+X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \
+ x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..c197cdb4cc
--- /dev/null
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -0,0 +1,202 @@
+/*
+ * VVC DSP init for x86
+ *
+ * Copyright (C) 2022-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vvc/vvcdec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvcdsp.h"
+#include "libavcodec/x86/h26x/h2656dsp.h"
+
+#define FW_PUT(name, depth, opt) \
+static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
+ int height, const int8_t *hf, const int8_t *vf, int width) \
+{ \
+ ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
+}
+
+#define FW_PUT_TAP(fname, bitd, opt ) \
+ FW_PUT(fname##4, bitd, opt ); \
+ FW_PUT(fname##8, bitd, opt ); \
+ FW_PUT(fname##16, bitd, opt ); \
+ FW_PUT(fname##32, bitd, opt ); \
+ FW_PUT(fname##64, bitd, opt ); \
+ FW_PUT(fname##128, bitd, opt ); \
+
+#define FW_PUT_4TAP(fname, bitd, opt) \
+ FW_PUT(fname ## 2, bitd, opt) \
+ FW_PUT_TAP(fname, bitd, opt)
+
+#define FW_PUT_4TAP_SSE4(bitd) \
+ FW_PUT_4TAP(pixels, bitd, sse4) \
+ FW_PUT_4TAP(4tap_h, bitd, sse4) \
+ FW_PUT_4TAP(4tap_v, bitd, sse4) \
+ FW_PUT_4TAP(4tap_hv, bitd, sse4)
+
+#define FW_PUT_8TAP_SSE4(bitd) \
+ FW_PUT_TAP(8tap_h, bitd, sse4) \
+ FW_PUT_TAP(8tap_v, bitd, sse4) \
+ FW_PUT_TAP(8tap_hv, bitd, sse4)
+
+#define FW_PUT_SSE4(bitd) \
+ FW_PUT_4TAP_SSE4(bitd) \
+ FW_PUT_8TAP_SSE4(bitd)
+
+FW_PUT_SSE4( 8);
+FW_PUT_SSE4(10);
+FW_PUT_SSE4(12);
+
+#define FW_PUT_TAP_AVX2(n, bitd) \
+ FW_PUT(n ## tap_h32, bitd, avx2) \
+ FW_PUT(n ## tap_h64, bitd, avx2) \
+ FW_PUT(n ## tap_h128, bitd, avx2) \
+ FW_PUT(n ## tap_v32, bitd, avx2) \
+ FW_PUT(n ## tap_v64, bitd, avx2) \
+ FW_PUT(n ## tap_v128, bitd, avx2)
+
+#define FW_PUT_AVX2(bitd) \
+ FW_PUT(pixels32, bitd, avx2) \
+ FW_PUT(pixels64, bitd, avx2) \
+ FW_PUT(pixels128, bitd, avx2) \
+ FW_PUT_TAP_AVX2(4, bitd) \
+ FW_PUT_TAP_AVX2(8, bitd) \
+
+FW_PUT_AVX2( 8)
+FW_PUT_AVX2(10)
+FW_PUT_AVX2(12)
+
+#define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
+ FW_PUT(n ## tap_h16, bitd, avx2) \
+ FW_PUT(n ## tap_v16, bitd, avx2) \
+ FW_PUT(n ## tap_hv16, bitd, avx2) \
+ FW_PUT(n ## tap_hv32, bitd, avx2) \
+ FW_PUT(n ## tap_hv64, bitd, avx2) \
+ FW_PUT(n ## tap_hv128, bitd, avx2)
+
+#define FW_PUT_16BPC_AVX2(bitd) \
+ FW_PUT(pixels16, bitd, avx2) \
+ FW_PUT_TAP_16BPC_AVX2(4, bitd) \
+ FW_PUT_TAP_16BPC_AVX2(8, bitd);
+
+FW_PUT_16BPC_AVX2(10);
+FW_PUT_16BPC_AVX2(12);
+
+#define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt) \
+ dst[C][W][idx1][idx2] = ff_vvc_put_## name ## _ ## D ## _##opt; \
+ dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _##opt; \
+
+#define MC_TAP_LINKS(pointer, C, my, mx, fname, bitd, opt ) \
+ PEL_LINK(pointer, C, 1, my , mx , fname##4 , bitd, opt ); \
+ PEL_LINK(pointer, C, 2, my , mx , fname##8 , bitd, opt ); \
+ PEL_LINK(pointer, C, 3, my , mx , fname##16, bitd, opt ); \
+ PEL_LINK(pointer, C, 4, my , mx , fname##32, bitd, opt ); \
+ PEL_LINK(pointer, C, 5, my , mx , fname##64, bitd, opt ); \
+ PEL_LINK(pointer, C, 6, my , mx , fname##128, bitd, opt );
+
+#define MC_8TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
+ MC_TAP_LINKS(pointer, LUMA, my, mx, fname, bitd, opt)
+
+#define MC_8TAP_LINKS_SSE4(bd) \
+ MC_8TAP_LINKS(c->inter.put, 0, 0, pixels, bd, sse4); \
+ MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h, bd, sse4); \
+ MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v, bd, sse4); \
+ MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bd, sse4)
+
+#define MC_4TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
+ PEL_LINK(pointer, CHROMA, 0, my , mx , fname##2 , bitd, opt ); \
+ MC_TAP_LINKS(pointer, CHROMA, my, mx, fname, bitd, opt) \
+
+#define MC_4TAP_LINKS_SSE4(bd) \
+ MC_4TAP_LINKS(c->inter.put, 0, 0, pixels, bd, sse4); \
+ MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h, bd, sse4); \
+ MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v, bd, sse4); \
+ MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bd, sse4)
+
+#define MC_LINK_SSE4(bd) \
+ MC_4TAP_LINKS_SSE4(bd) \
+ MC_8TAP_LINKS_SSE4(bd)
+
+#define MC_TAP_LINKS_AVX2(C,tap,bd) do { \
+ PEL_LINK(c->inter.put, C, 4, 0, 0, pixels32, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 5, 0, 0, pixels64, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 6, 0, 0, pixels128, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 4, 0, 1, tap##tap_h32, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 5, 0, 1, tap##tap_h64, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 6, 0, 1, tap##tap_h128, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 4, 1, 0, tap##tap_v32, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 5, 1, 0, tap##tap_v64, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 6, 1, 0, tap##tap_v128, bd, avx2) \
+ } while (0)
+
+#define MC_LINKS_AVX2(bd) \
+ MC_TAP_LINKS_AVX2(LUMA, 8, bd); \
+ MC_TAP_LINKS_AVX2(CHROMA, 4, bd);
+
+#define MC_TAP_LINKS_16BPC_AVX2(C, tap, bd) do { \
+ PEL_LINK(c->inter.put, C, 3, 0, 0, pixels16, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 3, 0, 1, tap##tap_h16, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 3, 1, 0, tap##tap_v16, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 3, 1, 1, tap##tap_hv16, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 4, 1, 1, tap##tap_hv32, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 5, 1, 1, tap##tap_hv64, bd, avx2) \
+ PEL_LINK(c->inter.put, C, 6, 1, 1, tap##tap_hv128, bd, avx2) \
+ } while (0)
+
+#define MC_LINKS_16BPC_AVX2(bd) \
+ MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \
+ MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
+
+void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
+{
+ const int cpu_flags = av_get_cpu_flags();
+
+ if (ARCH_X86_64) {
+ if (bd == 8) {
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ MC_LINK_SSE4(8);
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ MC_LINKS_AVX2(8);
+ }
+ } else if (bd == 10) {
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ MC_LINK_SSE4(10);
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ MC_LINKS_AVX2(10);
+ MC_LINKS_16BPC_AVX2(10);
+ }
+ } else if (bd == 12) {
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ MC_LINK_SSE4(12);
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ MC_LINKS_AVX2(12);
+ MC_LINKS_16BPC_AVX2(12);
+ }
+ }
+ }
+}
--
2.34.1
* [FFmpeg-devel] [PATCH v3 6/8] tests/checkasm: add checkasm_check_vvc_mc
[not found] <20240122152527.601122-1-toqsxw@outlook.com>
` (3 preceding siblings ...)
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 5/8] avcodec/vvcdec: reuse h26x/h2656_inter.asm to enable x86 optimizations toqsxw
@ 2024-01-22 15:25 ` toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 8/8] tests/checkasm/vvc_mc: add check_avg toqsxw
6 siblings, 0 replies; 10+ messages in thread
From: toqsxw @ 2024-01-22 15:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Wu Jianhua
From: Wu Jianhua <toqsxw@outlook.com>
Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
---
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/vvc_mc.c | 270 ++++++++++++++++++++++++++++++++++++++
4 files changed, 275 insertions(+)
create mode 100644 tests/checkasm/vvc_mc.c
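
Once built, the new test can be exercised on its own with something along the lines of make checkasm && tests/checkasm/checkasm --test=vvc_mc (adding --bench to get timing figures such as the ones quoted with patch 7/8); the exact binary location depends on the build setup.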
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3b5b54352b..3562acb2b2 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o
AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o
AVCODECOBJS-$(CONFIG_VORBIS_DECODER) += vorbisdsp.o
AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_mc.o
CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 87f24c77ca..36a97957e5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -194,6 +194,9 @@ static const struct {
#if CONFIG_VORBIS_DECODER
{ "vorbisdsp", checkasm_check_vorbisdsp },
#endif
+ #if CONFIG_VVC_DECODER
+ { "vvc_mc", checkasm_check_vvc_mc },
+ #endif
#endif
#if CONFIG_AVFILTER
#if CONFIG_AFIR_FILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4db8c495ea..53cb3ccfbf 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void);
void checkasm_check_vp9dsp(void);
void checkasm_check_videodsp(void);
void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_mc(void);
struct CheckasmPerf;
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
new file mode 100644
index 0000000000..711280deec
--- /dev/null
+++ b/tests/checkasm/vvc_mc.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvc_data.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0xffffffff };
+static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
+
+#define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
+#define EXTRA_BEFORE 3
+#define EXTRA_AFTER 4
+#define SRC_EXTRA (EXTRA_BEFORE + EXTRA_AFTER) * 2
+#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA)
+#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2)
+#define SRC_OFFSET ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE)
+
+#define randomize_buffers(buf0, buf1, size, mask) \
+ do { \
+ int k; \
+ for (k = 0; k < size; k += 4) { \
+ uint32_t r = rnd() & mask; \
+ AV_WN32A(buf0 + k, r); \
+ AV_WN32A(buf1 + k, r); \
+ } \
+ } while (0)
+
+#define randomize_pixels(buf0, buf1, size) \
+ do { \
+ uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \
+ randomize_buffers(buf0, buf1, size, mask); \
+ } while (0)
+
+#define randomize_avg_src(buf0, buf1, size) \
+ do { \
+ uint32_t mask = 0x3fff3fff; \
+ randomize_buffers(buf0, buf1, size, mask); \
+ } while (0)
+
+static void check_put_vvc_luma(void)
+{
+ LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]);
+ LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]);
+ LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+ VVCDSPContext c;
+
+ declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t *dst, const uint8_t *src, const ptrdiff_t src_stride,
+ const int height, const int8_t *hf, const int8_t *vf, const int width);
+
+ for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+ randomize_pixels(src0, src1, SRC_BUF_SIZE);
+ ff_vvc_dsp_init(&c, bit_depth);
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ for (int h = 4; h <= MAX_CTU_SIZE; h *= 2) {
+ for (int w = 4; w <= MAX_CTU_SIZE; w *= 2) {
+ const int idx = av_log2(w) - 1;
+ const int mx = rnd() % 16;
+ const int my = rnd() % 16;
+ const int8_t *hf = ff_vvc_inter_luma_filters[rnd() % 3][mx];
+ const int8_t *vf = ff_vvc_inter_luma_filters[rnd() % 3][my];
+ const char *type;
+ switch ((j << 1) | i) {
+ case 0: type = "put_luma_pixels"; break; // 0 0
+ case 1: type = "put_luma_h"; break; // 0 1
+ case 2: type = "put_luma_v"; break; // 1 0
+ case 3: type = "put_luma_hv"; break; // 1 1
+ }
+ if (check_func(c.inter.put[LUMA][idx][j][i], "%s_%d_%dx%d", type, bit_depth, w, h)) {
+ memset(dst0, 0, DST_BUF_SIZE);
+ memset(dst1, 0, DST_BUF_SIZE);
+ call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ if (memcmp(dst0, dst1, DST_BUF_SIZE))
+ fail();
+ if (w == h)
+ bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ }
+ }
+ }
+ }
+ }
+ }
+ report("put_luma");
+}
+
+static void check_put_vvc_luma_uni(void)
+{
+ LOCAL_ALIGNED_32(uint8_t, dst0, [DST_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, dst1, [DST_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+
+ VVCDSPContext c;
+ declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int width);
+
+ for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+ ff_vvc_dsp_init(&c, bit_depth);
+ randomize_pixels(src0, src1, SRC_BUF_SIZE);
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ for (int h = 4; h <= MAX_CTU_SIZE; h *= 2) {
+ for (int w = 4; w <= MAX_CTU_SIZE; w *= 2) {
+ const int idx = av_log2(w) - 1;
+ const int mx = rnd() % VVC_INTER_LUMA_FACTS;
+ const int my = rnd() % VVC_INTER_LUMA_FACTS;
+ const int8_t *hf = ff_vvc_inter_luma_filters[rnd() % VVC_INTER_FILTER_TYPES][mx];
+ const int8_t *vf = ff_vvc_inter_luma_filters[rnd() % VVC_INTER_FILTER_TYPES][my];
+ const char *type;
+
+ switch ((j << 1) | i) {
+ case 0: type = "put_uni_pixels"; break; // 0 0
+ case 1: type = "put_uni_h"; break; // 0 1
+ case 2: type = "put_uni_v"; break; // 1 0
+ case 3: type = "put_uni_hv"; break; // 1 1
+ }
+
+ if (check_func(c.inter.put_uni[LUMA][idx][j][i], "%s_luma_%d_%dx%d", type, bit_depth, w, h)) {
+ memset(dst0, 0, DST_BUF_SIZE);
+ memset(dst1, 0, DST_BUF_SIZE);
+ call_ref(dst0, PIXEL_STRIDE, src0 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ call_new(dst1, PIXEL_STRIDE, src1 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ if (memcmp(dst0, dst1, DST_BUF_SIZE))
+ fail();
+ if (w == h)
+ bench_new(dst1, PIXEL_STRIDE, src1 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ }
+ }
+ }
+ }
+ }
+ }
+ report("put_uni_luma");
+}
+
+static void check_put_vvc_chroma(void)
+{
+ LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]);
+ LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]);
+ LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+ VVCDSPContext c;
+
+ declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t *dst, const uint8_t *src, const ptrdiff_t src_stride,
+ const int height, const int8_t *hf, const int8_t *vf, const int width);
+
+ for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+ randomize_pixels(src0, src1, SRC_BUF_SIZE);
+ ff_vvc_dsp_init(&c, bit_depth);
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) {
+ for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) {
+ const int idx = av_log2(w) - 1;
+ const int mx = rnd() % VVC_INTER_CHROMA_FACTS;
+ const int my = rnd() % VVC_INTER_CHROMA_FACTS;
+ const int8_t *hf = ff_vvc_inter_chroma_filters[rnd() % VVC_INTER_FILTER_TYPES][mx];
+ const int8_t *vf = ff_vvc_inter_chroma_filters[rnd() % VVC_INTER_FILTER_TYPES][my];
+ const char *type;
+ switch ((j << 1) | i) {
+ case 0: type = "put_chroma_pixels"; break; // 0 0
+ case 1: type = "put_chroma_h"; break; // 0 1
+ case 2: type = "put_chroma_v"; break; // 1 0
+ case 3: type = "put_chroma_hv"; break; // 1 1
+ }
+ if (check_func(c.inter.put[CHROMA][idx][j][i], "%s_%d_%dx%d", type, bit_depth, w, h)) {
+ memset(dst0, 0, DST_BUF_SIZE);
+ memset(dst1, 0, DST_BUF_SIZE);
+ call_ref(dst0, src0 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ call_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ if (memcmp(dst0, dst1, DST_BUF_SIZE))
+ fail();
+ if (w == h)
+ bench_new(dst1, src1 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ }
+ }
+ }
+ }
+ }
+ }
+ report("put_chroma");
+}
+
+static void check_put_vvc_chroma_uni(void)
+{
+ LOCAL_ALIGNED_32(uint8_t, dst0, [DST_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, dst1, [DST_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+
+ VVCDSPContext c;
+ declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int width);
+
+ for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+ ff_vvc_dsp_init(&c, bit_depth);
+ randomize_pixels(src0, src1, SRC_BUF_SIZE);
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ for (int h = 4; h <= MAX_CTU_SIZE; h *= 2) {
+ for (int w = 4; w <= MAX_CTU_SIZE; w *= 2) {
+ const int idx = av_log2(w) - 1;
+ const int mx = rnd() % VVC_INTER_CHROMA_FACTS;
+ const int my = rnd() % VVC_INTER_CHROMA_FACTS;
+ const int8_t *hf = ff_vvc_inter_chroma_filters[rnd() % VVC_INTER_FILTER_TYPES][mx];
+ const int8_t *vf = ff_vvc_inter_chroma_filters[rnd() % VVC_INTER_FILTER_TYPES][my];
+ const char *type;
+
+ switch ((j << 1) | i) {
+ case 0: type = "put_uni_pixels"; break; // 0 0
+ case 1: type = "put_uni_h"; break; // 0 1
+ case 2: type = "put_uni_v"; break; // 1 0
+ case 3: type = "put_uni_hv"; break; // 1 1
+ }
+
+ if (check_func(c.inter.put_uni[CHROMA][idx][j][i], "%s_chroma_%d_%dx%d", type, bit_depth, w, h)) {
+ memset(dst0, 0, DST_BUF_SIZE);
+ memset(dst1, 0, DST_BUF_SIZE);
+ call_ref(dst0, PIXEL_STRIDE, src0 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ call_new(dst1, PIXEL_STRIDE, src1 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ if (memcmp(dst0, dst1, DST_BUF_SIZE))
+ fail();
+ if (w == h)
+ bench_new(dst1, PIXEL_STRIDE, src1 + SRC_OFFSET, PIXEL_STRIDE, h, hf, vf, w);
+ }
+ }
+ }
+ }
+ }
+ }
+ report("put_uni_chroma");
+}
+
+void checkasm_check_vvc_mc(void)
+{
+ check_put_vvc_luma();
+ check_put_vvc_luma_uni();
+ check_put_vvc_chroma();
+ check_put_vvc_chroma_uni();
+}
--
2.34.1
* [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
[not found] <20240122152527.601122-1-toqsxw@outlook.com>
` (4 preceding siblings ...)
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 6/8] tests/checkasm: add checkasm_check_vvc_mc toqsxw
@ 2024-01-22 15:25 ` toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 8/8] tests/checkasm/vvc_mc: add check_avg toqsxw
6 siblings, 0 replies; 10+ messages in thread
From: toqsxw @ 2024-01-22 15:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Wu Jianhua
From: Wu Jianhua <toqsxw@outlook.com>
The avg/avg_w functions are based on dav1d.
See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm
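
The figures below are benchmark numbers for the C reference versus the new AVX2 code (lower is better). For example, for the 8-bit 128x128 block the AVX2 version runs in roughly 1/78 of the time of the C code (4332.6 vs 339103.1).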
vvc_avg_8_2x2_c: 71.6
vvc_avg_8_2x2_avx2: 26.8
vvc_avg_8_2x4_c: 140.8
vvc_avg_8_2x4_avx2: 34.6
vvc_avg_8_2x8_c: 410.3
vvc_avg_8_2x8_avx2: 41.3
vvc_avg_8_2x16_c: 769.3
vvc_avg_8_2x16_avx2: 60.3
vvc_avg_8_2x32_c: 1669.6
vvc_avg_8_2x32_avx2: 105.1
vvc_avg_8_2x64_c: 1978.3
vvc_avg_8_2x64_avx2: 425.8
vvc_avg_8_2x128_c: 6536.8
vvc_avg_8_2x128_avx2: 1315.1
vvc_avg_8_4x2_c: 155.6
vvc_avg_8_4x2_avx2: 26.1
vvc_avg_8_4x4_c: 250.3
vvc_avg_8_4x4_avx2: 31.3
vvc_avg_8_4x8_c: 831.8
vvc_avg_8_4x8_avx2: 41.3
vvc_avg_8_4x16_c: 1461.1
vvc_avg_8_4x16_avx2: 57.1
vvc_avg_8_4x32_c: 2821.6
vvc_avg_8_4x32_avx2: 105.1
vvc_avg_8_4x64_c: 3615.8
vvc_avg_8_4x64_avx2: 412.6
vvc_avg_8_4x128_c: 11962.6
vvc_avg_8_4x128_avx2: 1274.3
vvc_avg_8_8x2_c: 215.8
vvc_avg_8_8x2_avx2: 29.1
vvc_avg_8_8x4_c: 430.6
vvc_avg_8_8x4_avx2: 37.6
vvc_avg_8_8x8_c: 1463.3
vvc_avg_8_8x8_avx2: 51.8
vvc_avg_8_8x16_c: 2630.1
vvc_avg_8_8x16_avx2: 97.6
vvc_avg_8_8x32_c: 5813.8
vvc_avg_8_8x32_avx2: 196.6
vvc_avg_8_8x64_c: 6687.3
vvc_avg_8_8x64_avx2: 487.8
vvc_avg_8_8x128_c: 13178.6
vvc_avg_8_8x128_avx2: 1290.6
vvc_avg_8_16x2_c: 443.8
vvc_avg_8_16x2_avx2: 28.3
vvc_avg_8_16x4_c: 1253.3
vvc_avg_8_16x4_avx2: 32.1
vvc_avg_8_16x8_c: 2236.3
vvc_avg_8_16x8_avx2: 44.3
vvc_avg_8_16x16_c: 5127.8
vvc_avg_8_16x16_avx2: 63.3
vvc_avg_8_16x32_c: 6573.3
vvc_avg_8_16x32_avx2: 223.6
vvc_avg_8_16x64_c: 30311.8
vvc_avg_8_16x64_avx2: 437.8
vvc_avg_8_16x128_c: 25693.3
vvc_avg_8_16x128_avx2: 1266.8
vvc_avg_8_32x2_c: 954.6
vvc_avg_8_32x2_avx2: 32.1
vvc_avg_8_32x4_c: 2359.6
vvc_avg_8_32x4_avx2: 39.6
vvc_avg_8_32x8_c: 5703.6
vvc_avg_8_32x8_avx2: 57.1
vvc_avg_8_32x16_c: 9967.6
vvc_avg_8_32x16_avx2: 107.1
vvc_avg_8_32x32_c: 21327.6
vvc_avg_8_32x32_avx2: 272.6
vvc_avg_8_32x64_c: 39240.8
vvc_avg_8_32x64_avx2: 529.6
vvc_avg_8_32x128_c: 52580.8
vvc_avg_8_32x128_avx2: 1338.8
vvc_avg_8_64x2_c: 1647.3
vvc_avg_8_64x2_avx2: 38.8
vvc_avg_8_64x4_c: 5130.1
vvc_avg_8_64x4_avx2: 58.8
vvc_avg_8_64x8_c: 6529.3
vvc_avg_8_64x8_avx2: 88.3
vvc_avg_8_64x16_c: 19913.6
vvc_avg_8_64x16_avx2: 162.3
vvc_avg_8_64x32_c: 39360.8
vvc_avg_8_64x32_avx2: 295.8
vvc_avg_8_64x64_c: 49658.3
vvc_avg_8_64x64_avx2: 784.1
vvc_avg_8_64x128_c: 108513.1
vvc_avg_8_64x128_avx2: 1977.1
vvc_avg_8_128x2_c: 3226.1
vvc_avg_8_128x2_avx2: 61.1
vvc_avg_8_128x4_c: 10280.3
vvc_avg_8_128x4_avx2: 94.6
vvc_avg_8_128x8_c: 18079.3
vvc_avg_8_128x8_avx2: 155.3
vvc_avg_8_128x16_c: 45121.8
vvc_avg_8_128x16_avx2: 285.3
vvc_avg_8_128x32_c: 48651.8
vvc_avg_8_128x32_avx2: 581.6
vvc_avg_8_128x64_c: 165078.6
vvc_avg_8_128x64_avx2: 1942.8
vvc_avg_8_128x128_c: 339103.1
vvc_avg_8_128x128_avx2: 4332.6
vvc_avg_10_2x2_c: 144.3
vvc_avg_10_2x2_avx2: 26.8
vvc_avg_10_2x4_c: 142.6
vvc_avg_10_2x4_avx2: 45.3
vvc_avg_10_2x8_c: 478.1
vvc_avg_10_2x8_avx2: 38.1
vvc_avg_10_2x16_c: 518.3
vvc_avg_10_2x16_avx2: 58.1
vvc_avg_10_2x32_c: 2059.8
vvc_avg_10_2x32_avx2: 93.1
vvc_avg_10_2x64_c: 2383.8
vvc_avg_10_2x64_avx2: 714.8
vvc_avg_10_2x128_c: 4498.3
vvc_avg_10_2x128_avx2: 1466.3
vvc_avg_10_4x2_c: 228.6
vvc_avg_10_4x2_avx2: 26.8
vvc_avg_10_4x4_c: 378.3
vvc_avg_10_4x4_avx2: 30.6
vvc_avg_10_4x8_c: 866.8
vvc_avg_10_4x8_avx2: 44.6
vvc_avg_10_4x16_c: 1018.1
vvc_avg_10_4x16_avx2: 58.1
vvc_avg_10_4x32_c: 3590.8
vvc_avg_10_4x32_avx2: 128.8
vvc_avg_10_4x64_c: 4200.8
vvc_avg_10_4x64_avx2: 663.6
vvc_avg_10_4x128_c: 8450.8
vvc_avg_10_4x128_avx2: 1531.8
vvc_avg_10_8x2_c: 369.3
vvc_avg_10_8x2_avx2: 28.3
vvc_avg_10_8x4_c: 513.8
vvc_avg_10_8x4_avx2: 32.1
vvc_avg_10_8x8_c: 1720.3
vvc_avg_10_8x8_avx2: 49.1
vvc_avg_10_8x16_c: 1894.8
vvc_avg_10_8x16_avx2: 71.6
vvc_avg_10_8x32_c: 3931.3
vvc_avg_10_8x32_avx2: 148.1
vvc_avg_10_8x64_c: 7964.3
vvc_avg_10_8x64_avx2: 613.1
vvc_avg_10_8x128_c: 15540.1
vvc_avg_10_8x128_avx2: 1585.1
vvc_avg_10_16x2_c: 877.3
vvc_avg_10_16x2_avx2: 27.6
vvc_avg_10_16x4_c: 955.8
vvc_avg_10_16x4_avx2: 29.8
vvc_avg_10_16x8_c: 3419.6
vvc_avg_10_16x8_avx2: 62.6
vvc_avg_10_16x16_c: 3826.8
vvc_avg_10_16x16_avx2: 54.3
vvc_avg_10_16x32_c: 7655.3
vvc_avg_10_16x32_avx2: 86.3
vvc_avg_10_16x64_c: 30011.1
vvc_avg_10_16x64_avx2: 692.6
vvc_avg_10_16x128_c: 47894.8
vvc_avg_10_16x128_avx2: 1580.3
vvc_avg_10_32x2_c: 944.3
vvc_avg_10_32x2_avx2: 29.8
vvc_avg_10_32x4_c: 2022.6
vvc_avg_10_32x4_avx2: 35.1
vvc_avg_10_32x8_c: 6148.8
vvc_avg_10_32x8_avx2: 51.3
vvc_avg_10_32x16_c: 12601.6
vvc_avg_10_32x16_avx2: 70.8
vvc_avg_10_32x32_c: 15958.6
vvc_avg_10_32x32_avx2: 124.3
vvc_avg_10_32x64_c: 31784.6
vvc_avg_10_32x64_avx2: 757.3
vvc_avg_10_32x128_c: 63892.8
vvc_avg_10_32x128_avx2: 1711.3
vvc_avg_10_64x2_c: 1890.8
vvc_avg_10_64x2_avx2: 34.3
vvc_avg_10_64x4_c: 6267.3
vvc_avg_10_64x4_avx2: 42.6
vvc_avg_10_64x8_c: 12778.1
vvc_avg_10_64x8_avx2: 67.8
vvc_avg_10_64x16_c: 22304.3
vvc_avg_10_64x16_avx2: 116.8
vvc_avg_10_64x32_c: 30777.1
vvc_avg_10_64x32_avx2: 201.1
vvc_avg_10_64x64_c: 60169.1
vvc_avg_10_64x64_avx2: 1454.3
vvc_avg_10_64x128_c: 124392.8
vvc_avg_10_64x128_avx2: 3648.6
vvc_avg_10_128x2_c: 3650.1
vvc_avg_10_128x2_avx2: 41.1
vvc_avg_10_128x4_c: 22887.8
vvc_avg_10_128x4_avx2: 64.1
vvc_avg_10_128x8_c: 14622.6
vvc_avg_10_128x8_avx2: 111.6
vvc_avg_10_128x16_c: 62207.6
vvc_avg_10_128x16_avx2: 186.3
vvc_avg_10_128x32_c: 59761.3
vvc_avg_10_128x32_avx2: 374.6
vvc_avg_10_128x64_c: 117504.3
vvc_avg_10_128x64_avx2: 2684.6
vvc_avg_10_128x128_c: 236767.6
vvc_avg_10_128x128_avx2: 15278.1
vvc_avg_12_2x2_c: 78.6
vvc_avg_12_2x2_avx2: 26.1
vvc_avg_12_2x4_c: 254.1
vvc_avg_12_2x4_avx2: 30.6
vvc_avg_12_2x8_c: 261.8
vvc_avg_12_2x8_avx2: 39.1
vvc_avg_12_2x16_c: 527.6
vvc_avg_12_2x16_avx2: 57.3
vvc_avg_12_2x32_c: 1089.1
vvc_avg_12_2x32_avx2: 93.8
vvc_avg_12_2x64_c: 2337.6
vvc_avg_12_2x64_avx2: 707.1
vvc_avg_12_2x128_c: 4582.1
vvc_avg_12_2x128_avx2: 1414.6
vvc_avg_12_4x2_c: 129.6
vvc_avg_12_4x2_avx2: 26.8
vvc_avg_12_4x4_c: 427.3
vvc_avg_12_4x4_avx2: 30.6
vvc_avg_12_4x8_c: 529.6
vvc_avg_12_4x8_avx2: 36.6
vvc_avg_12_4x16_c: 1022.1
vvc_avg_12_4x16_avx2: 57.3
vvc_avg_12_4x32_c: 1987.6
vvc_avg_12_4x32_avx2: 84.3
vvc_avg_12_4x64_c: 4147.6
vvc_avg_12_4x64_avx2: 706.3
vvc_avg_12_4x128_c: 8469.3
vvc_avg_12_4x128_avx2: 1448.3
vvc_avg_12_8x2_c: 253.6
vvc_avg_12_8x2_avx2: 27.6
vvc_avg_12_8x4_c: 836.3
vvc_avg_12_8x4_avx2: 32.1
vvc_avg_12_8x8_c: 1074.6
vvc_avg_12_8x8_avx2: 45.1
vvc_avg_12_8x16_c: 3616.8
vvc_avg_12_8x16_avx2: 71.6
vvc_avg_12_8x32_c: 3823.6
vvc_avg_12_8x32_avx2: 140.1
vvc_avg_12_8x64_c: 7764.8
vvc_avg_12_8x64_avx2: 656.1
vvc_avg_12_8x128_c: 15896.1
vvc_avg_12_8x128_avx2: 1232.8
vvc_avg_12_16x2_c: 462.1
vvc_avg_12_16x2_avx2: 26.8
vvc_avg_12_16x4_c: 1732.1
vvc_avg_12_16x4_avx2: 29.1
vvc_avg_12_16x8_c: 2097.6
vvc_avg_12_16x8_avx2: 62.6
vvc_avg_12_16x16_c: 6753.1
vvc_avg_12_16x16_avx2: 47.8
vvc_avg_12_16x32_c: 7373.1
vvc_avg_12_16x32_avx2: 80.8
vvc_avg_12_16x64_c: 15046.3
vvc_avg_12_16x64_avx2: 621.1
vvc_avg_12_16x128_c: 52574.6
vvc_avg_12_16x128_avx2: 1417.1
vvc_avg_12_32x2_c: 1712.1
vvc_avg_12_32x2_avx2: 29.8
vvc_avg_12_32x4_c: 2036.8
vvc_avg_12_32x4_avx2: 37.6
vvc_avg_12_32x8_c: 4017.6
vvc_avg_12_32x8_avx2: 44.1
vvc_avg_12_32x16_c: 8018.6
vvc_avg_12_32x16_avx2: 70.8
vvc_avg_12_32x32_c: 15637.6
vvc_avg_12_32x32_avx2: 124.3
vvc_avg_12_32x64_c: 31143.3
vvc_avg_12_32x64_avx2: 830.3
vvc_avg_12_32x128_c: 75706.8
vvc_avg_12_32x128_avx2: 1604.8
vvc_avg_12_64x2_c: 3230.3
vvc_avg_12_64x2_avx2: 33.6
vvc_avg_12_64x4_c: 4139.6
vvc_avg_12_64x4_avx2: 45.1
vvc_avg_12_64x8_c: 8201.6
vvc_avg_12_64x8_avx2: 67.1
vvc_avg_12_64x16_c: 25632.3
vvc_avg_12_64x16_avx2: 110.3
vvc_avg_12_64x32_c: 30744.3
vvc_avg_12_64x32_avx2: 200.3
vvc_avg_12_64x64_c: 105554.8
vvc_avg_12_64x64_avx2: 1325.6
vvc_avg_12_64x128_c: 235254.3
vvc_avg_12_64x128_avx2: 3132.6
vvc_avg_12_128x2_c: 6194.3
vvc_avg_12_128x2_avx2: 55.1
vvc_avg_12_128x4_c: 7583.8
vvc_avg_12_128x4_avx2: 79.3
vvc_avg_12_128x8_c: 14635.6
vvc_avg_12_128x8_avx2: 104.3
vvc_avg_12_128x16_c: 29270.8
vvc_avg_12_128x16_avx2: 194.3
vvc_avg_12_128x32_c: 60113.6
vvc_avg_12_128x32_avx2: 346.3
vvc_avg_12_128x64_c: 197030.3
vvc_avg_12_128x64_avx2: 2779.6
vvc_avg_12_128x128_c: 432809.6
vvc_avg_12_128x128_avx2: 5513.3
vvc_w_avg_8_2x2_c: 84.3
vvc_w_avg_8_2x2_avx2: 42.6
vvc_w_avg_8_2x4_c: 156.3
vvc_w_avg_8_2x4_avx2: 58.8
vvc_w_avg_8_2x8_c: 310.6
vvc_w_avg_8_2x8_avx2: 73.1
vvc_w_avg_8_2x16_c: 942.1
vvc_w_avg_8_2x16_avx2: 113.3
vvc_w_avg_8_2x32_c: 1098.8
vvc_w_avg_8_2x32_avx2: 202.6
vvc_w_avg_8_2x64_c: 2414.3
vvc_w_avg_8_2x64_avx2: 467.6
vvc_w_avg_8_2x128_c: 4763.8
vvc_w_avg_8_2x128_avx2: 1333.1
vvc_w_avg_8_4x2_c: 140.1
vvc_w_avg_8_4x2_avx2: 49.8
vvc_w_avg_8_4x4_c: 276.3
vvc_w_avg_8_4x4_avx2: 58.1
vvc_w_avg_8_4x8_c: 524.3
vvc_w_avg_8_4x8_avx2: 72.3
vvc_w_avg_8_4x16_c: 1108.1
vvc_w_avg_8_4x16_avx2: 111.8
vvc_w_avg_8_4x32_c: 2149.8
vvc_w_avg_8_4x32_avx2: 199.6
vvc_w_avg_8_4x64_c: 12288.1
vvc_w_avg_8_4x64_avx2: 509.3
vvc_w_avg_8_4x128_c: 8398.6
vvc_w_avg_8_4x128_avx2: 1319.6
vvc_w_avg_8_8x2_c: 271.1
vvc_w_avg_8_8x2_avx2: 44.1
vvc_w_avg_8_8x4_c: 503.3
vvc_w_avg_8_8x4_avx2: 61.8
vvc_w_avg_8_8x8_c: 1031.1
vvc_w_avg_8_8x8_avx2: 93.8
vvc_w_avg_8_8x16_c: 2009.8
vvc_w_avg_8_8x16_avx2: 163.1
vvc_w_avg_8_8x32_c: 4161.3
vvc_w_avg_8_8x32_avx2: 292.1
vvc_w_avg_8_8x64_c: 7940.6
vvc_w_avg_8_8x64_avx2: 592.1
vvc_w_avg_8_8x128_c: 16802.3
vvc_w_avg_8_8x128_avx2: 1287.6
vvc_w_avg_8_16x2_c: 762.6
vvc_w_avg_8_16x2_avx2: 53.6
vvc_w_avg_8_16x4_c: 1486.3
vvc_w_avg_8_16x4_avx2: 67.1
vvc_w_avg_8_16x8_c: 1907.8
vvc_w_avg_8_16x8_avx2: 96.8
vvc_w_avg_8_16x16_c: 3883.6
vvc_w_avg_8_16x16_avx2: 151.3
vvc_w_avg_8_16x32_c: 7974.8
vvc_w_avg_8_16x32_avx2: 285.8
vvc_w_avg_8_16x64_c: 25160.6
vvc_w_avg_8_16x64_avx2: 589.8
vvc_w_avg_8_16x128_c: 58328.1
vvc_w_avg_8_16x128_avx2: 1169.8
vvc_w_avg_8_32x2_c: 1009.1
vvc_w_avg_8_32x2_avx2: 65.6
vvc_w_avg_8_32x4_c: 2091.1
vvc_w_avg_8_32x4_avx2: 96.8
vvc_w_avg_8_32x8_c: 3997.8
vvc_w_avg_8_32x8_avx2: 156.3
vvc_w_avg_8_32x16_c: 8216.8
vvc_w_avg_8_32x16_avx2: 269.6
vvc_w_avg_8_32x32_c: 21746.1
vvc_w_avg_8_32x32_avx2: 635.3
vvc_w_avg_8_32x64_c: 31564.8
vvc_w_avg_8_32x64_avx2: 1010.6
vvc_w_avg_8_32x128_c: 114373.3
vvc_w_avg_8_32x128_avx2: 2013.6
vvc_w_avg_8_64x2_c: 2067.3
vvc_w_avg_8_64x2_avx2: 97.6
vvc_w_avg_8_64x4_c: 3901.1
vvc_w_avg_8_64x4_avx2: 154.8
vvc_w_avg_8_64x8_c: 7911.6
vvc_w_avg_8_64x8_avx2: 268.8
vvc_w_avg_8_64x16_c: 16508.8
vvc_w_avg_8_64x16_avx2: 501.8
vvc_w_avg_8_64x32_c: 38770.3
vvc_w_avg_8_64x32_avx2: 1287.6
vvc_w_avg_8_64x64_c: 110350.6
vvc_w_avg_8_64x64_avx2: 1890.8
vvc_w_avg_8_64x128_c: 141354.6
vvc_w_avg_8_64x128_avx2: 3839.6
vvc_w_avg_8_128x2_c: 7012.1
vvc_w_avg_8_128x2_avx2: 159.3
vvc_w_avg_8_128x4_c: 8146.8
vvc_w_avg_8_128x4_avx2: 272.6
vvc_w_avg_8_128x8_c: 24596.8
vvc_w_avg_8_128x8_avx2: 501.1
vvc_w_avg_8_128x16_c: 35918.1
vvc_w_avg_8_128x16_avx2: 948.8
vvc_w_avg_8_128x32_c: 68799.6
vvc_w_avg_8_128x32_avx2: 1963.1
vvc_w_avg_8_128x64_c: 133862.1
vvc_w_avg_8_128x64_avx2: 3833.6
vvc_w_avg_8_128x128_c: 348427.8
vvc_w_avg_8_128x128_avx2: 7682.8
vvc_w_avg_10_2x2_c: 118.6
vvc_w_avg_10_2x2_avx2: 73.1
vvc_w_avg_10_2x4_c: 189.1
vvc_w_avg_10_2x4_avx2: 89.3
vvc_w_avg_10_2x8_c: 382.8
vvc_w_avg_10_2x8_avx2: 179.8
vvc_w_avg_10_2x16_c: 658.3
vvc_w_avg_10_2x16_avx2: 185.1
vvc_w_avg_10_2x32_c: 1409.3
vvc_w_avg_10_2x32_avx2: 290.8
vvc_w_avg_10_2x64_c: 2906.8
vvc_w_avg_10_2x64_avx2: 793.1
vvc_w_avg_10_2x128_c: 6292.6
vvc_w_avg_10_2x128_avx2: 1696.8
vvc_w_avg_10_4x2_c: 178.8
vvc_w_avg_10_4x2_avx2: 80.1
vvc_w_avg_10_4x4_c: 581.6
vvc_w_avg_10_4x4_avx2: 97.6
vvc_w_avg_10_4x8_c: 693.3
vvc_w_avg_10_4x8_avx2: 128.1
vvc_w_avg_10_4x16_c: 1436.6
vvc_w_avg_10_4x16_avx2: 179.8
vvc_w_avg_10_4x32_c: 2409.1
vvc_w_avg_10_4x32_avx2: 292.3
vvc_w_avg_10_4x64_c: 4925.3
vvc_w_avg_10_4x64_avx2: 746.1
vvc_w_avg_10_4x128_c: 10664.6
vvc_w_avg_10_4x128_avx2: 1647.6
vvc_w_avg_10_8x2_c: 359.3
vvc_w_avg_10_8x2_avx2: 80.1
vvc_w_avg_10_8x4_c: 925.6
vvc_w_avg_10_8x4_avx2: 97.6
vvc_w_avg_10_8x8_c: 1360.6
vvc_w_avg_10_8x8_avx2: 121.8
vvc_w_avg_10_8x16_c: 3490.3
vvc_w_avg_10_8x16_avx2: 203.3
vvc_w_avg_10_8x32_c: 5266.1
vvc_w_avg_10_8x32_avx2: 325.8
vvc_w_avg_10_8x64_c: 11127.1
vvc_w_avg_10_8x64_avx2: 747.8
vvc_w_avg_10_8x128_c: 31058.3
vvc_w_avg_10_8x128_avx2: 1424.6
vvc_w_avg_10_16x2_c: 624.8
vvc_w_avg_10_16x2_avx2: 84.6
vvc_w_avg_10_16x4_c: 1389.6
vvc_w_avg_10_16x4_avx2: 109.1
vvc_w_avg_10_16x8_c: 2688.3
vvc_w_avg_10_16x8_avx2: 137.1
vvc_w_avg_10_16x16_c: 5387.1
vvc_w_avg_10_16x16_avx2: 224.6
vvc_w_avg_10_16x32_c: 10776.3
vvc_w_avg_10_16x32_avx2: 312.1
vvc_w_avg_10_16x64_c: 18069.1
vvc_w_avg_10_16x64_avx2: 858.6
vvc_w_avg_10_16x128_c: 43460.3
vvc_w_avg_10_16x128_avx2: 1411.6
vvc_w_avg_10_32x2_c: 1232.8
vvc_w_avg_10_32x2_avx2: 99.1
vvc_w_avg_10_32x4_c: 4017.6
vvc_w_avg_10_32x4_avx2: 134.1
vvc_w_avg_10_32x8_c: 9306.3
vvc_w_avg_10_32x8_avx2: 208.1
vvc_w_avg_10_32x16_c: 8424.6
vvc_w_avg_10_32x16_avx2: 349.3
vvc_w_avg_10_32x32_c: 20787.8
vvc_w_avg_10_32x32_avx2: 655.3
vvc_w_avg_10_32x64_c: 40972.1
vvc_w_avg_10_32x64_avx2: 904.8
vvc_w_avg_10_32x128_c: 85670.3
vvc_w_avg_10_32x128_avx2: 1751.6
vvc_w_avg_10_64x2_c: 2454.1
vvc_w_avg_10_64x2_avx2: 132.6
vvc_w_avg_10_64x4_c: 5012.6
vvc_w_avg_10_64x4_avx2: 215.6
vvc_w_avg_10_64x8_c: 10811.3
vvc_w_avg_10_64x8_avx2: 361.1
vvc_w_avg_10_64x16_c: 33349.1
vvc_w_avg_10_64x16_avx2: 904.1
vvc_w_avg_10_64x32_c: 41892.3
vvc_w_avg_10_64x32_avx2: 1220.6
vvc_w_avg_10_64x64_c: 66983.3
vvc_w_avg_10_64x64_avx2: 2622.1
vvc_w_avg_10_64x128_c: 246508.8
vvc_w_avg_10_64x128_avx2: 3316.8
vvc_w_avg_10_128x2_c: 7791.6
vvc_w_avg_10_128x2_avx2: 198.8
vvc_w_avg_10_128x4_c: 10534.3
vvc_w_avg_10_128x4_avx2: 337.3
vvc_w_avg_10_128x8_c: 21142.3
vvc_w_avg_10_128x8_avx2: 614.8
vvc_w_avg_10_128x16_c: 40968.6
vvc_w_avg_10_128x16_avx2: 1160.6
vvc_w_avg_10_128x32_c: 113043.3
vvc_w_avg_10_128x32_avx2: 1644.6
vvc_w_avg_10_128x64_c: 230658.3
vvc_w_avg_10_128x64_avx2: 5065.3
vvc_w_avg_10_128x128_c: 335236.3
vvc_w_avg_10_128x128_avx2: 6450.3
vvc_w_avg_12_2x2_c: 185.3
vvc_w_avg_12_2x2_avx2: 43.6
vvc_w_avg_12_2x4_c: 340.3
vvc_w_avg_12_2x4_avx2: 55.8
vvc_w_avg_12_2x8_c: 632.3
vvc_w_avg_12_2x8_avx2: 70.1
vvc_w_avg_12_2x16_c: 728.3
vvc_w_avg_12_2x16_avx2: 108.1
vvc_w_avg_12_2x32_c: 1392.6
vvc_w_avg_12_2x32_avx2: 176.8
vvc_w_avg_12_2x64_c: 2618.3
vvc_w_avg_12_2x64_avx2: 757.3
vvc_w_avg_12_2x128_c: 6408.8
vvc_w_avg_12_2x128_avx2: 1435.1
vvc_w_avg_12_4x2_c: 349.3
vvc_w_avg_12_4x2_avx2: 44.3
vvc_w_avg_12_4x4_c: 607.1
vvc_w_avg_12_4x4_avx2: 52.6
vvc_w_avg_12_4x8_c: 1134.8
vvc_w_avg_12_4x8_avx2: 70.1
vvc_w_avg_12_4x16_c: 1378.1
vvc_w_avg_12_4x16_avx2: 115.3
vvc_w_avg_12_4x32_c: 2599.3
vvc_w_avg_12_4x32_avx2: 174.3
vvc_w_avg_12_4x64_c: 4474.8
vvc_w_avg_12_4x64_avx2: 656.1
vvc_w_avg_12_4x128_c: 11319.6
vvc_w_avg_12_4x128_avx2: 1373.1
vvc_w_avg_12_8x2_c: 595.8
vvc_w_avg_12_8x2_avx2: 44.3
vvc_w_avg_12_8x4_c: 1164.3
vvc_w_avg_12_8x4_avx2: 56.6
vvc_w_avg_12_8x8_c: 2019.6
vvc_w_avg_12_8x8_avx2: 80.1
vvc_w_avg_12_8x16_c: 4071.6
vvc_w_avg_12_8x16_avx2: 139.3
vvc_w_avg_12_8x32_c: 4485.1
vvc_w_avg_12_8x32_avx2: 250.6
vvc_w_avg_12_8x64_c: 8404.8
vvc_w_avg_12_8x64_avx2: 735.8
vvc_w_avg_12_8x128_c: 35679.8
vvc_w_avg_12_8x128_avx2: 1252.6
vvc_w_avg_12_16x2_c: 1114.8
vvc_w_avg_12_16x2_avx2: 46.6
vvc_w_avg_12_16x4_c: 2240.1
vvc_w_avg_12_16x4_avx2: 62.6
vvc_w_avg_12_16x8_c: 13174.6
vvc_w_avg_12_16x8_avx2: 88.6
vvc_w_avg_12_16x16_c: 5334.6
vvc_w_avg_12_16x16_avx2: 144.3
vvc_w_avg_12_16x32_c: 8378.1
vvc_w_avg_12_16x32_avx2: 234.6
vvc_w_avg_12_16x64_c: 21300.8
vvc_w_avg_12_16x64_avx2: 761.8
vvc_w_avg_12_16x128_c: 32786.8
vvc_w_avg_12_16x128_avx2: 1432.8
vvc_w_avg_12_32x2_c: 2154.3
vvc_w_avg_12_32x2_avx2: 61.1
vvc_w_avg_12_32x4_c: 4299.8
vvc_w_avg_12_32x4_avx2: 83.1
vvc_w_avg_12_32x8_c: 7964.8
vvc_w_avg_12_32x8_avx2: 132.6
vvc_w_avg_12_32x16_c: 13321.6
vvc_w_avg_12_32x16_avx2: 234.6
vvc_w_avg_12_32x32_c: 21149.3
vvc_w_avg_12_32x32_avx2: 433.3
vvc_w_avg_12_32x64_c: 43666.6
vvc_w_avg_12_32x64_avx2: 876.6
vvc_w_avg_12_32x128_c: 83189.8
vvc_w_avg_12_32x128_avx2: 1756.6
vvc_w_avg_12_64x2_c: 3829.8
vvc_w_avg_12_64x2_avx2: 83.1
vvc_w_avg_12_64x4_c: 8588.1
vvc_w_avg_12_64x4_avx2: 127.1
vvc_w_avg_12_64x8_c: 17027.6
vvc_w_avg_12_64x8_avx2: 310.6
vvc_w_avg_12_64x16_c: 29797.8
vvc_w_avg_12_64x16_avx2: 415.6
vvc_w_avg_12_64x32_c: 43854.3
vvc_w_avg_12_64x32_avx2: 773.3
vvc_w_avg_12_64x64_c: 137767.3
vvc_w_avg_12_64x64_avx2: 1608.6
vvc_w_avg_12_64x128_c: 316428.3
vvc_w_avg_12_64x128_avx2: 3249.8
vvc_w_avg_12_128x2_c: 8824.6
vvc_w_avg_12_128x2_avx2: 130.3
vvc_w_avg_12_128x4_c: 17173.6
vvc_w_avg_12_128x4_avx2: 219.3
vvc_w_avg_12_128x8_c: 21997.8
vvc_w_avg_12_128x8_avx2: 397.3
vvc_w_avg_12_128x16_c: 43553.8
vvc_w_avg_12_128x16_avx2: 790.1
vvc_w_avg_12_128x32_c: 89792.1
vvc_w_avg_12_128x32_avx2: 1497.6
vvc_w_avg_12_128x64_c: 226573.3
vvc_w_avg_12_128x64_avx2: 3153.1
vvc_w_avg_12_128x128_c: 332090.1
vvc_w_avg_12_128x128_avx2: 6499.6
Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
---
libavcodec/x86/vvc/Makefile | 3 +-
libavcodec/x86/vvc/vvc_mc.asm | 301 +++++++++++++++++++++++++++++++
libavcodec/x86/vvc/vvcdsp_init.c | 52 ++++++
3 files changed, 355 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/x86/vvc/vvc_mc.asm
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index b4acc22501..29765a6c48 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -2,5 +2,6 @@ clean::
$(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
-X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \
+X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_mc.o \
+ x86/h26x/h2656dsp.o \
x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvc_mc.asm b/libavcodec/x86/vvc/vvc_mc.asm
new file mode 100644
index 0000000000..d0a34d3952
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_mc.asm
@@ -0,0 +1,301 @@
+; /*
+; * Provide SIMD MC functions for VVC decoding
+; *
+; * Copyright © 2021, VideoLAN and dav1d authors
+; * Copyright © 2021, Two Orioles, LLC
+; * All rights reserved.
+; *
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 128
+
+SECTION_RODATA 32
+
+pw_0 times 2 dw 0
+pw_1 times 2 dw 1
+pw_4 times 2 dw 4
+pw_12 times 2 dw 12
+pw_256 times 2 dw 256
+
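+; AVG_JMP_TABLE name, bpc, isa, w1, w2, ...
+; builds a table of dword offsets, relative to the table base symbol, to the
+; .w2/.w4/.../.w128 entry points of the matching cglobal function; the dispatcher
+; below indexes it with tzcnt(width) and adds the base back to get an absolute
+; jump target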
+%macro AVG_JMP_TABLE 3-*
+ %xdefine %1_%2_%3_table (%%table - 2*%4)
+ %xdefine %%base %1_%2_%3_table
+ %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
+ %%table:
+ %rep %0 - 3
+ dd %%prefix %+ .w%4 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+AVG_JMP_TABLE avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE w_avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
+%macro AVG_W16_FN 3 ; bpc, op, count
+ %assign %%i 0
+ %rep %3
+ %define off %%i
+ AVG_LOAD_W16 0, off
+ %2
+ AVG_SAVE_W16 %1, 0, off
+
+
+ AVG_LOAD_W16 1, off
+ %2
+ AVG_SAVE_W16 %1, 1, off
+
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro AVG_FN 2 ; bpc, op
+ jmp wq
+
+.w2:
+ movd xm0, [src0q]
+ pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
+ movd xm1, [src1q]
+ pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W2 %1
+ AVG_LOOP_END .w2
+
+.w4:
+ movq xm0, [src0q]
+ pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1
+ movq xm1, [src1q]
+ pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W4 %1
+
+ AVG_LOOP_END .w4
+
+.w8:
+ vinserti128 m0, m0, [src0q], 0
+ vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
+ vinserti128 m1, m1, [src1q], 0
+ vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W8 %1
+
+ AVG_LOOP_END .w8
+
+.w16:
+ AVG_W16_FN %1, %2, 1
+
+ AVG_LOOP_END .w16
+
+.w32:
+ AVG_W16_FN %1, %2, 2
+
+ AVG_LOOP_END .w32
+
+.w64:
+ AVG_W16_FN %1, %2, 4
+
+ AVG_LOOP_END .w64
+
+.w128:
+ AVG_W16_FN %1, %2, 8
+
+ AVG_LOOP_END .w128
+
+.ret:
+ RET
+%endmacro
+
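+; AVG: dst = clip((src0 + src1 + (1 << (14 - bd))) >> (15 - bd))
+; m0/m1 hold int16 samples from src0/src1, m2 is set up by the caller to
+; 256 << (bd - 8) = 1 << bd so that pmulhrsw performs the rounded shift,
+; m3/m4 are the clip bounds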
+%macro AVG 0
+ paddsw m0, m1
+ pmulhrsw m0, m2
+ CLIPW m0, m3, m4
+%endmacro
+
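+; W_AVG: dst = clip((src0 * w0 + src1 * w1 + offset) >> shift)
+; m3 holds (w0, w1) word pairs so pmaddwd yields src0*w0 + src1*w1 per sample,
+; m4 = offset, xm2 = shift, m6/m7 are the clip bounds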
+%macro W_AVG 0
+ punpckhwd m5, m0, m1
+ pmaddwd m5, m3
+ paddd m5, m4
+ psrad m5, xm2
+
+ punpcklwd m0, m0, m1
+ pmaddwd m0, m3
+ paddd m0, m4
+ psrad m0, xm2
+
+ packssdw m0, m5
+ CLIPW m0, m6, m7
+%endmacro
+
+%macro AVG_LOAD_W16 2 ; line, offset
+ movu m0, [src0q + %1 * AVG_SRC_STRIDE + %2 * 32]
+ movu m1, [src1q + %1 * AVG_SRC_STRIDE + %2 * 32]
+%endmacro
+
+%macro AVG_SAVE_W2 1 ;bpc
+ %if %1 == 16
+ pextrd [dstq], xm0, 0
+ pextrd [dstq + strideq], xm0, 1
+ %else
+ packuswb m0, m0
+ pextrw [dstq], xm0, 0
+ pextrw [dstq + strideq], xm0, 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W4 1 ;bpc
+ %if %1 == 16
+ pextrq [dstq], xm0, 0
+ pextrq [dstq + strideq], xm0, 1
+ %else
+ packuswb m0, m0
+ pextrd [dstq], xm0, 0
+ pextrd [dstq + strideq], xm0, 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W8 1 ;bpc
+ %if %1 == 16
+ vextracti128 [dstq], m0, 0
+ vextracti128 [dstq + strideq], m0, 1
+ %else
+ packuswb m0, m0
+ vpermq m0, m0, 1000b
+ pextrq [dstq], xm0, 0
+ pextrq [dstq + strideq], xm0, 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W16 3 ; bpc, line, offset
+ %if %1 == 16
+ movu [dstq + %2 * strideq + %3 * 32], m0
+ %else
+ packuswb m0, m0
+ vpermq m0, m0, 1000b
+ vextracti128 [dstq + %2 * strideq + %3 * 16], m0, 0
+ %endif
+%endmacro
+
+%macro AVG_LOOP_END 1
+ sub hd, 2
+ je .ret
+
+ lea src0q, [src0q + 2 * AVG_SRC_STRIDE]
+ lea src1q, [src1q + 2 * AVG_SRC_STRIDE]
+ lea dstq, [dstq + 2 * strideq]
+ jmp %1
+%endmacro
+
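+; stride of the int16 intermediate source buffers, in bytes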
+%define AVG_SRC_STRIDE MAX_PB_SIZE*2
+
+;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
+%macro VVC_AVG_AVX2 1
+cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
+ movifnidn hd, hm
+
+ pxor m3, m3 ; pixel min
+ vpbroadcastw m4, bdm ; pixel max
+
+ movifnidn bdd, bdm
+ inc bdd
+ tzcnt bdd, bdd ; bit depth
+
+ sub bdd, 8
+ movd xm0, bdd
+ vpbroadcastd m1, [pw_4]
+ pminuw m0, m1
+ vpbroadcastd m2, [pw_256]
+ psllw m2, xm0 ; shift
+
+ lea r6, [avg_%1 %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ add wq, r6
+ AVG_FN %1, AVG
+%endmacro
+
+;void ff_vvc_w_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
+; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
+%macro VVC_W_AVG_AVX2 1
+cglobal vvc_w_avg_%1bpc, 4, 7, 8, dst, stride, src0, src1, w, h, t0, t1
+
+ movifnidn hd, hm
+
+ movifnidn t0d, r8m ; w1
+ shl t0d, 16
+ mov t0w, r7m ; w0
+ movd xm3, t0d
+ vpbroadcastd m3, xm3 ; w0, w1
+
+ pxor m6, m6 ;pixel min
+ vpbroadcastw m7, r11m ;pixel max
+
+ mov t1q, rcx ; save ecx
+ mov ecx, r11m
+ inc ecx ; bd
+ tzcnt ecx, ecx
+ sub ecx, 8
+ mov t0d, r9m ; o0
+ add t0d, r10m ; o1
+ shl t0d, cl
+ inc t0d ;((o0 + o1) << (BIT_DEPTH - 8)) + 1
+
+ neg ecx
+ add ecx, 4 ; 12 - bd
+ cmovl ecx, [pw_0]
+ add ecx, 3
+ add ecx, r6m
+ movd xm2, ecx ; shift
+
+ dec ecx
+ shl t0d, cl
+ movd xm4, t0d
+ vpbroadcastd m4, xm4 ; offset
+ mov rcx, t1q ; restore ecx
+
+ lea r6, [w_avg_%1 %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ add wq, r6
+ AVG_FN %1, W_AVG
+%endmacro
+
+%if ARCH_X86_64
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+VVC_AVG_AVX2 16
+
+VVC_AVG_AVX2 8
+
+VVC_W_AVG_AVX2 16
+
+VVC_W_AVG_AVX2 8
+%endif
+
+%endif
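
For reference, below is a minimal C model of what the AVG and W_AVG paths above compute, reconstructed from the shift/offset setup in VVC_AVG_AVX2 and VVC_W_AVG_AVX2. It is only an illustrative sketch (the function names are made up, the destination is simplified to 16-bit pixels, and dst_stride is in elements rather than bytes), not the FFmpeg C reference itself:

#include <stdint.h>
#include <stddef.h>

#define MAX_PB_SIZE 128   /* the int16 intermediate buffers use this fixed stride */

static int clip(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

/* avg: plain bi-prediction average */
static void avg_ref(uint16_t *dst, ptrdiff_t dst_stride,
                    const int16_t *src0, const int16_t *src1,
                    int w, int h, int bd)
{
    const int shift   = 15 - bd;                  /* 7/5/3 for 8/10/12 bit */
    const int offset  = 1 << (shift - 1);
    const int pix_max = (1 << bd) - 1;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = clip((src0[x] + src1[x] + offset) >> shift, 0, pix_max);
        dst  += dst_stride;
        src0 += MAX_PB_SIZE;
        src1 += MAX_PB_SIZE;
    }
}

/* w_avg: explicit weighted bi-prediction */
static void w_avg_ref(uint16_t *dst, ptrdiff_t dst_stride,
                      const int16_t *src0, const int16_t *src1,
                      int w, int h, int bd,
                      int denom, int w0, int w1, int o0, int o1)
{
    const int shift   = denom + (15 - bd > 3 ? 15 - bd : 3);
    const int offset  = ((o0 + o1) * (1 << (bd - 8)) + 1) * (1 << (shift - 1));
    const int pix_max = (1 << bd) - 1;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = clip((src0[x] * w0 + src1[x] * w1 + offset) >> shift, 0, pix_max);
        dst  += dst_stride;
        src0 += MAX_PB_SIZE;
        src1 += MAX_PB_SIZE;
    }
}
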
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index c197cdb4cc..909ef9f56b 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -169,6 +169,42 @@ FW_PUT_16BPC_AVX2(12);
MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \
MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
+
+#define AVG_BPC_FUNC(bpc, opt) \
+void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
+void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
+ intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
+
+#define AVG_FUNCS(bpc, bd, opt) \
+static void bf(avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height) \
+{ \
+ BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
+} \
+static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1) \
+{ \
+ BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \
+ denom, w0, w1, o0, o1, (1 << bd) - 1); \
+}
+
+AVG_BPC_FUNC(8, avx2)
+AVG_BPC_FUNC(16, avx2)
+
+AVG_FUNCS(8, 8, avx2)
+AVG_FUNCS(16, 10, avx2)
+AVG_FUNCS(16, 12, avx2)
+
+#define AVG_INIT(bd, opt) do { \
+ c->inter.avg = bf(avg, bd, opt); \
+ c->inter.w_avg = bf(w_avg, bd, opt); \
+} while (0)
+
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
{
const int cpu_flags = av_get_cpu_flags();
@@ -198,5 +234,21 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
MC_LINKS_16BPC_AVX2(12);
}
}
+
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ switch (bd) {
+ case 8:
+ AVG_INIT(8, avx2);
+ break;
+ case 10:
+ AVG_INIT(10, avx2);
+ break;
+ case 12:
+ AVG_INIT(12, avx2);
+ break;
+ default:
+ break;
+ }
+ }
}
}
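
As a concrete example of the bpc/bit-depth split above, AVG_FUNCS(16, 10, avx2) expands (up to formatting) to wrappers like the following, with w_avg handled analogously; the 10-bit and 12-bit builds share the 16bpc assembly and differ only in the pixel_max argument:

static void avg_10_avx2(uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *src0, const int16_t *src1, int width, int height)
{
    ff_vvc_avg_16bpc_avx2(dst, dst_stride, src0, src1, width, height, (1 << 10) - 1);
}
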
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* [FFmpeg-devel] [PATCH v3 8/8] tests/checkasm/vvc_mc: add check_avg
[not found] <20240122152527.601122-1-toqsxw@outlook.com>
` (5 preceding siblings ...)
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations toqsxw
@ 2024-01-22 15:25 ` toqsxw
6 siblings, 0 replies; 10+ messages in thread
From: toqsxw @ 2024-01-22 15:25 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Wu Jianhua
From: Wu Jianhua <toqsxw@outlook.com>
Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
---
tests/checkasm/vvc_mc.c | 64 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 711280deec..8adb00573f 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -35,6 +35,7 @@
static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0xffffffff };
static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
#define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
#define EXTRA_BEFORE 3
#define EXTRA_AFTER 4
@@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void)
report("put_uni_chroma");
}
+#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE)
+#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2)
+
+static void check_avg(void)
+{
+ LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]);
+ LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]);
+ LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]);
+ LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]);
+ LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]);
+ VVCDSPContext c;
+
+ for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+ randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * sizeof(int16_t));
+ randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * sizeof(int16_t));
+ ff_vvc_dsp_init(&c, bit_depth);
+ for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) {
+ for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) {
+ {
+ declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *src0, const int16_t *src1, int width, int height);
+ if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, h)) {
+ memset(dst0, 0, AVG_DST_BUF_SIZE);
+ memset(dst1, 0, AVG_DST_BUF_SIZE);
+ call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h);
+ call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h);
+ if (memcmp(dst0, dst1, DST_BUF_SIZE))
+ fail();
+ if (w == h)
+ bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h);
+ }
+ }
+ {
+ declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *src0, const int16_t *src1, int width, int height,
+ int denom, int w0, int w1, int o0, int o1);
+ {
+ const int denom = rnd() % 8;
+ const int w0 = rnd() % 256 - 128;
+ const int w1 = rnd() % 256 - 128;
+ const int o0 = rnd() % 256 - 128;
+ const int o1 = rnd() % 256 - 128;
+ if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", bit_depth, w, h)) {
+ memset(dst0, 0, AVG_DST_BUF_SIZE);
+ memset(dst1, 0, AVG_DST_BUF_SIZE);
+
+ call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1);
+ call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h, denom, w0, w1, o0, o1);
+ if (memcmp(dst0, dst1, DST_BUF_SIZE))
+ fail();
+ if (w == h)
+ bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1);
+ }
+ }
+ }
+ }
+ }
+ }
+ report("avg");
+}
+
void checkasm_check_vvc_mc(void)
{
check_put_vvc_luma();
check_put_vvc_luma_uni();
check_put_vvc_chroma();
check_put_vvc_chroma_uni();
+ check_avg();
}
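
randomize_avg_src() is not introduced by this diff; presumably it fills the reference and test source buffers with identical random int16 samples so both implementations see the same input. A hypothetical sketch of such a helper, assuming the usual checkasm utilities (rnd(), AV_WN16) are available; the value masking is only an illustrative assumption:

static void randomize_avg_src(uint8_t *buf0, uint8_t *buf1, int size)
{
    for (int i = 0; i < size; i += 2) {
        const uint16_t r = rnd() & 0x3fff; /* keep samples in the intermediate range (assumption) */
        AV_WN16(buf0 + i, r);
        AV_WN16(buf1 + i, r);
    }
}
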
--
2.34.1
* [FFmpeg-devel] Re: [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
2024-01-22 22:46 ` Michael Niedermayer
@ 2024-01-23 18:23 ` Wu Jianhua
0 siblings, 0 replies; 10+ messages in thread
From: Wu Jianhua @ 2024-01-23 18:23 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> on behalf of Michael Niedermayer <michael@niedermayer.cc>
> Sent: 22 January 2024 14:46
> To: FFmpeg development discussions and patches
> Subject: Re: [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
>
> On Tue, Jan 23, 2024 at 01:46:27AM +0800, toqsxw@outlook.com wrote:
>> From: Wu Jianhua <toqsxw@outlook.com>
>>
>> The avg/avg_w is based on dav1d.
>> See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm
>>
>>
>> Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
>> ---
>> libavcodec/x86/vvc/Makefile | 3 +-
>> libavcodec/x86/vvc/vvc_mc.asm | 301 +++++++++++++++++++++++++++++++
>> libavcodec/x86/vvc/vvcdsp_init.c | 52 ++++++
>> 3 files changed, 355 insertions(+), 1 deletion(-)
>> create mode 100644 libavcodec/x86/vvc/vvc_mc.asm
>
> this seems to break x86-32
>
> src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w2' undefined
> src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
> src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w4' undefined
> src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
> [...]
> src/ffbuild/common.mak:103: recipe for target 'libavcodec/x86/vvc/vvc_mc.o' failed
> make: *** [libavcodec/x86/vvc/vvc_mc.o] Error 1
> make: *** Waiting for unfinished jobs....
>
It will be fixed in v4. Thanks for testing this.
* Re: [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
2024-01-22 17:46 ` [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations toqsxw
@ 2024-01-22 22:46 ` Michael Niedermayer
2024-01-23 18:23 ` [FFmpeg-devel] Re: " Wu Jianhua
0 siblings, 1 reply; 10+ messages in thread
From: Michael Niedermayer @ 2024-01-22 22:46 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Tue, Jan 23, 2024 at 01:46:27AM +0800, toqsxw@outlook.com wrote:
> From: Wu Jianhua <toqsxw@outlook.com>
>
> The avg/avg_w is based on dav1d.
> See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm
>
> [...]
>
> Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
> ---
> libavcodec/x86/vvc/Makefile | 3 +-
> libavcodec/x86/vvc/vvc_mc.asm | 301 +++++++++++++++++++++++++++++++
> libavcodec/x86/vvc/vvcdsp_init.c | 52 ++++++
> 3 files changed, 355 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/x86/vvc/vvc_mc.asm
this seems to break x86-32
src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w2' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w4' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w8' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w16' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w32' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w64' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:51: error: symbol `ff_vvc_avg_8bpc_avx2.w128' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:52: error: symbol `ff_vvc_avg_16bpc_avx2.w2' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:52: error: symbol `ff_vvc_avg_16bpc_avx2.w4' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:52: error: symbol `ff_vvc_avg_16bpc_avx2.w8' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:52: error: symbol `ff_vvc_avg_16bpc_avx2.w16' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:52: error: symbol `ff_vvc_avg_16bpc_avx2.w32' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:52: error: symbol `ff_vvc_avg_16bpc_avx2.w64' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:52: error: symbol `ff_vvc_avg_16bpc_avx2.w128' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:53: error: symbol `ff_vvc_w_avg_8bpc_avx2.w2' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:53: error: symbol `ff_vvc_w_avg_8bpc_avx2.w4' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:53: error: symbol `ff_vvc_w_avg_8bpc_avx2.w8' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:53: error: symbol `ff_vvc_w_avg_8bpc_avx2.w16' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:53: error: symbol `ff_vvc_w_avg_8bpc_avx2.w32' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:53: error: symbol `ff_vvc_w_avg_8bpc_avx2.w64' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:53: error: symbol `ff_vvc_w_avg_8bpc_avx2.w128' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:54: error: symbol `ff_vvc_w_avg_16bpc_avx2.w2' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:54: error: symbol `ff_vvc_w_avg_16bpc_avx2.w4' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:54: error: symbol `ff_vvc_w_avg_16bpc_avx2.w8' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:54: error: symbol `ff_vvc_w_avg_16bpc_avx2.w16' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:54: error: symbol `ff_vvc_w_avg_16bpc_avx2.w32' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:54: error: symbol `ff_vvc_w_avg_16bpc_avx2.w64' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/libavcodec/x86/vvc/vvc_mc.asm:54: error: symbol `ff_vvc_w_avg_16bpc_avx2.w128' undefined
src/libavcodec/x86/vvc/vvc_mc.asm:48: ... from macro `AVG_JMP_TABLE' defined here
src/ffbuild/common.mak:103: recipe for target 'libavcodec/x86/vvc/vvc_mc.o' failed
make: *** [libavcodec/x86/vvc/vvc_mc.o] Error 1
make: *** Waiting for unfinished jobs....
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Dictatorship: All citizens are under surveillance, all their steps and
actions recorded, for the politicians to enforce control.
Democracy: All politicians are under surveillance, all their steps and
actions recorded, for the citizens to enforce control.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations
[not found] <20240122174628.1206503-1-toqsxw@outlook.com>
@ 2024-01-22 17:46 ` toqsxw
2024-01-22 22:46 ` Michael Niedermayer
0 siblings, 1 reply; 10+ messages in thread
From: toqsxw @ 2024-01-22 17:46 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Wu Jianhua
From: Wu Jianhua <toqsxw@outlook.com>
The avg/avg_w functions are based on dav1d.
See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm
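For reference, a rough scalar model of the plain averaging path these kernels accelerate; the vvc_avg_*/vvc_w_avg_* numbers below appear to be per-call timings of the C reference against the new AVX2 code (presumably collected with checkasm --bench). The constants in this sketch (a final shift of 15 - bit_depth and the fixed MAX_PB_SIZE source stride of 128) are inferred from the asm further down, not quoted from the patch:

    #include "libavutil/common.h"   /* av_clip() */

    /* hypothetical scalar sketch, 8bpc store shown; src0/src1 hold the
     * 16-bit intermediate predictions with a fixed MAX_PB_SIZE stride */
    static void avg_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *src0, const int16_t *src1,
                           int width, int height, int bit_depth)
    {
        const int shift  = 15 - bit_depth;       /* assumed bi-pred rounding shift */
        const int offset = 1 << (shift - 1);
        const int max    = (1 << bit_depth) - 1;

        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                dst[x] = av_clip((src0[x] + src1[x] + offset) >> shift, 0, max);
            src0 += 128;                         /* MAX_PB_SIZE */
            src1 += 128;
            dst  += dst_stride;
        }
    }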
vvc_avg_8_2x2_c: 71.6
vvc_avg_8_2x2_avx2: 26.8
vvc_avg_8_2x4_c: 140.8
vvc_avg_8_2x4_avx2: 34.6
vvc_avg_8_2x8_c: 410.3
vvc_avg_8_2x8_avx2: 41.3
vvc_avg_8_2x16_c: 769.3
vvc_avg_8_2x16_avx2: 60.3
vvc_avg_8_2x32_c: 1669.6
vvc_avg_8_2x32_avx2: 105.1
vvc_avg_8_2x64_c: 1978.3
vvc_avg_8_2x64_avx2: 425.8
vvc_avg_8_2x128_c: 6536.8
vvc_avg_8_2x128_avx2: 1315.1
vvc_avg_8_4x2_c: 155.6
vvc_avg_8_4x2_avx2: 26.1
vvc_avg_8_4x4_c: 250.3
vvc_avg_8_4x4_avx2: 31.3
vvc_avg_8_4x8_c: 831.8
vvc_avg_8_4x8_avx2: 41.3
vvc_avg_8_4x16_c: 1461.1
vvc_avg_8_4x16_avx2: 57.1
vvc_avg_8_4x32_c: 2821.6
vvc_avg_8_4x32_avx2: 105.1
vvc_avg_8_4x64_c: 3615.8
vvc_avg_8_4x64_avx2: 412.6
vvc_avg_8_4x128_c: 11962.6
vvc_avg_8_4x128_avx2: 1274.3
vvc_avg_8_8x2_c: 215.8
vvc_avg_8_8x2_avx2: 29.1
vvc_avg_8_8x4_c: 430.6
vvc_avg_8_8x4_avx2: 37.6
vvc_avg_8_8x8_c: 1463.3
vvc_avg_8_8x8_avx2: 51.8
vvc_avg_8_8x16_c: 2630.1
vvc_avg_8_8x16_avx2: 97.6
vvc_avg_8_8x32_c: 5813.8
vvc_avg_8_8x32_avx2: 196.6
vvc_avg_8_8x64_c: 6687.3
vvc_avg_8_8x64_avx2: 487.8
vvc_avg_8_8x128_c: 13178.6
vvc_avg_8_8x128_avx2: 1290.6
vvc_avg_8_16x2_c: 443.8
vvc_avg_8_16x2_avx2: 28.3
vvc_avg_8_16x4_c: 1253.3
vvc_avg_8_16x4_avx2: 32.1
vvc_avg_8_16x8_c: 2236.3
vvc_avg_8_16x8_avx2: 44.3
vvc_avg_8_16x16_c: 5127.8
vvc_avg_8_16x16_avx2: 63.3
vvc_avg_8_16x32_c: 6573.3
vvc_avg_8_16x32_avx2: 223.6
vvc_avg_8_16x64_c: 30311.8
vvc_avg_8_16x64_avx2: 437.8
vvc_avg_8_16x128_c: 25693.3
vvc_avg_8_16x128_avx2: 1266.8
vvc_avg_8_32x2_c: 954.6
vvc_avg_8_32x2_avx2: 32.1
vvc_avg_8_32x4_c: 2359.6
vvc_avg_8_32x4_avx2: 39.6
vvc_avg_8_32x8_c: 5703.6
vvc_avg_8_32x8_avx2: 57.1
vvc_avg_8_32x16_c: 9967.6
vvc_avg_8_32x16_avx2: 107.1
vvc_avg_8_32x32_c: 21327.6
vvc_avg_8_32x32_avx2: 272.6
vvc_avg_8_32x64_c: 39240.8
vvc_avg_8_32x64_avx2: 529.6
vvc_avg_8_32x128_c: 52580.8
vvc_avg_8_32x128_avx2: 1338.8
vvc_avg_8_64x2_c: 1647.3
vvc_avg_8_64x2_avx2: 38.8
vvc_avg_8_64x4_c: 5130.1
vvc_avg_8_64x4_avx2: 58.8
vvc_avg_8_64x8_c: 6529.3
vvc_avg_8_64x8_avx2: 88.3
vvc_avg_8_64x16_c: 19913.6
vvc_avg_8_64x16_avx2: 162.3
vvc_avg_8_64x32_c: 39360.8
vvc_avg_8_64x32_avx2: 295.8
vvc_avg_8_64x64_c: 49658.3
vvc_avg_8_64x64_avx2: 784.1
vvc_avg_8_64x128_c: 108513.1
vvc_avg_8_64x128_avx2: 1977.1
vvc_avg_8_128x2_c: 3226.1
vvc_avg_8_128x2_avx2: 61.1
vvc_avg_8_128x4_c: 10280.3
vvc_avg_8_128x4_avx2: 94.6
vvc_avg_8_128x8_c: 18079.3
vvc_avg_8_128x8_avx2: 155.3
vvc_avg_8_128x16_c: 45121.8
vvc_avg_8_128x16_avx2: 285.3
vvc_avg_8_128x32_c: 48651.8
vvc_avg_8_128x32_avx2: 581.6
vvc_avg_8_128x64_c: 165078.6
vvc_avg_8_128x64_avx2: 1942.8
vvc_avg_8_128x128_c: 339103.1
vvc_avg_8_128x128_avx2: 4332.6
vvc_avg_10_2x2_c: 144.3
vvc_avg_10_2x2_avx2: 26.8
vvc_avg_10_2x4_c: 142.6
vvc_avg_10_2x4_avx2: 45.3
vvc_avg_10_2x8_c: 478.1
vvc_avg_10_2x8_avx2: 38.1
vvc_avg_10_2x16_c: 518.3
vvc_avg_10_2x16_avx2: 58.1
vvc_avg_10_2x32_c: 2059.8
vvc_avg_10_2x32_avx2: 93.1
vvc_avg_10_2x64_c: 2383.8
vvc_avg_10_2x64_avx2: 714.8
vvc_avg_10_2x128_c: 4498.3
vvc_avg_10_2x128_avx2: 1466.3
vvc_avg_10_4x2_c: 228.6
vvc_avg_10_4x2_avx2: 26.8
vvc_avg_10_4x4_c: 378.3
vvc_avg_10_4x4_avx2: 30.6
vvc_avg_10_4x8_c: 866.8
vvc_avg_10_4x8_avx2: 44.6
vvc_avg_10_4x16_c: 1018.1
vvc_avg_10_4x16_avx2: 58.1
vvc_avg_10_4x32_c: 3590.8
vvc_avg_10_4x32_avx2: 128.8
vvc_avg_10_4x64_c: 4200.8
vvc_avg_10_4x64_avx2: 663.6
vvc_avg_10_4x128_c: 8450.8
vvc_avg_10_4x128_avx2: 1531.8
vvc_avg_10_8x2_c: 369.3
vvc_avg_10_8x2_avx2: 28.3
vvc_avg_10_8x4_c: 513.8
vvc_avg_10_8x4_avx2: 32.1
vvc_avg_10_8x8_c: 1720.3
vvc_avg_10_8x8_avx2: 49.1
vvc_avg_10_8x16_c: 1894.8
vvc_avg_10_8x16_avx2: 71.6
vvc_avg_10_8x32_c: 3931.3
vvc_avg_10_8x32_avx2: 148.1
vvc_avg_10_8x64_c: 7964.3
vvc_avg_10_8x64_avx2: 613.1
vvc_avg_10_8x128_c: 15540.1
vvc_avg_10_8x128_avx2: 1585.1
vvc_avg_10_16x2_c: 877.3
vvc_avg_10_16x2_avx2: 27.6
vvc_avg_10_16x4_c: 955.8
vvc_avg_10_16x4_avx2: 29.8
vvc_avg_10_16x8_c: 3419.6
vvc_avg_10_16x8_avx2: 62.6
vvc_avg_10_16x16_c: 3826.8
vvc_avg_10_16x16_avx2: 54.3
vvc_avg_10_16x32_c: 7655.3
vvc_avg_10_16x32_avx2: 86.3
vvc_avg_10_16x64_c: 30011.1
vvc_avg_10_16x64_avx2: 692.6
vvc_avg_10_16x128_c: 47894.8
vvc_avg_10_16x128_avx2: 1580.3
vvc_avg_10_32x2_c: 944.3
vvc_avg_10_32x2_avx2: 29.8
vvc_avg_10_32x4_c: 2022.6
vvc_avg_10_32x4_avx2: 35.1
vvc_avg_10_32x8_c: 6148.8
vvc_avg_10_32x8_avx2: 51.3
vvc_avg_10_32x16_c: 12601.6
vvc_avg_10_32x16_avx2: 70.8
vvc_avg_10_32x32_c: 15958.6
vvc_avg_10_32x32_avx2: 124.3
vvc_avg_10_32x64_c: 31784.6
vvc_avg_10_32x64_avx2: 757.3
vvc_avg_10_32x128_c: 63892.8
vvc_avg_10_32x128_avx2: 1711.3
vvc_avg_10_64x2_c: 1890.8
vvc_avg_10_64x2_avx2: 34.3
vvc_avg_10_64x4_c: 6267.3
vvc_avg_10_64x4_avx2: 42.6
vvc_avg_10_64x8_c: 12778.1
vvc_avg_10_64x8_avx2: 67.8
vvc_avg_10_64x16_c: 22304.3
vvc_avg_10_64x16_avx2: 116.8
vvc_avg_10_64x32_c: 30777.1
vvc_avg_10_64x32_avx2: 201.1
vvc_avg_10_64x64_c: 60169.1
vvc_avg_10_64x64_avx2: 1454.3
vvc_avg_10_64x128_c: 124392.8
vvc_avg_10_64x128_avx2: 3648.6
vvc_avg_10_128x2_c: 3650.1
vvc_avg_10_128x2_avx2: 41.1
vvc_avg_10_128x4_c: 22887.8
vvc_avg_10_128x4_avx2: 64.1
vvc_avg_10_128x8_c: 14622.6
vvc_avg_10_128x8_avx2: 111.6
vvc_avg_10_128x16_c: 62207.6
vvc_avg_10_128x16_avx2: 186.3
vvc_avg_10_128x32_c: 59761.3
vvc_avg_10_128x32_avx2: 374.6
vvc_avg_10_128x64_c: 117504.3
vvc_avg_10_128x64_avx2: 2684.6
vvc_avg_10_128x128_c: 236767.6
vvc_avg_10_128x128_avx2: 15278.1
vvc_avg_12_2x2_c: 78.6
vvc_avg_12_2x2_avx2: 26.1
vvc_avg_12_2x4_c: 254.1
vvc_avg_12_2x4_avx2: 30.6
vvc_avg_12_2x8_c: 261.8
vvc_avg_12_2x8_avx2: 39.1
vvc_avg_12_2x16_c: 527.6
vvc_avg_12_2x16_avx2: 57.3
vvc_avg_12_2x32_c: 1089.1
vvc_avg_12_2x32_avx2: 93.8
vvc_avg_12_2x64_c: 2337.6
vvc_avg_12_2x64_avx2: 707.1
vvc_avg_12_2x128_c: 4582.1
vvc_avg_12_2x128_avx2: 1414.6
vvc_avg_12_4x2_c: 129.6
vvc_avg_12_4x2_avx2: 26.8
vvc_avg_12_4x4_c: 427.3
vvc_avg_12_4x4_avx2: 30.6
vvc_avg_12_4x8_c: 529.6
vvc_avg_12_4x8_avx2: 36.6
vvc_avg_12_4x16_c: 1022.1
vvc_avg_12_4x16_avx2: 57.3
vvc_avg_12_4x32_c: 1987.6
vvc_avg_12_4x32_avx2: 84.3
vvc_avg_12_4x64_c: 4147.6
vvc_avg_12_4x64_avx2: 706.3
vvc_avg_12_4x128_c: 8469.3
vvc_avg_12_4x128_avx2: 1448.3
vvc_avg_12_8x2_c: 253.6
vvc_avg_12_8x2_avx2: 27.6
vvc_avg_12_8x4_c: 836.3
vvc_avg_12_8x4_avx2: 32.1
vvc_avg_12_8x8_c: 1074.6
vvc_avg_12_8x8_avx2: 45.1
vvc_avg_12_8x16_c: 3616.8
vvc_avg_12_8x16_avx2: 71.6
vvc_avg_12_8x32_c: 3823.6
vvc_avg_12_8x32_avx2: 140.1
vvc_avg_12_8x64_c: 7764.8
vvc_avg_12_8x64_avx2: 656.1
vvc_avg_12_8x128_c: 15896.1
vvc_avg_12_8x128_avx2: 1232.8
vvc_avg_12_16x2_c: 462.1
vvc_avg_12_16x2_avx2: 26.8
vvc_avg_12_16x4_c: 1732.1
vvc_avg_12_16x4_avx2: 29.1
vvc_avg_12_16x8_c: 2097.6
vvc_avg_12_16x8_avx2: 62.6
vvc_avg_12_16x16_c: 6753.1
vvc_avg_12_16x16_avx2: 47.8
vvc_avg_12_16x32_c: 7373.1
vvc_avg_12_16x32_avx2: 80.8
vvc_avg_12_16x64_c: 15046.3
vvc_avg_12_16x64_avx2: 621.1
vvc_avg_12_16x128_c: 52574.6
vvc_avg_12_16x128_avx2: 1417.1
vvc_avg_12_32x2_c: 1712.1
vvc_avg_12_32x2_avx2: 29.8
vvc_avg_12_32x4_c: 2036.8
vvc_avg_12_32x4_avx2: 37.6
vvc_avg_12_32x8_c: 4017.6
vvc_avg_12_32x8_avx2: 44.1
vvc_avg_12_32x16_c: 8018.6
vvc_avg_12_32x16_avx2: 70.8
vvc_avg_12_32x32_c: 15637.6
vvc_avg_12_32x32_avx2: 124.3
vvc_avg_12_32x64_c: 31143.3
vvc_avg_12_32x64_avx2: 830.3
vvc_avg_12_32x128_c: 75706.8
vvc_avg_12_32x128_avx2: 1604.8
vvc_avg_12_64x2_c: 3230.3
vvc_avg_12_64x2_avx2: 33.6
vvc_avg_12_64x4_c: 4139.6
vvc_avg_12_64x4_avx2: 45.1
vvc_avg_12_64x8_c: 8201.6
vvc_avg_12_64x8_avx2: 67.1
vvc_avg_12_64x16_c: 25632.3
vvc_avg_12_64x16_avx2: 110.3
vvc_avg_12_64x32_c: 30744.3
vvc_avg_12_64x32_avx2: 200.3
vvc_avg_12_64x64_c: 105554.8
vvc_avg_12_64x64_avx2: 1325.6
vvc_avg_12_64x128_c: 235254.3
vvc_avg_12_64x128_avx2: 3132.6
vvc_avg_12_128x2_c: 6194.3
vvc_avg_12_128x2_avx2: 55.1
vvc_avg_12_128x4_c: 7583.8
vvc_avg_12_128x4_avx2: 79.3
vvc_avg_12_128x8_c: 14635.6
vvc_avg_12_128x8_avx2: 104.3
vvc_avg_12_128x16_c: 29270.8
vvc_avg_12_128x16_avx2: 194.3
vvc_avg_12_128x32_c: 60113.6
vvc_avg_12_128x32_avx2: 346.3
vvc_avg_12_128x64_c: 197030.3
vvc_avg_12_128x64_avx2: 2779.6
vvc_avg_12_128x128_c: 432809.6
vvc_avg_12_128x128_avx2: 5513.3
vvc_w_avg_8_2x2_c: 84.3
vvc_w_avg_8_2x2_avx2: 42.6
vvc_w_avg_8_2x4_c: 156.3
vvc_w_avg_8_2x4_avx2: 58.8
vvc_w_avg_8_2x8_c: 310.6
vvc_w_avg_8_2x8_avx2: 73.1
vvc_w_avg_8_2x16_c: 942.1
vvc_w_avg_8_2x16_avx2: 113.3
vvc_w_avg_8_2x32_c: 1098.8
vvc_w_avg_8_2x32_avx2: 202.6
vvc_w_avg_8_2x64_c: 2414.3
vvc_w_avg_8_2x64_avx2: 467.6
vvc_w_avg_8_2x128_c: 4763.8
vvc_w_avg_8_2x128_avx2: 1333.1
vvc_w_avg_8_4x2_c: 140.1
vvc_w_avg_8_4x2_avx2: 49.8
vvc_w_avg_8_4x4_c: 276.3
vvc_w_avg_8_4x4_avx2: 58.1
vvc_w_avg_8_4x8_c: 524.3
vvc_w_avg_8_4x8_avx2: 72.3
vvc_w_avg_8_4x16_c: 1108.1
vvc_w_avg_8_4x16_avx2: 111.8
vvc_w_avg_8_4x32_c: 2149.8
vvc_w_avg_8_4x32_avx2: 199.6
vvc_w_avg_8_4x64_c: 12288.1
vvc_w_avg_8_4x64_avx2: 509.3
vvc_w_avg_8_4x128_c: 8398.6
vvc_w_avg_8_4x128_avx2: 1319.6
vvc_w_avg_8_8x2_c: 271.1
vvc_w_avg_8_8x2_avx2: 44.1
vvc_w_avg_8_8x4_c: 503.3
vvc_w_avg_8_8x4_avx2: 61.8
vvc_w_avg_8_8x8_c: 1031.1
vvc_w_avg_8_8x8_avx2: 93.8
vvc_w_avg_8_8x16_c: 2009.8
vvc_w_avg_8_8x16_avx2: 163.1
vvc_w_avg_8_8x32_c: 4161.3
vvc_w_avg_8_8x32_avx2: 292.1
vvc_w_avg_8_8x64_c: 7940.6
vvc_w_avg_8_8x64_avx2: 592.1
vvc_w_avg_8_8x128_c: 16802.3
vvc_w_avg_8_8x128_avx2: 1287.6
vvc_w_avg_8_16x2_c: 762.6
vvc_w_avg_8_16x2_avx2: 53.6
vvc_w_avg_8_16x4_c: 1486.3
vvc_w_avg_8_16x4_avx2: 67.1
vvc_w_avg_8_16x8_c: 1907.8
vvc_w_avg_8_16x8_avx2: 96.8
vvc_w_avg_8_16x16_c: 3883.6
vvc_w_avg_8_16x16_avx2: 151.3
vvc_w_avg_8_16x32_c: 7974.8
vvc_w_avg_8_16x32_avx2: 285.8
vvc_w_avg_8_16x64_c: 25160.6
vvc_w_avg_8_16x64_avx2: 589.8
vvc_w_avg_8_16x128_c: 58328.1
vvc_w_avg_8_16x128_avx2: 1169.8
vvc_w_avg_8_32x2_c: 1009.1
vvc_w_avg_8_32x2_avx2: 65.6
vvc_w_avg_8_32x4_c: 2091.1
vvc_w_avg_8_32x4_avx2: 96.8
vvc_w_avg_8_32x8_c: 3997.8
vvc_w_avg_8_32x8_avx2: 156.3
vvc_w_avg_8_32x16_c: 8216.8
vvc_w_avg_8_32x16_avx2: 269.6
vvc_w_avg_8_32x32_c: 21746.1
vvc_w_avg_8_32x32_avx2: 635.3
vvc_w_avg_8_32x64_c: 31564.8
vvc_w_avg_8_32x64_avx2: 1010.6
vvc_w_avg_8_32x128_c: 114373.3
vvc_w_avg_8_32x128_avx2: 2013.6
vvc_w_avg_8_64x2_c: 2067.3
vvc_w_avg_8_64x2_avx2: 97.6
vvc_w_avg_8_64x4_c: 3901.1
vvc_w_avg_8_64x4_avx2: 154.8
vvc_w_avg_8_64x8_c: 7911.6
vvc_w_avg_8_64x8_avx2: 268.8
vvc_w_avg_8_64x16_c: 16508.8
vvc_w_avg_8_64x16_avx2: 501.8
vvc_w_avg_8_64x32_c: 38770.3
vvc_w_avg_8_64x32_avx2: 1287.6
vvc_w_avg_8_64x64_c: 110350.6
vvc_w_avg_8_64x64_avx2: 1890.8
vvc_w_avg_8_64x128_c: 141354.6
vvc_w_avg_8_64x128_avx2: 3839.6
vvc_w_avg_8_128x2_c: 7012.1
vvc_w_avg_8_128x2_avx2: 159.3
vvc_w_avg_8_128x4_c: 8146.8
vvc_w_avg_8_128x4_avx2: 272.6
vvc_w_avg_8_128x8_c: 24596.8
vvc_w_avg_8_128x8_avx2: 501.1
vvc_w_avg_8_128x16_c: 35918.1
vvc_w_avg_8_128x16_avx2: 948.8
vvc_w_avg_8_128x32_c: 68799.6
vvc_w_avg_8_128x32_avx2: 1963.1
vvc_w_avg_8_128x64_c: 133862.1
vvc_w_avg_8_128x64_avx2: 3833.6
vvc_w_avg_8_128x128_c: 348427.8
vvc_w_avg_8_128x128_avx2: 7682.8
vvc_w_avg_10_2x2_c: 118.6
vvc_w_avg_10_2x2_avx2: 73.1
vvc_w_avg_10_2x4_c: 189.1
vvc_w_avg_10_2x4_avx2: 89.3
vvc_w_avg_10_2x8_c: 382.8
vvc_w_avg_10_2x8_avx2: 179.8
vvc_w_avg_10_2x16_c: 658.3
vvc_w_avg_10_2x16_avx2: 185.1
vvc_w_avg_10_2x32_c: 1409.3
vvc_w_avg_10_2x32_avx2: 290.8
vvc_w_avg_10_2x64_c: 2906.8
vvc_w_avg_10_2x64_avx2: 793.1
vvc_w_avg_10_2x128_c: 6292.6
vvc_w_avg_10_2x128_avx2: 1696.8
vvc_w_avg_10_4x2_c: 178.8
vvc_w_avg_10_4x2_avx2: 80.1
vvc_w_avg_10_4x4_c: 581.6
vvc_w_avg_10_4x4_avx2: 97.6
vvc_w_avg_10_4x8_c: 693.3
vvc_w_avg_10_4x8_avx2: 128.1
vvc_w_avg_10_4x16_c: 1436.6
vvc_w_avg_10_4x16_avx2: 179.8
vvc_w_avg_10_4x32_c: 2409.1
vvc_w_avg_10_4x32_avx2: 292.3
vvc_w_avg_10_4x64_c: 4925.3
vvc_w_avg_10_4x64_avx2: 746.1
vvc_w_avg_10_4x128_c: 10664.6
vvc_w_avg_10_4x128_avx2: 1647.6
vvc_w_avg_10_8x2_c: 359.3
vvc_w_avg_10_8x2_avx2: 80.1
vvc_w_avg_10_8x4_c: 925.6
vvc_w_avg_10_8x4_avx2: 97.6
vvc_w_avg_10_8x8_c: 1360.6
vvc_w_avg_10_8x8_avx2: 121.8
vvc_w_avg_10_8x16_c: 3490.3
vvc_w_avg_10_8x16_avx2: 203.3
vvc_w_avg_10_8x32_c: 5266.1
vvc_w_avg_10_8x32_avx2: 325.8
vvc_w_avg_10_8x64_c: 11127.1
vvc_w_avg_10_8x64_avx2: 747.8
vvc_w_avg_10_8x128_c: 31058.3
vvc_w_avg_10_8x128_avx2: 1424.6
vvc_w_avg_10_16x2_c: 624.8
vvc_w_avg_10_16x2_avx2: 84.6
vvc_w_avg_10_16x4_c: 1389.6
vvc_w_avg_10_16x4_avx2: 109.1
vvc_w_avg_10_16x8_c: 2688.3
vvc_w_avg_10_16x8_avx2: 137.1
vvc_w_avg_10_16x16_c: 5387.1
vvc_w_avg_10_16x16_avx2: 224.6
vvc_w_avg_10_16x32_c: 10776.3
vvc_w_avg_10_16x32_avx2: 312.1
vvc_w_avg_10_16x64_c: 18069.1
vvc_w_avg_10_16x64_avx2: 858.6
vvc_w_avg_10_16x128_c: 43460.3
vvc_w_avg_10_16x128_avx2: 1411.6
vvc_w_avg_10_32x2_c: 1232.8
vvc_w_avg_10_32x2_avx2: 99.1
vvc_w_avg_10_32x4_c: 4017.6
vvc_w_avg_10_32x4_avx2: 134.1
vvc_w_avg_10_32x8_c: 9306.3
vvc_w_avg_10_32x8_avx2: 208.1
vvc_w_avg_10_32x16_c: 8424.6
vvc_w_avg_10_32x16_avx2: 349.3
vvc_w_avg_10_32x32_c: 20787.8
vvc_w_avg_10_32x32_avx2: 655.3
vvc_w_avg_10_32x64_c: 40972.1
vvc_w_avg_10_32x64_avx2: 904.8
vvc_w_avg_10_32x128_c: 85670.3
vvc_w_avg_10_32x128_avx2: 1751.6
vvc_w_avg_10_64x2_c: 2454.1
vvc_w_avg_10_64x2_avx2: 132.6
vvc_w_avg_10_64x4_c: 5012.6
vvc_w_avg_10_64x4_avx2: 215.6
vvc_w_avg_10_64x8_c: 10811.3
vvc_w_avg_10_64x8_avx2: 361.1
vvc_w_avg_10_64x16_c: 33349.1
vvc_w_avg_10_64x16_avx2: 904.1
vvc_w_avg_10_64x32_c: 41892.3
vvc_w_avg_10_64x32_avx2: 1220.6
vvc_w_avg_10_64x64_c: 66983.3
vvc_w_avg_10_64x64_avx2: 2622.1
vvc_w_avg_10_64x128_c: 246508.8
vvc_w_avg_10_64x128_avx2: 3316.8
vvc_w_avg_10_128x2_c: 7791.6
vvc_w_avg_10_128x2_avx2: 198.8
vvc_w_avg_10_128x4_c: 10534.3
vvc_w_avg_10_128x4_avx2: 337.3
vvc_w_avg_10_128x8_c: 21142.3
vvc_w_avg_10_128x8_avx2: 614.8
vvc_w_avg_10_128x16_c: 40968.6
vvc_w_avg_10_128x16_avx2: 1160.6
vvc_w_avg_10_128x32_c: 113043.3
vvc_w_avg_10_128x32_avx2: 1644.6
vvc_w_avg_10_128x64_c: 230658.3
vvc_w_avg_10_128x64_avx2: 5065.3
vvc_w_avg_10_128x128_c: 335236.3
vvc_w_avg_10_128x128_avx2: 6450.3
vvc_w_avg_12_2x2_c: 185.3
vvc_w_avg_12_2x2_avx2: 43.6
vvc_w_avg_12_2x4_c: 340.3
vvc_w_avg_12_2x4_avx2: 55.8
vvc_w_avg_12_2x8_c: 632.3
vvc_w_avg_12_2x8_avx2: 70.1
vvc_w_avg_12_2x16_c: 728.3
vvc_w_avg_12_2x16_avx2: 108.1
vvc_w_avg_12_2x32_c: 1392.6
vvc_w_avg_12_2x32_avx2: 176.8
vvc_w_avg_12_2x64_c: 2618.3
vvc_w_avg_12_2x64_avx2: 757.3
vvc_w_avg_12_2x128_c: 6408.8
vvc_w_avg_12_2x128_avx2: 1435.1
vvc_w_avg_12_4x2_c: 349.3
vvc_w_avg_12_4x2_avx2: 44.3
vvc_w_avg_12_4x4_c: 607.1
vvc_w_avg_12_4x4_avx2: 52.6
vvc_w_avg_12_4x8_c: 1134.8
vvc_w_avg_12_4x8_avx2: 70.1
vvc_w_avg_12_4x16_c: 1378.1
vvc_w_avg_12_4x16_avx2: 115.3
vvc_w_avg_12_4x32_c: 2599.3
vvc_w_avg_12_4x32_avx2: 174.3
vvc_w_avg_12_4x64_c: 4474.8
vvc_w_avg_12_4x64_avx2: 656.1
vvc_w_avg_12_4x128_c: 11319.6
vvc_w_avg_12_4x128_avx2: 1373.1
vvc_w_avg_12_8x2_c: 595.8
vvc_w_avg_12_8x2_avx2: 44.3
vvc_w_avg_12_8x4_c: 1164.3
vvc_w_avg_12_8x4_avx2: 56.6
vvc_w_avg_12_8x8_c: 2019.6
vvc_w_avg_12_8x8_avx2: 80.1
vvc_w_avg_12_8x16_c: 4071.6
vvc_w_avg_12_8x16_avx2: 139.3
vvc_w_avg_12_8x32_c: 4485.1
vvc_w_avg_12_8x32_avx2: 250.6
vvc_w_avg_12_8x64_c: 8404.8
vvc_w_avg_12_8x64_avx2: 735.8
vvc_w_avg_12_8x128_c: 35679.8
vvc_w_avg_12_8x128_avx2: 1252.6
vvc_w_avg_12_16x2_c: 1114.8
vvc_w_avg_12_16x2_avx2: 46.6
vvc_w_avg_12_16x4_c: 2240.1
vvc_w_avg_12_16x4_avx2: 62.6
vvc_w_avg_12_16x8_c: 13174.6
vvc_w_avg_12_16x8_avx2: 88.6
vvc_w_avg_12_16x16_c: 5334.6
vvc_w_avg_12_16x16_avx2: 144.3
vvc_w_avg_12_16x32_c: 8378.1
vvc_w_avg_12_16x32_avx2: 234.6
vvc_w_avg_12_16x64_c: 21300.8
vvc_w_avg_12_16x64_avx2: 761.8
vvc_w_avg_12_16x128_c: 32786.8
vvc_w_avg_12_16x128_avx2: 1432.8
vvc_w_avg_12_32x2_c: 2154.3
vvc_w_avg_12_32x2_avx2: 61.1
vvc_w_avg_12_32x4_c: 4299.8
vvc_w_avg_12_32x4_avx2: 83.1
vvc_w_avg_12_32x8_c: 7964.8
vvc_w_avg_12_32x8_avx2: 132.6
vvc_w_avg_12_32x16_c: 13321.6
vvc_w_avg_12_32x16_avx2: 234.6
vvc_w_avg_12_32x32_c: 21149.3
vvc_w_avg_12_32x32_avx2: 433.3
vvc_w_avg_12_32x64_c: 43666.6
vvc_w_avg_12_32x64_avx2: 876.6
vvc_w_avg_12_32x128_c: 83189.8
vvc_w_avg_12_32x128_avx2: 1756.6
vvc_w_avg_12_64x2_c: 3829.8
vvc_w_avg_12_64x2_avx2: 83.1
vvc_w_avg_12_64x4_c: 8588.1
vvc_w_avg_12_64x4_avx2: 127.1
vvc_w_avg_12_64x8_c: 17027.6
vvc_w_avg_12_64x8_avx2: 310.6
vvc_w_avg_12_64x16_c: 29797.8
vvc_w_avg_12_64x16_avx2: 415.6
vvc_w_avg_12_64x32_c: 43854.3
vvc_w_avg_12_64x32_avx2: 773.3
vvc_w_avg_12_64x64_c: 137767.3
vvc_w_avg_12_64x64_avx2: 1608.6
vvc_w_avg_12_64x128_c: 316428.3
vvc_w_avg_12_64x128_avx2: 3249.8
vvc_w_avg_12_128x2_c: 8824.6
vvc_w_avg_12_128x2_avx2: 130.3
vvc_w_avg_12_128x4_c: 17173.6
vvc_w_avg_12_128x4_avx2: 219.3
vvc_w_avg_12_128x8_c: 21997.8
vvc_w_avg_12_128x8_avx2: 397.3
vvc_w_avg_12_128x16_c: 43553.8
vvc_w_avg_12_128x16_avx2: 790.1
vvc_w_avg_12_128x32_c: 89792.1
vvc_w_avg_12_128x32_avx2: 1497.6
vvc_w_avg_12_128x64_c: 226573.3
vvc_w_avg_12_128x64_avx2: 3153.1
vvc_w_avg_12_128x128_c: 332090.1
vvc_w_avg_12_128x128_avx2: 6499.6
Signed-off-by: Wu Jianhua <toqsxw@outlook.com>
---
libavcodec/x86/vvc/Makefile | 3 +-
libavcodec/x86/vvc/vvc_mc.asm | 301 +++++++++++++++++++++++++++++++
libavcodec/x86/vvc/vvcdsp_init.c | 52 ++++++
3 files changed, 355 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/x86/vvc/vvc_mc.asm
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index b4acc22501..29765a6c48 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -2,5 +2,6 @@ clean::
$(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
-X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/h26x/h2656dsp.o \
+X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_mc.o \
+ x86/h26x/h2656dsp.o \
x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvc_mc.asm b/libavcodec/x86/vvc/vvc_mc.asm
new file mode 100644
index 0000000000..d0a34d3952
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_mc.asm
@@ -0,0 +1,301 @@
+; /*
+; * Provide SIMD MC functions for VVC decoding
+; *
+; * Copyright © 2021, VideoLAN and dav1d authors
+; * Copyright © 2021, Two Orioles, LLC
+; * All rights reserved.
+; *
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 128
+
+SECTION_RODATA 32
+
+pw_0 times 2 dw 0
+pw_1 times 2 dw 1
+pw_4 times 2 dw 4
+pw_12 times 2 dw 12
+pw_256 times 2 dw 256
+
+%macro AVG_JMP_TABLE 3-*
+ %xdefine %1_%2_%3_table (%%table - 2*%4)
+ %xdefine %%base %1_%2_%3_table
+ %xdefine %%prefix mangle(private_prefix %+ _vvc_%1_%2bpc_%3)
+ %%table:
+ %rep %0 - 3
+ dd %%prefix %+ .w%4 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+AVG_JMP_TABLE avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE w_avg, 8, avx2, 2, 4, 8, 16, 32, 64, 128
+AVG_JMP_TABLE w_avg, 16, avx2, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
+%macro AVG_W16_FN 3 ; bpc, op, count
+ %assign %%i 0
+ %rep %3
+ %define off %%i
+ AVG_LOAD_W16 0, off
+ %2
+ AVG_SAVE_W16 %1, 0, off
+
+
+ AVG_LOAD_W16 1, off
+ %2
+ AVG_SAVE_W16 %1, 1, off
+
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro AVG_FN 2 ; bpc, op
+ jmp wq
+
+.w2:
+ movd xm0, [src0q]
+ pinsrd xm0, [src0q + AVG_SRC_STRIDE], 1
+ movd xm1, [src1q]
+ pinsrd xm1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W2 %1
+ AVG_LOOP_END .w2
+
+.w4:
+ movq xm0, [src0q]
+ pinsrq xm0, [src0q + AVG_SRC_STRIDE], 1
+ movq xm1, [src1q]
+ pinsrq xm1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W4 %1
+
+ AVG_LOOP_END .w4
+
+.w8:
+ vinserti128 m0, m0, [src0q], 0
+ vinserti128 m0, m0, [src0q + AVG_SRC_STRIDE], 1
+ vinserti128 m1, m1, [src1q], 0
+ vinserti128 m1, m1, [src1q + AVG_SRC_STRIDE], 1
+ %2
+ AVG_SAVE_W8 %1
+
+ AVG_LOOP_END .w8
+
+.w16:
+ AVG_W16_FN %1, %2, 1
+
+ AVG_LOOP_END .w16
+
+.w32:
+ AVG_W16_FN %1, %2, 2
+
+ AVG_LOOP_END .w32
+
+.w64:
+ AVG_W16_FN %1, %2, 4
+
+ AVG_LOOP_END .w64
+
+.w128:
+ AVG_W16_FN %1, %2, 8
+
+ AVG_LOOP_END .w128
+
+.ret:
+ RET
+%endmacro
+
+%macro AVG 0
+ paddsw m0, m1
+ pmulhrsw m0, m2
+ CLIPW m0, m3, m4
+%endmacro
+
+%macro W_AVG 0
+ punpckhwd m5, m0, m1
+ pmaddwd m5, m3
+ paddd m5, m4
+ psrad m5, xm2
+
+ punpcklwd m0, m0, m1
+ pmaddwd m0, m3
+ paddd m0, m4
+ psrad m0, xm2
+
+ packssdw m0, m5
+ CLIPW m0, m6, m7
+%endmacro
+
+%macro AVG_LOAD_W16 2 ; line, offset
+ movu m0, [src0q + %1 * AVG_SRC_STRIDE + %2 * 32]
+ movu m1, [src1q + %1 * AVG_SRC_STRIDE + %2 * 32]
+%endmacro
+
+%macro AVG_SAVE_W2 1 ;bpc
+ %if %1 == 16
+ pextrd [dstq], xm0, 0
+ pextrd [dstq + strideq], xm0, 1
+ %else
+ packuswb m0, m0
+ pextrw [dstq], xm0, 0
+ pextrw [dstq + strideq], xm0, 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W4 1 ;bpc
+ %if %1 == 16
+ pextrq [dstq], xm0, 0
+ pextrq [dstq + strideq], xm0, 1
+ %else
+ packuswb m0, m0
+ pextrd [dstq], xm0, 0
+ pextrd [dstq + strideq], xm0, 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W8 1 ;bpc
+ %if %1 == 16
+ vextracti128 [dstq], m0, 0
+ vextracti128 [dstq + strideq], m0, 1
+ %else
+ packuswb m0, m0
+ vpermq m0, m0, 1000b
+ pextrq [dstq], xm0, 0
+ pextrq [dstq + strideq], xm0, 1
+ %endif
+%endmacro
+
+%macro AVG_SAVE_W16 3 ; bpc, line, offset
+ %if %1 == 16
+ movu [dstq + %2 * strideq + %3 * 32], m0
+ %else
+ packuswb m0, m0
+ vpermq m0, m0, 1000b
+ vextracti128 [dstq + %2 * strideq + %3 * 16], m0, 0
+ %endif
+%endmacro
+
+%macro AVG_LOOP_END 1
+ sub hd, 2
+ je .ret
+
+ lea src0q, [src0q + 2 * AVG_SRC_STRIDE]
+ lea src1q, [src1q + 2 * AVG_SRC_STRIDE]
+ lea dstq, [dstq + 2 * strideq]
+ jmp %1
+%endmacro
+
+%define AVG_SRC_STRIDE MAX_PB_SIZE*2
+
+;void ff_vvc_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max);
+%macro VVC_AVG_AVX2 1
+cglobal vvc_avg_%1bpc, 4, 7, 5, dst, stride, src0, src1, w, h, bd
+ movifnidn hd, hm
+
+ pxor m3, m3 ; pixel min
+ vpbroadcastw m4, bdm ; pixel max
+
+ movifnidn bdd, bdm
+ inc bdd
+ tzcnt bdd, bdd ; bit depth
+
+ sub bdd, 8
+ movd xm0, bdd
+ vpbroadcastd m1, [pw_4]
+ pminuw m0, m1
+ vpbroadcastd m2, [pw_256]
+ psllw m2, xm0 ; shift
+
+ lea r6, [avg_%1 %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ add wq, r6
+ AVG_FN %1, AVG
+%endmacro
+
+;void ff_vvc_w_avg_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+; const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height,
+; intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
+%macro VVC_W_AVG_AVX2 1
+cglobal vvc_w_avg_%1bpc, 4, 7, 8, dst, stride, src0, src1, w, h, t0, t1
+
+ movifnidn hd, hm
+
+ movifnidn t0d, r8m ; w1
+ shl t0d, 16
+ mov t0w, r7m ; w0
+ movd xm3, t0d
+ vpbroadcastd m3, xm3 ; w0, w1
+
+ pxor m6, m6 ;pixel min
+ vpbroadcastw m7, r11m ;pixel max
+
+ mov t1q, rcx ; save ecx
+ mov ecx, r11m
+ inc ecx ; bd
+ tzcnt ecx, ecx
+ sub ecx, 8
+ mov t0d, r9m ; o0
+ add t0d, r10m ; o1
+ shl t0d, cl
+ inc t0d ;((o0 + o1) << (BIT_DEPTH - 8)) + 1
+
+ neg ecx
+ add ecx, 4 ; 12 - bd
+ cmovl ecx, [pw_0]
+ add ecx, 3
+ add ecx, r6m
+ movd xm2, ecx ; shift
+
+ dec ecx
+ shl t0d, cl
+ movd xm4, t0d
+ vpbroadcastd m4, xm4 ; offset
+ mov rcx, t1q ; restore ecx
+
+ lea r6, [w_avg_%1 %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ add wq, r6
+ AVG_FN %1, W_AVG
+%endmacro
+
+%if ARCH_X86_64
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+VVC_AVG_AVX2 16
+
+VVC_AVG_AVX2 8
+
+VVC_W_AVG_AVX2 16
+
+VVC_W_AVG_AVX2 8
+%endif
+
+%endif
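For readers following the asm above: AVG_FN dispatches on tzcnt(width) through the jump tables declared at the top, the AVG macro implements the plain average with paddsw + pmulhrsw, and W_AVG does the weighted sum with pmaddwd. Read back into scalar C, the weighted path appears to compute the following (a sketch derived from the shift/offset setup in VVC_W_AVG_AVX2; the constants are assumptions based on that reading, not copied from the decoder's C reference):

    #include "libavutil/common.h"   /* av_clip(), av_log2(), FFMAX */

    /* hypothetical scalar reading of the W_AVG path, 8bpc store shown */
    static void w_avg_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *src0, const int16_t *src1,
                             int width, int height,
                             int denom, int w0, int w1, int o0, int o1, int pixel_max)
    {
        const int bd     = av_log2(pixel_max + 1);           /* 8, 10 or 12 */
        const int shift  = denom + 3 + FFMAX(12 - bd, 0);    /* denom + max(3, 15 - bd) */
        const int offset = (((o0 + o1) << (bd - 8)) + 1) << (shift - 1);

        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                dst[x] = av_clip((src0[x] * w0 + src1[x] * w1 + offset) >> shift,
                                 0, pixel_max);
            src0 += 128;                                      /* MAX_PB_SIZE */
            src1 += 128;
            dst  += dst_stride;
        }
    }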
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index c197cdb4cc..909ef9f56b 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -169,6 +169,42 @@ FW_PUT_16BPC_AVX2(12);
MC_TAP_LINKS_16BPC_AVX2(LUMA, 8, bd); \
MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
+#define bf(fn, bd, opt) fn##_##bd##_##opt
+#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
+
+#define AVG_BPC_FUNC(bpc, opt) \
+void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
+void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
+ intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
+
+#define AVG_FUNCS(bpc, bd, opt) \
+static void bf(avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height) \
+{ \
+ BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
+} \
+static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, int width, int height, \
+ int denom, int w0, int w1, int o0, int o1) \
+{ \
+ BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \
+ denom, w0, w1, o0, o1, (1 << bd) - 1); \
+}
+
+AVG_BPC_FUNC(8, avx2)
+AVG_BPC_FUNC(16, avx2)
+
+AVG_FUNCS(8, 8, avx2)
+AVG_FUNCS(16, 10, avx2)
+AVG_FUNCS(16, 12, avx2)
+
+#define AVG_INIT(bd, opt) do { \
+ c->inter.avg = bf(avg, bd, opt); \
+ c->inter.w_avg = bf(w_avg, bd, opt); \
+} while (0)
+
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
{
const int cpu_flags = av_get_cpu_flags();
@@ -198,5 +234,21 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
MC_LINKS_16BPC_AVX2(12);
}
}
+
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ switch (bd) {
+ case 8:
+ AVG_INIT(8, avx2);
+ break;
+ case 10:
+ AVG_INIT(10, avx2);
+ break;
+ case 12:
+ AVG_INIT(12, avx2);
+ break;
+ default:
+ break;
+ }
+ }
}
}
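Once ff_vvc_dsp_init_x86() has run, the decoder reaches these kernels through the same function pointers it already uses for the C versions. A minimal call-site sketch (assuming ff_vvc_dsp_init(&dsp, bit_depth) is the generic init entry point; dst, tmp0/tmp1, block_w/block_h and the weight parameters are illustrative names, not code from this patch):

    VVCDSPContext dsp;
    ff_vvc_dsp_init(&dsp, 10);   /* 10-bit: picks avg_10_avx2/w_avg_10_avx2 on AVX2-capable CPUs */

    /* bi-prediction: combine two 16-bit intermediate predictions into the picture buffer */
    dsp.inter.avg(dst, dst_stride, tmp0, tmp1, block_w, block_h);

    /* explicit weighted bi-prediction */
    dsp.inter.w_avg(dst, dst_stride, tmp0, tmp1, block_w, block_h,
                    denom, w0, w1, o0, o1);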
--
2.34.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
end of thread, other threads:[~2024-01-23 18:23 UTC | newest]
Thread overview: 10+ messages -- links below jump to the message on this page --
[not found] <20240122152527.601122-1-toqsxw@outlook.com>
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 5/8] avcodec/vvcdec: reuse h26x/2656_inter.asm to enable x86 optimizations toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 6/8] tests/checkasm: add checkasm_check_vvc_mc toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations toqsxw
2024-01-22 15:25 ` [FFmpeg-devel] [PATCH v3 8/8] tests/checkasm/vvc_mc: add check_avg toqsxw
[not found] <20240122174628.1206503-1-toqsxw@outlook.com>
2024-01-22 17:46 ` [FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations toqsxw
2024-01-22 22:46 ` Michael Niedermayer
2024-01-23 18:23 ` [FFmpeg-devel] 回复: " Wu Jianhua