* [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width
@ 2024-05-23 12:27 James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
libavcodec/vvc/dsp.h | 2 +-
libavcodec/vvc/inter.c | 6 ++++--
libavcodec/vvc/inter_template.c | 6 +++++-
libavcodec/x86/vvc/vvc_sad.asm | 32 ++++++++++++++++++++++++++------
libavcodec/x86/vvc/vvcdsp_init.c | 22 +++++++++++++++++-----
tests/checkasm/vvc_mc.c | 3 ++-
6 files changed, 55 insertions(+), 16 deletions(-)
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 1f14096c41..55c4c81f53 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -99,7 +99,7 @@ typedef struct VVCInterDSPContext {
void (*apply_bdof)(uint8_t *dst, ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, int block_w, int block_h);
- int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+ int (*sad[5])(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
intptr_t mx, intptr_t my, int width);
} VVCInterDSPContext;
diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c
index e1011b4fa1..0214e46634 100644
--- a/libavcodec/vvc/inter.c
+++ b/libavcodec/vvc/inter.c
@@ -740,6 +740,8 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
const AVFrame *ref0, const AVFrame *ref1, const int x_off, const int y_off, const int block_w, const int block_h)
{
const VVCFrameContext *fc = lc->fc;
+ static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
+ const int tab = sad_tab[(FFALIGN(block_w, 8) >> 3) - 1];
const int sr_range = 2;
const AVFrame *ref[] = { ref0, ref1 };
int16_t *tmp[] = { lc->tmp, lc->tmp1 };
@@ -763,7 +765,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w);
}
- min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h);
+ min_sad = fc->vvcdsp.inter.sad[tab](tmp[L0], tmp[L1], dx, dy, block_w, block_h);
min_sad -= min_sad >> 2;
sad[dy][dx] = min_sad;
@@ -773,7 +775,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
for (dy = 0; dy < SAD_ARRAY_SIZE; dy++) {
for (dx = 0; dx < SAD_ARRAY_SIZE; dx++) {
if (dx != sr_range || dy != sr_range) {
- sad[dy][dx] = fc->vvcdsp.inter.sad(lc->tmp, lc->tmp1, dx, dy, block_w, block_h);
+ sad[dy][dx] = fc->vvcdsp.inter.sad[tab](lc->tmp, lc->tmp1, dx, dy, block_w, block_h);
if (sad[dy][dx] < min_sad) {
min_sad = sad[dy][dx];
min_dx = dx;
diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c
index a8068f4ba8..34485321d3 100644
--- a/libavcodec/vvc/inter_template.c
+++ b/libavcodec/vvc/inter_template.c
@@ -626,7 +626,11 @@ static void FUNC(ff_vvc_inter_dsp_init)(VVCInterDSPContext *const inter)
inter->apply_prof_uni_w = FUNC(apply_prof_uni_w);
inter->apply_bdof = FUNC(apply_bdof);
inter->prof_grad_filter = FUNC(prof_grad_filter);
- inter->sad = vvc_sad;
+ inter->sad[0] =
+ inter->sad[1] =
+ inter->sad[2] =
+ inter->sad[3] =
+ inter->sad[4] = vvc_sad;
}
#undef FUNCS
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index b468d89ac2..a20818530f 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -51,7 +51,7 @@ SECTION .text
INIT_YMM avx2
-cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
movsxdifnidn dxq, dxd
movsxdifnidn dyq, dyd
@@ -76,10 +76,6 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_
pxor m3, m3
vpbroadcastd m4, [pw_1]
- cmp block_wd, 16
- jge vvc_sad_16_128
-
- vvc_sad_8:
.loop_height:
movu xm0, [src1q]
vinserti128 m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
@@ -100,7 +96,31 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_
movd eax, xm0
RET
- vvc_sad_16_128:
+cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+ movsxdifnidn dxq, dxd
+ movsxdifnidn dyq, dyd
+
+ sub dxq, 2
+ sub dyq, 2
+
+ mov off1q, 2
+ mov off2q, 2
+
+ add off1q, dyq
+ sub off2q, dyq
+
+ shl off1q, 7
+ shl off2q, 7
+
+ add off1q, dxq
+ sub off2q, dxq
+
+ lea src1q, [src1q + off1q * 2 + 2 * 2]
+ lea src2q, [src2q + off2q * 2 + 2 * 2]
+
+ pxor m3, m3
+ vpbroadcastd m4, [pw_1]
+
sar block_wd, 4
.loop_height:
mov off1q, src1q
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 4b4a2aa937..bd60963432 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -312,8 +312,20 @@ ALF_FUNCS(16, 12, avx2)
c->alf.classify = ff_vvc_alf_classify_##bd##_avx2; \
} while (0)
-int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
-#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
+#define SAD_PROTOTYPE(w, opt) \
+int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
+ int dx, int dy, int block_w, int block_h) \
+
+SAD_PROTOTYPE(8, avx2);
+SAD_PROTOTYPE(16, avx2);
+
+#define SAD_INIT(opt) do { \
+ c->inter.sad[0] = ff_vvc_sad_8_##opt; \
+ c->inter.sad[1] = \
+ c->inter.sad[2] = \
+ c->inter.sad[3] = \
+ c->inter.sad[4] = ff_vvc_sad_16_##opt; \
+} while (0)
#endif
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -330,7 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
ALF_INIT(8);
AVG_INIT(8, avx2);
MC_LINKS_AVX2(8);
- SAD_INIT();
+ SAD_INIT(avx2);
}
break;
case 10:
@@ -342,7 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
AVG_INIT(10, avx2);
MC_LINKS_AVX2(10);
MC_LINKS_16BPC_AVX2(10);
- SAD_INIT();
+ SAD_INIT(avx2);
}
break;
case 12:
@@ -354,7 +366,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
AVG_INIT(12, avx2);
MC_LINKS_AVX2(12);
MC_LINKS_16BPC_AVX2(12);
- SAD_INIT();
+ SAD_INIT(avx2);
}
break;
default:
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 1e889e2cff..deae1014d2 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -327,6 +327,7 @@ static void check_avg(void)
static void check_vvc_sad(void)
{
const int bit_depth = 10;
+ static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
VVCDSPContext c;
LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
@@ -341,7 +342,7 @@ static void check_vvc_sad(void)
for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
for(int offy = 0; offy <= 4; offy++) {
for(int offx = 0; offx <= 4; offx++) {
- if(check_func(c.inter.sad, "sad_%dx%d", w, h)) {
+ if(check_func(c.inter.sad[sad_tab[(w >> 3) - 1]], "sad_%dx%d", w, h)) {
int result0;
int result1;
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16
2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
@ 2024-05-23 12:27 ` James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions James Almer
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
libavcodec/x86/vvc/vvc_sad.asm | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index a20818530f..829dbce489 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -96,7 +96,7 @@ cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, ro
movd eax, xm0
RET
-cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
movsxdifnidn dxq, dxd
movsxdifnidn dyq, dyd
@@ -121,26 +121,27 @@ cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, r
pxor m3, m3
vpbroadcastd m4, [pw_1]
- sar block_wd, 4
+ shl block_wd, 1
+ add src1q, block_wq
+ add src2q, block_wq
+ neg block_wq
+
+DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
.loop_height:
- mov off1q, src1q
- mov off2q, src2q
- mov row_idxd, block_wd
+ mov row_idxq, block_wq
.loop_width:
- movu m0, [src1q]
- movu m1, [src2q]
+ movu m0, [src1q+row_idxq]
+ movu m1, [src2q+row_idxq]
MIN_MAX_SAD m1, m0, m2
pmaddwd m1, m4
paddd m3, m1
- add src1q, 32
- add src2q, 32
- dec row_idxd
- jg .loop_width
+ add row_idxq, mmsize
+ jl .loop_width
- lea src1q, [off1q + ROWS * MAX_PB_SIZE * 2]
- lea src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+ add src1q, ROWS * MAX_PB_SIZE * 2
+ add src2q, ROWS * MAX_PB_SIZE * 2
sub block_hd, 2
jg .loop_height
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions
2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer
@ 2024-05-23 12:27 ` James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions James Almer
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
To: ffmpeg-devel
And remove sad_8x8_avx2, as it's not faster than sad_8x8_sse4.
sad_8x8_c: 54.8
sad_8x8_sse4: 14.3
sad_16x16_c: 200.8
sad_16x16_sse4: 34.8
sad_16x16_avx2: 29.8
sad_32x32_c: 826.3
sad_32x32_sse4: 113.8
sad_32x32_avx2: 69.3
sad_64x64_c: 3679.8
sad_64x64_sse4: 392.8
sad_64x64_avx2: 257.3
sad_128x128_c: 12581.3
sad_128x128_sse4: 1560.8
sad_128x128_avx2: 1151.8
Signed-off-by: James Almer <jamrial@gmail.com>
---
libavcodec/x86/vvc/vvc_sad.asm | 53 +++++++++++++++++++++-----------
libavcodec/x86/vvc/vvcdsp_init.c | 42 +++++++++++++++++--------
2 files changed, 65 insertions(+), 30 deletions(-)
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index 829dbce489..26df25ec66 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -26,7 +26,7 @@
SECTION_RODATA
-pw_1: times 2 dw 1
+cextern pw_1
; DMVR SAD is only calculated on even rows to reduce complexity
SECTION .text
@@ -38,20 +38,21 @@ SECTION .text
%endmacro
%macro HORIZ_ADD 3 ; xm0, xm1, m1
+%if mmsize == 32
vextracti128 %1, %3, q0001 ; 3 2 1 0
- paddd %1, %2 ; xm0 (7 + 3) (6 + 2) (5 + 1) (4 + 0)
- pshufd %2, %1, q0032 ; xm1 - - (7 + 3) (6 + 2)
+ paddd %2, %1 ; xm1 (7 + 3) (6 + 2) (5 + 1) (4 + 0)
+%endif
+ pshufd %1, %2, q0032 ; xm0 - - (7 + 3) (6 + 2)
paddd %1, %1, %2 ; xm0 _ _ (5 1 7 3) (4 0 6 2)
pshufd %2, %1, q0001 ; xm1 _ _ (5 1 7 3) (5 1 7 3)
paddd %1, %1, %2 ; (01234567)
%endmacro
-%if ARCH_X86_64
-%if HAVE_AVX2_EXTERNAL
-
-INIT_YMM avx2
-
-cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+%macro VVC_SAD 1
+cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
+%if UNIX64 == 0
+ mov block_hd, dword r5m
+%endif
movsxdifnidn dxq, dxd
movsxdifnidn dyq, dyd
@@ -74,29 +75,32 @@ cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, ro
lea src2q, [src2q + off2q * 2 + 2 * 2]
pxor m3, m3
+%if mmsize == 32
vpbroadcastd m4, [pw_1]
+%else
+ mova m4, [pw_1]
+%endif
.loop_height:
- movu xm0, [src1q]
- vinserti128 m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
- movu xm1, [src2q]
- vinserti128 m1, m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
-
+ movu m0, [src1q]
+ movu m1, [src2q]
MIN_MAX_SAD m1, m0, m2
pmaddwd m1, m4
paddd m3, m1
- add src1q, 2 * MAX_PB_SIZE * ROWS * 2
- add src2q, 2 * MAX_PB_SIZE * ROWS * 2
+ add src1q, ROWS * MAX_PB_SIZE * 2
+ add src2q, ROWS * MAX_PB_SIZE * 2
- sub block_hd, 4
+ sub block_hd, 2
jg .loop_height
HORIZ_ADD xm0, xm3, m3
movd eax, xm0
RET
+%endmacro
-cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
+%macro VVC_SAD_LOOP 1
+cglobal vvc_sad_%1, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
movsxdifnidn dxq, dxd
movsxdifnidn dyq, dyd
@@ -119,7 +123,11 @@ cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
lea src2q, [src2q + off2q * 2 + 2 * 2]
pxor m3, m3
+%if mmsize == 32
vpbroadcastd m4, [pw_1]
+%else
+ mova m4, [pw_1]
+%endif
shl block_wd, 1
add src1q, block_wq
@@ -149,6 +157,15 @@ DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
HORIZ_ADD xm0, xm3, m3
movd eax, xm0
RET
+%endmacro
+%if ARCH_X86_64
+INIT_XMM sse4
+VVC_SAD 8
+VVC_SAD_LOOP 16
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+VVC_SAD 16
+VVC_SAD_LOOP 32
%endif
%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index bd60963432..cdf0e36b62 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -316,16 +316,10 @@ ALF_FUNCS(16, 12, avx2)
int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
int dx, int dy, int block_w, int block_h) \
-SAD_PROTOTYPE(8, avx2);
+SAD_PROTOTYPE(8, sse4);
+SAD_PROTOTYPE(16, sse4);
SAD_PROTOTYPE(16, avx2);
-
-#define SAD_INIT(opt) do { \
- c->inter.sad[0] = ff_vvc_sad_8_##opt; \
- c->inter.sad[1] = \
- c->inter.sad[2] = \
- c->inter.sad[3] = \
- c->inter.sad[4] = ff_vvc_sad_16_##opt; \
-} while (0)
+SAD_PROTOTYPE(32, avx2);
#endif
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -337,36 +331,60 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
case 8:
if (EXTERNAL_SSE4(cpu_flags)) {
MC_LINK_SSE4(8);
+ c->inter.sad[0] = ff_vvc_sad_8_sse4;
+ c->inter.sad[1] =
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
ALF_INIT(8);
AVG_INIT(8, avx2);
MC_LINKS_AVX2(8);
- SAD_INIT(avx2);
+ c->inter.sad[1] = ff_vvc_sad_16_avx2;
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_32_avx2;
}
break;
case 10:
if (EXTERNAL_SSE4(cpu_flags)) {
MC_LINK_SSE4(10);
+ c->inter.sad[0] = ff_vvc_sad_8_sse4;
+ c->inter.sad[1] =
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
ALF_INIT(10);
AVG_INIT(10, avx2);
MC_LINKS_AVX2(10);
MC_LINKS_16BPC_AVX2(10);
- SAD_INIT(avx2);
+ c->inter.sad[1] = ff_vvc_sad_16_avx2;
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_32_avx2;
}
break;
case 12:
if (EXTERNAL_SSE4(cpu_flags)) {
MC_LINK_SSE4(12);
+ c->inter.sad[0] = ff_vvc_sad_8_sse4;
+ c->inter.sad[1] =
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
ALF_INIT(12);
AVG_INIT(12, avx2);
MC_LINKS_AVX2(12);
MC_LINKS_16BPC_AVX2(12);
- SAD_INIT(avx2);
+ c->inter.sad[1] = ff_vvc_sad_16_avx2;
+ c->inter.sad[2] =
+ c->inter.sad[3] =
+ c->inter.sad[4] = ff_vvc_sad_32_avx2;
}
break;
default:
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions
2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions James Almer
@ 2024-05-23 12:27 ` James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes James Almer
2024-05-23 12:35 ` [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width Andreas Rheinhardt
4 siblings, 0 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
To: ffmpeg-devel
This way they can be assembled on x86_32 targets.
Signed-off-by: James Almer <jamrial@gmail.com>
---
libavcodec/x86/vvc/vvc_sad.asm | 22 ++++++++++------------
libavcodec/x86/vvc/vvcdsp_init.c | 16 +++++++++++++---
2 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index 26df25ec66..9881b1180d 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -49,7 +49,7 @@ SECTION .text
%endmacro
%macro VVC_SAD 1
-cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
+cglobal vvc_sad_%1, 4, 6, 5, src1, src2, dx, dy, off, block_h
%if UNIX64 == 0
mov block_hd, dword r5m
%endif
@@ -59,12 +59,12 @@ cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
sub dxq, 2
sub dyq, 2
- mov off1q, 2
- mov off2q, 2
+ mov offq, 2
- add off1q, dyq
- sub off2q, dyq
+ sub offq, dyq
+ add dyq, 2
+DEFINE_ARGS src1, src2, dx, off1, off2, block_h
shl off1q, 7
shl off2q, 7
@@ -100,19 +100,19 @@ cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
%endmacro
%macro VVC_SAD_LOOP 1
-cglobal vvc_sad_%1, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
+cglobal vvc_sad_%1, 6, 7, 5, src1, src2, dx, dy, block_w, block_h, off
movsxdifnidn dxq, dxd
movsxdifnidn dyq, dyd
sub dxq, 2
sub dyq, 2
- mov off1q, 2
- mov off2q, 2
+ mov offq, 2
- add off1q, dyq
- sub off2q, dyq
+ sub offq, dyq
+ add dyq, 2
+DEFINE_ARGS src1, src2, dx, off1, block_w, block_h, off2
shl off1q, 7
shl off2q, 7
@@ -159,7 +159,6 @@ DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
RET
%endmacro
-%if ARCH_X86_64
INIT_XMM sse4
VVC_SAD 8
VVC_SAD_LOOP 16
@@ -168,4 +167,3 @@ INIT_YMM avx2
VVC_SAD 16
VVC_SAD_LOOP 32
%endif
-%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index cdf0e36b62..c0bd145191 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -311,6 +311,7 @@ ALF_FUNCS(16, 12, avx2)
c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2; \
c->alf.classify = ff_vvc_alf_classify_##bd##_avx2; \
} while (0)
+#endif
#define SAD_PROTOTYPE(w, opt) \
int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
@@ -320,17 +321,17 @@ SAD_PROTOTYPE(8, sse4);
SAD_PROTOTYPE(16, sse4);
SAD_PROTOTYPE(16, avx2);
SAD_PROTOTYPE(32, avx2);
-#endif
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
{
-#if ARCH_X86_64
const int cpu_flags = av_get_cpu_flags();
switch (bd) {
case 8:
if (EXTERNAL_SSE4(cpu_flags)) {
+#if ARCH_X86_64
MC_LINK_SSE4(8);
+#endif
c->inter.sad[0] = ff_vvc_sad_8_sse4;
c->inter.sad[1] =
c->inter.sad[2] =
@@ -338,9 +339,11 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if ARCH_X86_64
ALF_INIT(8);
AVG_INIT(8, avx2);
MC_LINKS_AVX2(8);
+#endif
c->inter.sad[1] = ff_vvc_sad_16_avx2;
c->inter.sad[2] =
c->inter.sad[3] =
@@ -349,7 +352,9 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
break;
case 10:
if (EXTERNAL_SSE4(cpu_flags)) {
+#if ARCH_X86_64
MC_LINK_SSE4(10);
+#endif
c->inter.sad[0] = ff_vvc_sad_8_sse4;
c->inter.sad[1] =
c->inter.sad[2] =
@@ -357,10 +362,12 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if ARCH_X86_64
ALF_INIT(10);
AVG_INIT(10, avx2);
MC_LINKS_AVX2(10);
MC_LINKS_16BPC_AVX2(10);
+#endif
c->inter.sad[1] = ff_vvc_sad_16_avx2;
c->inter.sad[2] =
c->inter.sad[3] =
@@ -369,7 +376,9 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
break;
case 12:
if (EXTERNAL_SSE4(cpu_flags)) {
+#if ARCH_X86_64
MC_LINK_SSE4(12);
+#endif
c->inter.sad[0] = ff_vvc_sad_8_sse4;
c->inter.sad[1] =
c->inter.sad[2] =
@@ -377,10 +386,12 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
c->inter.sad[4] = ff_vvc_sad_16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if ARCH_X86_64
ALF_INIT(12);
AVG_INIT(12, avx2);
MC_LINKS_AVX2(12);
MC_LINKS_16BPC_AVX2(12);
+#endif
c->inter.sad[1] = ff_vvc_sad_16_avx2;
c->inter.sad[2] =
c->inter.sad[3] =
@@ -390,5 +401,4 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
default:
break;
}
-#endif
}
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes
2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
` (2 preceding siblings ...)
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions James Almer
@ 2024-05-23 12:27 ` James Almer
2024-05-23 12:35 ` [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width Andreas Rheinhardt
4 siblings, 0 replies; 6+ messages in thread
From: James Almer @ 2024-05-23 12:27 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: James Almer <jamrial@gmail.com>
---
libavcodec/x86/vvc/vvc_sad.asm | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index 9881b1180d..14f7ce230e 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -81,7 +81,7 @@ DEFINE_ARGS src1, src2, dx, off1, off2, block_h
mova m4, [pw_1]
%endif
- .loop_height:
+ .loop_height:
movu m0, [src1q]
movu m1, [src2q]
MIN_MAX_SAD m1, m0, m2
@@ -94,8 +94,8 @@ DEFINE_ARGS src1, src2, dx, off1, off2, block_h
sub block_hd, 2
jg .loop_height
- HORIZ_ADD xm0, xm3, m3
- movd eax, xm0
+ HORIZ_ADD xm0, xm3, m3
+ movd eax, xm0
RET
%endmacro
@@ -129,13 +129,13 @@ DEFINE_ARGS src1, src2, dx, off1, block_w, block_h, off2
mova m4, [pw_1]
%endif
- shl block_wd, 1
- add src1q, block_wq
- add src2q, block_wq
- neg block_wq
+ shl block_wd, 1
+ add src1q, block_wq
+ add src2q, block_wq
+ neg block_wq
DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
- .loop_height:
+ .loop_height:
mov row_idxq, block_wq
.loop_width:
@@ -154,8 +154,8 @@ DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
sub block_hd, 2
jg .loop_height
- HORIZ_ADD xm0, xm3, m3
- movd eax, xm0
+ HORIZ_ADD xm0, xm3, m3
+ movd eax, xm0
RET
%endmacro
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width
2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
` (3 preceding siblings ...)
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes James Almer
@ 2024-05-23 12:35 ` Andreas Rheinhardt
4 siblings, 0 replies; 6+ messages in thread
From: Andreas Rheinhardt @ 2024-05-23 12:35 UTC (permalink / raw)
To: ffmpeg-devel
James Almer:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
The commit message should explain what the advantage of this is.
In particular, what is the advantage of selecting an appropriate function
in the generic code (even when these functions all turn out to be the
same, as is the case for the C version) over branching inside the
function based upon the block size?
> libavcodec/vvc/dsp.h | 2 +-
> libavcodec/vvc/inter.c | 6 ++++--
> libavcodec/vvc/inter_template.c | 6 +++++-
> libavcodec/x86/vvc/vvc_sad.asm | 32 ++++++++++++++++++++++++++------
> libavcodec/x86/vvc/vvcdsp_init.c | 22 +++++++++++++++++-----
> tests/checkasm/vvc_mc.c | 3 ++-
> 6 files changed, 55 insertions(+), 16 deletions(-)
>
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 1f14096c41..55c4c81f53 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -99,7 +99,7 @@ typedef struct VVCInterDSPContext {
>
> void (*apply_bdof)(uint8_t *dst, ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, int block_w, int block_h);
>
> - int (*sad)(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
> + int (*sad[5])(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
> void (*dmvr[2][2])(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int height,
> intptr_t mx, intptr_t my, int width);
> } VVCInterDSPContext;
> diff --git a/libavcodec/vvc/inter.c b/libavcodec/vvc/inter.c
> index e1011b4fa1..0214e46634 100644
> --- a/libavcodec/vvc/inter.c
> +++ b/libavcodec/vvc/inter.c
> @@ -740,6 +740,8 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
> const AVFrame *ref0, const AVFrame *ref1, const int x_off, const int y_off, const int block_w, const int block_h)
> {
> const VVCFrameContext *fc = lc->fc;
> + static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
> + const int tab = sad_tab[(FFALIGN(block_w, 8) >> 3) - 1];
> const int sr_range = 2;
> const AVFrame *ref[] = { ref0, ref1 };
> int16_t *tmp[] = { lc->tmp, lc->tmp1 };
> @@ -763,7 +765,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
> fc->vvcdsp.inter.dmvr[!!my][!!mx](tmp[i], src, src_stride, pred_h, mx, my, pred_w);
> }
>
> - min_sad = fc->vvcdsp.inter.sad(tmp[L0], tmp[L1], dx, dy, block_w, block_h);
> + min_sad = fc->vvcdsp.inter.sad[tab](tmp[L0], tmp[L1], dx, dy, block_w, block_h);
> min_sad -= min_sad >> 2;
> sad[dy][dx] = min_sad;
>
> @@ -773,7 +775,7 @@ static void dmvr_mv_refine(VVCLocalContext *lc, MvField *mvf, MvField *orig_mv,
> for (dy = 0; dy < SAD_ARRAY_SIZE; dy++) {
> for (dx = 0; dx < SAD_ARRAY_SIZE; dx++) {
> if (dx != sr_range || dy != sr_range) {
> - sad[dy][dx] = fc->vvcdsp.inter.sad(lc->tmp, lc->tmp1, dx, dy, block_w, block_h);
> + sad[dy][dx] = fc->vvcdsp.inter.sad[tab](lc->tmp, lc->tmp1, dx, dy, block_w, block_h);
> if (sad[dy][dx] < min_sad) {
> min_sad = sad[dy][dx];
> min_dx = dx;
> diff --git a/libavcodec/vvc/inter_template.c b/libavcodec/vvc/inter_template.c
> index a8068f4ba8..34485321d3 100644
> --- a/libavcodec/vvc/inter_template.c
> +++ b/libavcodec/vvc/inter_template.c
> @@ -626,7 +626,11 @@ static void FUNC(ff_vvc_inter_dsp_init)(VVCInterDSPContext *const inter)
> inter->apply_prof_uni_w = FUNC(apply_prof_uni_w);
> inter->apply_bdof = FUNC(apply_bdof);
> inter->prof_grad_filter = FUNC(prof_grad_filter);
> - inter->sad = vvc_sad;
> + inter->sad[0] =
> + inter->sad[1] =
> + inter->sad[2] =
> + inter->sad[3] =
> + inter->sad[4] = vvc_sad;
> }
>
> #undef FUNCS
> diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
> index b468d89ac2..a20818530f 100644
> --- a/libavcodec/x86/vvc/vvc_sad.asm
> +++ b/libavcodec/x86/vvc/vvc_sad.asm
> @@ -51,7 +51,7 @@ SECTION .text
>
> INIT_YMM avx2
>
> -cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
> +cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
> movsxdifnidn dxq, dxd
> movsxdifnidn dyq, dyd
>
> @@ -76,10 +76,6 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_
> pxor m3, m3
> vpbroadcastd m4, [pw_1]
>
> - cmp block_wd, 16
> - jge vvc_sad_16_128
> -
> - vvc_sad_8:
> .loop_height:
> movu xm0, [src1q]
> vinserti128 m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
> @@ -100,7 +96,31 @@ cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_
> movd eax, xm0
> RET
>
> - vvc_sad_16_128:
> +cglobal vvc_sad_16, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
> + movsxdifnidn dxq, dxd
> + movsxdifnidn dyq, dyd
> +
> + sub dxq, 2
> + sub dyq, 2
> +
> + mov off1q, 2
> + mov off2q, 2
> +
> + add off1q, dyq
> + sub off2q, dyq
> +
> + shl off1q, 7
> + shl off2q, 7
> +
> + add off1q, dxq
> + sub off2q, dxq
> +
> + lea src1q, [src1q + off1q * 2 + 2 * 2]
> + lea src2q, [src2q + off2q * 2 + 2 * 2]
> +
> + pxor m3, m3
> + vpbroadcastd m4, [pw_1]
> +
> sar block_wd, 4
> .loop_height:
> mov off1q, src1q
> diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
> index 4b4a2aa937..bd60963432 100644
> --- a/libavcodec/x86/vvc/vvcdsp_init.c
> +++ b/libavcodec/x86/vvc/vvcdsp_init.c
> @@ -312,8 +312,20 @@ ALF_FUNCS(16, 12, avx2)
> c->alf.classify = ff_vvc_alf_classify_##bd##_avx2; \
> } while (0)
>
> -int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
> -#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
> +#define SAD_PROTOTYPE(w, opt) \
> +int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
> + int dx, int dy, int block_w, int block_h) \
> +
> +SAD_PROTOTYPE(8, avx2);
> +SAD_PROTOTYPE(16, avx2);
> +
> +#define SAD_INIT(opt) do { \
> + c->inter.sad[0] = ff_vvc_sad_8_##opt; \
> + c->inter.sad[1] = \
> + c->inter.sad[2] = \
> + c->inter.sad[3] = \
> + c->inter.sad[4] = ff_vvc_sad_16_##opt; \
> +} while (0)
> #endif
>
> void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
> @@ -330,7 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
> ALF_INIT(8);
> AVG_INIT(8, avx2);
> MC_LINKS_AVX2(8);
> - SAD_INIT();
> + SAD_INIT(avx2);
> }
> break;
> case 10:
> @@ -342,7 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
> AVG_INIT(10, avx2);
> MC_LINKS_AVX2(10);
> MC_LINKS_16BPC_AVX2(10);
> - SAD_INIT();
> + SAD_INIT(avx2);
> }
> break;
> case 12:
> @@ -354,7 +366,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
> AVG_INIT(12, avx2);
> MC_LINKS_AVX2(12);
> MC_LINKS_16BPC_AVX2(12);
> - SAD_INIT();
> + SAD_INIT(avx2);
> }
> break;
> default:
> diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
> index 1e889e2cff..deae1014d2 100644
> --- a/tests/checkasm/vvc_mc.c
> +++ b/tests/checkasm/vvc_mc.c
> @@ -327,6 +327,7 @@ static void check_avg(void)
> static void check_vvc_sad(void)
> {
> const int bit_depth = 10;
> + static const uint8_t sad_tab[16] = { 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
> VVCDSPContext c;
> LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
> LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
> @@ -341,7 +342,7 @@ static void check_vvc_sad(void)
> for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
> for(int offy = 0; offy <= 4; offy++) {
> for(int offx = 0; offx <= 4; offx++) {
> - if(check_func(c.inter.sad, "sad_%dx%d", w, h)) {
> + if(check_func(c.inter.sad[sad_tab[(w >> 3) - 1]], "sad_%dx%d", w, h)) {
> int result0;
> int result1;
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-05-23 12:36 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-23 12:27 [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 2/5] x86/vvc_sad: optimize vvc_sad_16 James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 3/5] x86/vvc_sad: add sse4 versions of all functions James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 4/5] x86/vvc_sad: reduce gpr usage in all loop functions James Almer
2024-05-23 12:27 ` [FFmpeg-devel] [PATCH 5/5] x86/vvc_sad: reindent after the previous changes James Almer
2024-05-23 12:35 ` [FFmpeg-devel] [PATCH 1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width Andreas Rheinhardt
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git