* [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
@ 2024-05-14 20:40 Stone Chen
2024-05-14 20:40 ` [FFmpeg-devel] [PATCH v3 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
` (3 more replies)
0 siblings, 4 replies; 7+ messages in thread
From: Stone Chen @ 2024-05-14 20:40 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Stone Chen
Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bitdepths, but the values passed to the function are always 16bit (even if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.
Benchmarks ( AMD 7940HS )
Before:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 80.7 |
Chimera_8bit_1080P_1000_frames.vvc | 158.0 |
NovosobornayaSquare_1920x1080.bin | 159.7 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 146.3 |
After:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 82.7 |
Chimera_8bit_1080P_1000_frames.vvc | 167.0 |
NovosobornayaSquare_1920x1080.bin | 166.3 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 154.0 |
---
libavcodec/x86/vvc/Makefile | 3 +-
libavcodec/x86/vvc/vvc_sad.asm | 157 +++++++++++++++++++++++++++++++
libavcodec/x86/vvc/vvcdsp_init.c | 6 ++
3 files changed, 165 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/x86/vvc/vvc_sad.asm
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o \
x86/h26x/h2656dsp.o
X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvc_alf.o \
x86/vvc/vvc_mc.o \
- x86/h26x/h2656_inter.o
+ x86/vvc/vvc_sad.o \
+ x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 0000000000..530142ad35
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,157 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 128
+%define ROWS 2 ; DMVR SAD is only calculated on even rows to reduce complexity
+
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; %1 = scratch (clobbered), %2 = a, %3 = b; on exit %3 = |a - b| per unsigned word
+ vpminuw %1, %2, %3 ; %1 = min(a, b), words compared as unsigned
+ vpmaxuw %3, %2, %3 ; %3 = max(a, b), words compared as unsigned
+ vpsubusw %3, %3, %1 ; %3 = max - min; NOTE(review): assumes samples are non-negative although the C prototype is int16_t - confirm
+%endmacro
+
+%macro HORIZ_ADD 3 ; %1 = xmm result, %2 = xmm alias of the accumulator (clobbered), %3 = ymm accumulator of 8 dwords
+ vextracti128 %1, %3, q0001 ; %1 = upper 128-bit lane of %3 (dwords 7 6 5 4)
+ vpaddd %1, %2 ; %1 = (7 + 3) (6 + 2) (5 + 1) (4 + 0); relies on %2 being the low lane of %3, as at both call sites
+ vpshufd %2, %1, q0032 ; %2 = - - (7 + 3) (6 + 2)
+ vpaddd %1, %1, %2 ; %1 = _ _ (5 1 7 3) (4 0 6 2)
+ vpshufd %2, %1, q0001 ; %2 = _ _ _ (5 1 7 3)
+ vpaddd %1, %1, %2 ; low dword of %1 = sum of all eight dwords (0..7); upper dwords are don't-care
+%endmacro
+
+%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2 (dxq, dyq, off1, off2 and flags are clobbered)
+    sub                %3, 2                    ; dx -= 2
+    sub                %4, 2                    ; dy -= 2
+    ; off1 = (2 + dy) * MAX_PB_SIZE + 2 + dx, off2 = (2 - dy) * MAX_PB_SIZE + 2 - dx (both in samples)
+    mov                %5, 2
+    mov                %6, 2
+
+    add                %5, %4
+    sub                %6, %4
+
+    imul               %5, MAX_PB_SIZE          ; rows -> samples (row stride is MAX_PB_SIZE samples)
+    imul               %6, MAX_PB_SIZE
+
+    add                %5, 2
+    add                %6, 2
+
+    add                %5, %3
+    sub                %6, %3
+    ; the offsets are scaled by 2 because each sample is a 16-bit word
+    lea                %1, [%1 + %5 * 2]
+    lea                %2, [%2 + %6 * 2]
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+; C prototype (vvcdsp_init.c): int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h)
+; Accumulates MIN_MAX_SAD of the two sources over every second row (ROWS) of the block in m3 and returns the total in eax.
+cglobal vvc_sad, 6, 11, 14, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx, dx2, dy2
+    movsxd           dx2q, dxd                  ; sign-extend dx/dy for 64-bit address arithmetic
+    movsxd           dy2q, dyd
+    INIT_OFFSET     src1q, src2q, dx2q, dy2q, off1q, off2q
+    pxor               m3, m3                   ; m3 = eight dword partial sums
+
+    cmp          block_wd, 16
+    jge    vvc_sad_16_128
+
+    ; block_w < 16: one 8-sample xmm load per row, four even rows (8 block rows) per iteration
+    vvc_sad_8:
+        .loop_height:
+        movu              xm0, [src1q]
+        movu              xm1, [src2q]
+        MIN_MAX_SAD       xm2, xm0, xm1         ; xm1 = |src1 - src2|, 8 words
+        vpmovzxwd          m1, xm1              ; widen to dwords before accumulating
+        vpaddd             m3, m1
+
+        movu              xm5, [src1q + MAX_PB_SIZE * ROWS * 2]
+        movu              xm6, [src2q + MAX_PB_SIZE * ROWS * 2]
+        MIN_MAX_SAD       xm7, xm5, xm6
+        vpmovzxwd          m6, xm6
+        vpaddd             m3, m6
+
+        movu              xm8, [src1q + MAX_PB_SIZE * 2 * ROWS * 2] ; xm8 is fully overwritten by this load
+        movu              xm9, [src2q + MAX_PB_SIZE * 2 * ROWS * 2]
+        MIN_MAX_SAD      xm10, xm8, xm9
+        vpmovzxwd          m9, xm9
+        vpaddd             m3, m9
+
+        movu             xm11, [src1q + MAX_PB_SIZE * 3 * ROWS * 2]
+        movu             xm12, [src2q + MAX_PB_SIZE * 3 * ROWS * 2]
+        MIN_MAX_SAD      xm13, xm11, xm12
+        vpmovzxwd         m12, xm12
+        vpaddd             m3, m12
+
+        add             src1q, MAX_PB_SIZE * 4 * ROWS * 2 ; advance 8 block rows (bytes)
+        add             src2q, MAX_PB_SIZE * 4 * ROWS * 2
+
+        sub          block_hd, 8
+        jg       .loop_height
+
+        HORIZ_ADD         xm0, xm3, m3          ; xm0[0] = sum of the eight partial sums
+        movd              eax, xm0
+        RET
+
+    ; block_w >= 16: walk each even row in 16-sample steps (two xmm loads), block_w / 16 steps per row
+    vvc_sad_16_128:
+        .loop_height:
+        mov             off1q, src1q            ; off1/off2 now remember the start of the current row
+        mov             off2q, src2q
+        mov          row_idxd, block_wd
+        sar          row_idxd, 4                ; row_idx = block_w / 16
+
+        .loop_width:
+            movu              xm0, [src1q]
+            movu              xm1, [src2q]
+            MIN_MAX_SAD       xm2, xm0, xm1
+            vpmovzxwd          m1, xm1
+            vpaddd             m3, m1
+
+            movu              xm5, [src1q + 16] ; next 8 samples (16 bytes)
+            movu              xm6, [src2q + 16]
+            MIN_MAX_SAD       xm7, xm5, xm6
+            vpmovzxwd          m6, xm6
+            vpaddd             m3, m6
+
+            add             src1q, 32           ; 16 samples consumed
+            add             src2q, 32
+            dec          row_idxd
+            jg        .loop_width
+
+        lea             src1q, [off1q + ROWS * MAX_PB_SIZE * 2] ; row start + 2 rows (bytes)
+        lea             src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+
+        sub          block_hd, 2
+        jg       .loop_height
+
+        HORIZ_ADD         xm0, xm3, m3
+        movd              eax, xm0
+        RET
+
+%endif
+%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 0e68971b2c..4b4a2aa937 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -311,6 +311,9 @@ ALF_FUNCS(16, 12, avx2)
c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2; \
c->alf.classify = ff_vvc_alf_classify_##bd##_avx2; \
} while (0)
+
+int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); /* presumably the symbol emitted by "cglobal vvc_sad" under INIT_YMM avx2 in vvc_sad.asm - confirm */
+#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2 /* expands to a bare assignment; use it as a full statement: SAD_INIT(); */
#endif
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -327,6 +330,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
ALF_INIT(8);
AVG_INIT(8, avx2);
MC_LINKS_AVX2(8);
+ SAD_INIT();
}
break;
case 10:
@@ -338,6 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
AVG_INIT(10, avx2);
MC_LINKS_AVX2(10);
MC_LINKS_16BPC_AVX2(10);
+ SAD_INIT();
}
break;
case 12:
@@ -349,6 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
AVG_INIT(12, avx2);
MC_LINKS_AVX2(12);
MC_LINKS_16BPC_AVX2(12);
+ SAD_INIT();
}
break;
default:
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] [PATCH v3 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c
2024-05-14 20:40 [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
@ 2024-05-14 20:40 ` Stone Chen
2024-05-18 10:50 ` [FFmpeg-devel] 回复: [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Wu Jianhua
` (2 subsequent siblings)
3 siblings, 0 replies; 7+ messages in thread
From: Stone Chen @ 2024-05-14 20:40 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Stone Chen
Adds checkasm for DMVR SAD AVX2 implementation.
Benchmarks ( AMD 7940HS )
vvc_sad_8x8_c: 63.0
vvc_sad_8x8_avx2: 3.0
vvc_sad_16x16_c: 263.0
vvc_sad_16x16_avx2: 23.0
vvc_sad_32x32_c: 1003.0
vvc_sad_32x32_avx2: 83.0
vvc_sad_64x64_c: 3923.0
vvc_sad_64x64_avx2: 373.0
vvc_sad_128x128_c: 17533.0
vvc_sad_128x128_avx2: 1683.0
---
tests/checkasm/vvc_mc.c | 38 ++++++++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 97f57cb401..e251400bfc 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -322,8 +322,46 @@ static void check_avg(void)
report("avg");
}
+/* Compare call_ref() and call_new() results of c.inter.sad for all power-of-two
+ * block sizes from 8x8 up to MAX_CTU_SIZE and every (offx, offy) in 0..4. */
+static void check_vvc_sad(void)
+{
+    const int bit_depth = 10;
+    VVCDSPContext c;
+    LOCAL_ALIGNED_32(uint16_t, src0, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+    LOCAL_ALIGNED_32(uint16_t, src1, [MAX_CTU_SIZE * MAX_CTU_SIZE * 4]);
+    declare_func(int, const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+
+    ff_vvc_dsp_init(&c, bit_depth);
+    /* memset() takes bytes: scale by the element size so the part randomize_pixels() skips is zeroed too */
+    memset(src0, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4 * sizeof(*src0));
+    memset(src1, 0, MAX_CTU_SIZE * MAX_CTU_SIZE * 4 * sizeof(*src1));
+    randomize_pixels(src0, src1, MAX_CTU_SIZE * MAX_CTU_SIZE * 2);
+    for (int h = 8; h <= MAX_CTU_SIZE; h *= 2) {
+        for (int w = 8; w <= MAX_CTU_SIZE; w *= 2) {
+            for (int offy = 0; offy <= 4; offy++) {
+                for (int offx = 0; offx <= 4; offx++) {
+                    if (check_func(c.inter.sad, "vvc_sad_%dx%d", w, h)) {
+                        int result0, result1;
+
+                        result0 = call_ref(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+                        result1 = call_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+                        if (result1 != result0)
+                            fail();
+                        if (w == h && offx == 0 && offy == 0)
+                            bench_new(src0 + PIXEL_STRIDE * 2 + 2, src1 + PIXEL_STRIDE * 2 + 2, offx, offy, w, h);
+                    }
+                }
+            }
+        }
+    }
+
+    report("check_vvc_sad");
+}
+
void checkasm_check_vvc_mc(void)
{
+ check_vvc_sad();
check_put_vvc_luma();
check_put_vvc_luma_uni();
check_put_vvc_chroma();
--
2.45.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] 回复: [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
2024-05-14 20:40 [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
2024-05-14 20:40 ` [FFmpeg-devel] [PATCH v3 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
@ 2024-05-18 10:50 ` Wu Jianhua
2024-05-18 13:04 ` [FFmpeg-devel] " Ronald S. Bultje
2024-05-18 15:33 ` Ronald S. Bultje
3 siblings, 0 replies; 7+ messages in thread
From: Wu Jianhua @ 2024-05-18 10:50 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Stone Chen
> 发件人: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> 代表 Stone Chen <chen.stonechen@gmail.com>
> 发送时间: 2024年5月14日 13:40
> 收件人: ffmpeg-devel@ffmpeg.org
> 抄送: Stone Chen
> 主题: [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
>
> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bitdepths, but the values passed to the function are always 16bit (even if the original video bitdepth is 8). The AVX2 implementation uses min/max/sub.
>
> Benchmarks ( AMD 7940HS )
> Before:
> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 80.7 |
> Chimera_8bit_1080P_1000_frames.vvc | 158.0 |
> NovosobornayaSquare_1920x1080.bin | 159.7 |
> RitualDance_1920x1080_60_10_420_37_RA.266 | 146.3 |
>
> After:
> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 82.7 |
> Chimera_8bit_1080P_1000_frames.vvc | 167.0 |
> NovosobornayaSquare_1920x1080.bin | 166.3 |
> RitualDance_1920x1080_60_10_420_37_RA.266 | 154.0 |
> ---
LGTM. Thanks for your efforts.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
2024-05-14 20:40 [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
2024-05-14 20:40 ` [FFmpeg-devel] [PATCH v3 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
2024-05-18 10:50 ` [FFmpeg-devel] 回复: [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Wu Jianhua
@ 2024-05-18 13:04 ` Ronald S. Bultje
2024-05-18 13:12 ` Stone Chen
2024-05-18 15:33 ` Ronald S. Bultje
3 siblings, 1 reply; 7+ messages in thread
From: Ronald S. Bultje @ 2024-05-18 13:04 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Stone Chen
Hi,
On Tue, May 14, 2024 at 4:40 PM Stone Chen <chen.stonechen@gmail.com> wrote:
> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD
> functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128.
> To reduce complexity, SAD is only calculated on even rows. This is
> calculated for all video bitdepths, but the values passed to the function
> are always 16bit (even if the original video bitdepth is 8). The AVX2
> implementation uses min/max/sub.
>
> Benchmarks ( AMD 7940HS )
> Before:
> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 80.7 |
> Chimera_8bit_1080P_1000_frames.vvc | 158.0 |
> NovosobornayaSquare_1920x1080.bin | 159.7 |
> RitualDance_1920x1080_60_10_420_37_RA.266 | 146.3 |
>
> After:
> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 82.7 |
> Chimera_8bit_1080P_1000_frames.vvc | 167.0 |
> NovosobornayaSquare_1920x1080.bin | 166.3 |
> RitualDance_1920x1080_60_10_420_37_RA.266 | 154.0 |
>
I assume these are FPS benchmarks? Can you provide checkasm --bench output
for these functions also?
Ronald
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
2024-05-18 13:04 ` [FFmpeg-devel] " Ronald S. Bultje
@ 2024-05-18 13:12 ` Stone Chen
0 siblings, 0 replies; 7+ messages in thread
From: Stone Chen @ 2024-05-18 13:12 UTC (permalink / raw)
To: Ronald S. Bultje; +Cc: FFmpeg development discussions and patches
On Sat, May 18, 2024 at 9:04 AM Ronald S. Bultje <rsbultje@gmail.com> wrote:
> Hi,
>
> On Tue, May 14, 2024 at 4:40 PM Stone Chen <chen.stonechen@gmail.com>
> wrote:
>
>> Implements AVX2 DMVR (decoder-side motion vector refinement) SAD
>> functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128.
>> To reduce complexity, SAD is only calculated on even rows. This is
>> calculated for all video bitdepths, but the values passed to the function
>> are always 16bit (even if the original video bitdepth is 8). The AVX2
>> implementation uses min/max/sub.
>>
>> Benchmarks ( AMD 7940HS )
>> Before:
>> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 80.7 |
>> Chimera_8bit_1080P_1000_frames.vvc | 158.0 |
>> NovosobornayaSquare_1920x1080.bin | 159.7 |
>> RitualDance_1920x1080_60_10_420_37_RA.266 | 146.3 |
>>
>> After:
>> BQTerrace_1920x1080_60_10_420_22_RA.vvc | 82.7 |
>> Chimera_8bit_1080P_1000_frames.vvc | 167.0 |
>> NovosobornayaSquare_1920x1080.bin | 166.3 |
>> RitualDance_1920x1080_60_10_420_37_RA.266 | 154.0 |
>>
>
> I assume these are FPS benchmarks? Can you provide checkasm --bench output
> for these functions also?
>
> Ronald
>
Hi Ronald,
Correct, those are FPS benchmarks. There's a separate patch that has the
checkasm --bench (Add check_vvc_sad to vvc_mc.c), in the commit message.
I've copy-pasted the benchmark snippet below:
> vvc_sad_8x8_c: 63.0
> vvc_sad_8x8_avx2: 3.0
> vvc_sad_16x16_c: 263.0
> vvc_sad_16x16_avx2: 23.0
> vvc_sad_32x32_c: 1003.0
> vvc_sad_32x32_avx2: 83.0
> vvc_sad_64x64_c: 3923.0
> vvc_sad_64x64_avx2: 373.0
> vvc_sad_128x128_c: 17533.0
> vvc_sad_128x128_avx2: 1683.0
Also your blogpost was very helpful for getting started with asm!
Cheers,
Stone
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
2024-05-14 20:40 [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
` (2 preceding siblings ...)
2024-05-18 13:04 ` [FFmpeg-devel] " Ronald S. Bultje
@ 2024-05-18 15:33 ` Ronald S. Bultje
2024-05-19 14:24 ` Stone Chen
3 siblings, 1 reply; 7+ messages in thread
From: Ronald S. Bultje @ 2024-05-18 15:33 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Stone Chen
Hi,
On Tue, May 14, 2024 at 4:40 PM Stone Chen <chen.stonechen@gmail.com> wrote:
> + vvc_sad_8:
> + .loop_height:
> + movu xm0, [src1q]
> + movu xm1, [src2q]
> + MIN_MAX_SAD xm2, xm0, xm1
> + vpmovzxwd m1, xm1
> + vpaddd m3, m1
>
[..]
> + vvc_sad_16_128:
> + .loop_height:
>
[..]
> + .loop_width:
> + movu xm0, [src1q]
> + movu xm1, [src2q]
> + MIN_MAX_SAD xm2, xm0, xm1
> + vpmovzxwd m1, xm1
> + vpaddd m3, m1
>
Wouldn't it be more efficient if the main loops did a full register worth
at a time?
vpbroadcastd m4, [pw_1]
loop:
movu m0, [src1q]
movu m1, [src2q]
MIN_MAX_SAD m2, m0, m1
pmaddwd m1, m4
paddd m3, m1
(And then for w8, load 2 rows per iteration using movu xmN, [row0] and
vinserti128 mN, [row1], 1.)
Ronald
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
2024-05-18 15:33 ` Ronald S. Bultje
@ 2024-05-19 14:24 ` Stone Chen
0 siblings, 0 replies; 7+ messages in thread
From: Stone Chen @ 2024-05-19 14:24 UTC (permalink / raw)
To: Ronald S. Bultje; +Cc: FFmpeg development discussions and patches
On Sat, May 18, 2024 at 11:33 AM Ronald S. Bultje <rsbultje@gmail.com>
wrote:
> Hi,
>
> On Tue, May 14, 2024 at 4:40 PM Stone Chen <chen.stonechen@gmail.com>
> wrote:
>
>> + vvc_sad_8:
>> + .loop_height:
>> + movu xm0, [src1q]
>> + movu xm1, [src2q]
>> + MIN_MAX_SAD xm2, xm0, xm1
>> + vpmovzxwd m1, xm1
>> + vpaddd m3, m1
>>
> [..]
>
>> + vvc_sad_16_128:
>> + .loop_height:
>>
> [..]
>
>> + .loop_width:
>> + movu xm0, [src1q]
>> + movu xm1, [src2q]
>> + MIN_MAX_SAD xm2, xm0, xm1
>> + vpmovzxwd m1, xm1
>> + vpaddd m3, m1
>>
>
Hi Ronald,
> Wouldn't it be more efficient if the main loops did a full register worth
> at a time?
>
> vpbroadcastd m4, [pw_1]
> loop:
> movu m0, [src1q]
> movu m1, [src2q]
> MIN_MAX_SAD m2, m0, m1
> pmaddwd m1, m4
> paddd m3, m1
>
> (And then for w8, load 2 rows per iteration using movu xmN, [row0] and
> vinserti128 mN, [row1], 1.)
>
> Ronald
>
Thank you, I didn't know about the pmaddwd instruction, using it is
definitely more efficient!
Stone
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2024-05-19 14:24 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-14 20:40 [FFmpeg-devel] [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Stone Chen
2024-05-14 20:40 ` [FFmpeg-devel] [PATCH v3 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
2024-05-18 10:50 ` [FFmpeg-devel] 回复: [PATCH v3 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC Wu Jianhua
2024-05-18 13:04 ` [FFmpeg-devel] " Ronald S. Bultje
2024-05-18 13:12 ` Stone Chen
2024-05-18 15:33 ` Ronald S. Bultje
2024-05-19 14:24 ` Stone Chen
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git