From: Henrik Gramner via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> To: ffmpeg-devel@ffmpeg.org Cc: Henrik Gramner <code@ffmpeg.org> Subject: [FFmpeg-devel] [PATCH] vp9: Add 8bpc intra prediction AVX2 asm (PR #20386) Message-ID: <175672499026.25.13389439321243697441@463a07221176> (raw) PR #20386 opened by Henrik Gramner (gramner) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20386 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20386.patch A few of the most basic variants had existing AVX2 implementations since before. Those were rewritten to reduce code size. Checkasm numbers on Zen 5 (Strix Halo): ``` vp9_dc_32x32_8bpp_ssse3: 24.2 vp9_dc_32x32_8bpp_avx2: 10.3 vp9_dc_left_32x32_8bpp_ssse3: 23.6 vp9_dc_left_32x32_8bpp_avx2: 9.9 vp9_dc_top_32x32_8bpp_ssse3: 22.9 vp9_dc_top_32x32_8bpp_avx2: 10.0 vp9_diag_downleft_32x32_8bpp_avx: 28.5 vp9_diag_downleft_32x32_8bpp_avx2: 13.5 vp9_diag_downright_32x32_8bpp_avx: 35.0 vp9_diag_downright_32x32_8bpp_avx2: 17.0 vp9_hor_32x32_8bpp_avx: 22.3 vp9_hor_32x32_8bpp_avx2: 11.1 vp9_hor_down_32x32_8bpp_avx: 27.5 vp9_hor_down_32x32_8bpp_avx2: 19.8 vp9_hor_up_32x32_8bpp_avx: 26.0 vp9_hor_up_32x32_8bpp_avx2: 16.0 vp9_tm_32x32_8bpp_avx: 97.9 vp9_tm_32x32_8bpp_avx2: 23.6 vp9_vert_32x32_8bpp_sse: 20.8 vp9_vert_32x32_8bpp_avx2: 8.9 vp9_vert_left_32x32_8bpp_avx: 28.1 vp9_vert_left_32x32_8bpp_avx2: 15.2 vp9_vert_right_32x32_8bpp_avx: 32.0 vp9_vert_right_32x32_8bpp_avx2: 21.3 ``` >From ce6ff1b6229f2346e3caee18efbe36e794a94c6d Mon Sep 17 00:00:00 2001 From: Henrik Gramner <gramner@twoorioles.com> Date: Mon, 1 Sep 2025 02:03:00 +0200 Subject: [PATCH] vp9: Add 8bpc intra prediction AVX2 asm --- libavcodec/x86/vp9dsp_init.c | 13 +- libavcodec/x86/vp9intrapred.asm | 467 +++++++++++++++++++++----------- 2 files changed, 309 insertions(+), 171 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 9836b3321c..bbabcf38c3 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -207,11 
+207,8 @@ ipred_dir_tm_h_funcs(8, avx); ipred_dir_tm_h_funcs(16, avx); ipred_dir_tm_h_funcs(32, avx); -ipred_func(32, v, avx); - -ipred_dc_funcs(32, avx2); -ipred_func(32, h, avx2); -ipred_func(32, tm, avx2); +ipred_all_funcs(32, avx2); +ipred_func(32, v, avx2); #undef ipred_func #undef ipred_dir_tm_h_funcs @@ -388,7 +385,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) if (EXTERNAL_AVX_FAST(cpu_flags)) { init_fpel_func(1, 0, 32, put, , avx); init_fpel_func(0, 0, 64, put, , avx); - init_ipred(32, avx, v, VERT); } if (EXTERNAL_AVX2_FAST(cpu_flags)) { @@ -408,9 +404,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) init_subpel3_32_64(1, avg, 8, avx2); #endif } - init_dc_ipred(32, avx2); - init_ipred(32, avx2, h, HOR); - init_ipred(32, avx2, tm, TM_VP8); + init_all_ipred(32, avx2); + init_ipred(32, avx2, v, VERT); } #if ARCH_X86_64 diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm index 31f7d449fd..b67addd7e3 100644 --- a/libavcodec/x86/vp9intrapred.asm +++ b/libavcodec/x86/vp9intrapred.asm @@ -2,6 +2,7 @@ ;* VP9 Intra prediction SIMD optimizations ;* ;* Copyright (c) 2013 Ronald S. 
Bultje <rsbultje gmail com> +;* Copyright (c) 2025 Two Orioles, LLC ;* ;* Parts based on: ;* H.264 intra prediction asm optimizations @@ -230,40 +231,6 @@ DC_16to32_FUNCS INIT_XMM ssse3 DC_16to32_FUNCS -%if HAVE_AVX2_EXTERNAL -INIT_YMM avx2 -cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a - mova m0, [lq] - mova m1, [aq] - DEFINE_ARGS dst, stride, stride3, cnt - lea stride3q, [strideq*3] - pxor m2, m2 - psadbw m0, m2 - psadbw m1, m2 - paddw m0, m1 - vextracti128 xm1, m0, 1 - paddw xm0, xm1 - movhlps xm1, xm0 - paddw xm0, xm1 - pmulhrsw xm0, [pw_512] - vpbroadcastb m0, xm0 - mov cntd, 4 -.loop: - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec cntd - jg .loop - RET -%endif - ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) %macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l) @@ -395,44 +362,6 @@ INIT_XMM ssse3 DC_1D_16to32_FUNCS top, a DC_1D_16to32_FUNCS left, l -%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l) -%if HAVE_AVX2_EXTERNAL -cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a - mova m0, [%2q] - DEFINE_ARGS dst, stride, stride3, cnt - lea stride3q, [strideq*3] - pxor m2, m2 - psadbw m0, m2 - vextracti128 xm1, m0, 1 - paddw xm0, xm1 - movhlps xm1, xm0 - paddw xm0, xm1 - pmulhrsw xm0, [pw_1024] - vpbroadcastb m0, xm0 - mov cntd, 4 -.loop: - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec cntd - jg .loop - RET -%endif -%endmacro - -INIT_YMM avx2 -DC_1D_AVX2_FUNCS top, a -DC_1D_AVX2_FUNCS left, l - -; v - INIT_MMX mmx cglobal 
vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a movq m0, [aq] @@ -486,29 +415,6 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a jg .loop RET -INIT_YMM avx -cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a - mova m0, [aq] - DEFINE_ARGS dst, stride, stride3, cnt - lea stride3q, [strideq*3] - mov cntd, 4 -.loop: - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec cntd - jg .loop - RET - -; h - %macro H_XMM_FUNCS 2 %if notcpuflag(avx) cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3 @@ -642,34 +548,6 @@ H_XMM_FUNCS 4, 8 INIT_XMM avx H_XMM_FUNCS 4, 8 -%if HAVE_AVX2_EXTERNAL -INIT_YMM avx2 -cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt - mova m5, [pb_1] - mova m6, [pb_2] - mova m7, [pb_3] - pxor m4, m4 - lea stride3q, [strideq*3] - mov cntq, 7 -.loop: - movd xm3, [lq+cntq*4] - vinserti128 m3, m3, xm3, 1 - pshufb m0, m3, m7 - pshufb m1, m3, m6 - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m1 - pshufb m2, m3, m5 - pshufb m3, m4 - mova [dstq+strideq*2], m2 - mova [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - dec cntq - jge .loop - RET -%endif - -; tm - %macro TM_MMX_FUNCS 0 cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a pxor m1, m1 @@ -898,46 +776,9 @@ TM_XMM_FUNCS INIT_XMM avx TM_XMM_FUNCS -%if HAVE_AVX2_EXTERNAL -INIT_YMM avx2 -cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a - pxor m3, m3 - pinsrw xm2, [aq-1], 0 - vinserti128 m2, m2, xm2, 1 - mova m0, [aq] - DEFINE_ARGS dst, stride, l, cnt - mova m4, [pw_m256] - mova m5, [pw_m255] - pshufb m2, m4 - punpckhbw m1, m0, m3 - punpcklbw m0, m3 - psubw m1, m2 - psubw m0, m2 - mov cntq, 15 -.loop: - pinsrw xm7, [lq+cntq*2], 0 - vinserti128 m7, m7, xm7, 1 - pshufb m3, m7, m5 - pshufb m7, m4 - paddw m2, m3, m0 - paddw m3, m1 - paddw m6, 
m7, m0 - paddw m7, m1 - packuswb m2, m3 - packuswb m6, m7 - mova [dstq+strideq*0], m2 - mova [dstq+strideq*1], m6 - lea dstq, [dstq+strideq*2] - dec cntq - jge .loop - RET -%endif - -; dl - -%macro LOWPASS 4 ; left [dst], center, right, tmp +%macro LOWPASS 4-5 [pb_1] ; left [dst], center, right, tmp, pb_1 pxor m%4, m%1, m%3 - pand m%4, [pb_1] + pand m%4, %5 pavgb m%1, m%3 psubusb m%1, m%4 pavgb m%1, m%2 @@ -2041,4 +1882,306 @@ HU_XMM_FUNCS 7 INIT_XMM avx HU_XMM_FUNCS 7 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a + pxor m1, m1 + psadbw m0, m1, [lq] + psadbw m1, [aq] + movd xm2, [pw_512] + paddw m0, m1 + vextracti128 xm1, m0, 1 +.main: + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastb m0, xm0 +.main2: + lea r2, [strideq*3] + mov r3d, 8 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+r2 ], m0 + lea dstq, [dstq+strideq*4] + dec r3d + jg .loop + RET + +cglobal vp9_ipred_dc_top_32x32, 0, 4, 3, dst, stride, l, a + mov lq, amp +%if ARCH_X86_32 + jmp mangle(private_prefix %+ _vp9_ipred_dc_left_32x32 %+ SUFFIX).main +%endif + +%assign function_align 1 +cglobal vp9_ipred_dc_left_32x32, 0, 4, 3, dst, stride, l, a + movifnidn lq, lmp +.main: + movifnidn dstq, dstmp + movifnidn strideq, stridemp + pxor xm1, xm1 + psadbw xm0, xm1, [lq] + psadbw xm1, [lq+16] + movd xm2, [pw_1024] + jmp mangle(private_prefix %+ _vp9_ipred_dc_32x32 %+ SUFFIX).main + +cglobal vp9_ipred_v_32x32, 2, 4, 3, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + jmp mangle(private_prefix %+ _vp9_ipred_dc_32x32 %+ SUFFIX).main2 + +%assign function_align 16 +cglobal vp9_ipred_h_32x32, 3, 5, 6, dst, stride, l + vpbroadcastd m2, [pb_3] + mov r3d, 7 + vpbroadcastd m3, [pb_2] + pxor m5, m5 + vpbroadcastd m4, [pb_1] + lea r4, [strideq*3] +.loop: + vpbroadcastd m1, [lq+r3*4] + pshufb m0, m1, m2 + mova [dstq+strideq*0], m0 + pshufb m0, m1, m3 + mova 
[dstq+strideq*1], m0 + pshufb m0, m1, m4 + mova [dstq+strideq*2], m0 + pshufb m1, m5 + mova [dstq+r4 ], m1 + lea dstq, [dstq+strideq*4] + dec r3d + jge .loop + RET + +cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a + vpbroadcastd m0, [aq-1] + mova m7, [aq] + pxor m1, m1 + vpbroadcastd m4, [pw_m255] + mov r3d, 15 + vpbroadcastd m5, [pw_m256] + pshufb m0, m5 + punpcklbw m6, m7, m1 + punpckhbw m7, m1 + psubw m6, m0 + psubw m7, m0 +.loop: + vpbroadcastd m3, [lq+r3*2] + pshufb m2, m3, m4 + pshufb m3, m5 + paddw m0, m2, m6 + paddw m2, m7 + paddw m1, m3, m6 + paddw m3, m7 + packuswb m0, m2 + packuswb m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + dec r3d + jge .loop + RET + +cglobal vp9_ipred_dl_32x32, 2, 5, 6, dst, stride, l, a + movifnidn aq, amp + vpbroadcastb m2, [aq+31] + vinserti128 m3, m2, [aq+16], 0 + mova m0, [aq+ 0] + vpbroadcastd m5, [pb_1] + palignr m4, m3, m0, 2 + lea r3, [strideq*2] + palignr m3, m0, 1 + LOWPASS 0, 3, 4, 1, m5 + lea r4, [strideq*3] + vperm2i128 m1, m0, m2, 0x31 + mov r2d, 8 +.loop: + shufpd m3, m0, m1, 0x05 + mova [dstq+r3*0], m0 + punpckhqdq m4, m1, m2 + mova [dstq+r3*4], m3 + palignr m0, m1, m0, 1 + mova [dstq+r3*8], m1 + palignr m1, m2, m1, 1 + mova [dstq+r4*8], m4 + add dstq, strideq + dec r2d + jg .loop + RET + +cglobal vp9_ipred_dr_32x32, 4, 5, 7, dst, stride, l, a + mova m3, [lq+ 0] + movu m1, [aq- 1] + mova m0, [aq+ 0] + vpbroadcastd m6, [pb_1] + vperm2i128 m2, m3, m1, 0x21 + lea r3, [strideq*2] + palignr m4, m1, m2, 15 + LOWPASS 0, 1, 4, 5, m6 + pslldq xm4, xm3, 1 + palignr m2, m3, 1 + vinserti128 m4, [lq+15], 1 + LOWPASS 2, 3, 4, 5, m6 + lea r4, [strideq*3] + vperm2i128 m1, m2, m0, 0x21 + mov r2d, 8 +.loop: + shufpd m3, m1, m0, 0x05 + mova [dstq+r3*0], m0 + shufpd m4, m2, m1, 0x05 + mova [dstq+r3*4], m3 + palignr m0, m1, 15 + mova [dstq+r3*8], m1 + palignr m1, m2, 15 + mova [dstq+r4*8], m4 + add dstq, strideq + pslldq m2, 1 + dec r2d + jg .loop + RET + +cglobal 
vp9_ipred_hd_32x32, 4, 6, 7, dst, stride, l, a + movu m1, [aq-1] + mova m0, [lq] + vpbroadcastd m6, [pb_1] + vperm2i128 m4, m0, m1, 0x21 + palignr m3, m4, m0, 1 + palignr m4, m0, 2 + LOWPASS 4, 3, 0, 2, m6 + pavgb m3, m0 + movu xm0, [aq+15] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + palignr m4, m0, m1, 2 + palignr m0, m1, 1 + LOWPASS 4, 0, 1, 5, m6 + lea r2, [strideq*8] + vinserti128 m0, m2, xm3, 1 + lea r3, [dstq+r2*1] + vpblendd m1, m2, m3, 0x0f + lea r4, [dstq+r2*2] + vperm2i128 m2, m3, 0x31 + lea r5, [r3 +r2*2] + vperm2i128 m3, m4, 0x21 +.loop: + sub r2, strideq + mova [r5 +r2], m0 + palignr m0, m1, m0, 2 + mova [r4 +r2], m1 + palignr m1, m2, m1, 2 + mova [r3 +r2], m2 + palignr m2, m3, m2, 2 + mova [dstq+r2], m3 + palignr m3, m4, m3, 2 + psrldq m4, 2 + jg .loop + RET + +cglobal vp9_ipred_hu_32x32, 3, 5, 6, dst, stride, l, a + mova m0, [lq] + vpbroadcastb xm3, [lq+31] + vpbroadcastd m1, [pb_1] + vbroadcasti128 m4, [pb_2toE_3xF] + vperm2i128 m3, m0, 0x03 + palignr m5, m3, m0, 2 + palignr m3, m0, 1 + LOWPASS 5, 3, 0, 2, m1 + vpbroadcastd m1, [pb_15] + pavgb m3, m0 + punpcklbw m2, m3, m5 + punpckhbw m3, m5 + vinserti128 m0, m2, xm3, 1 + pshufb m5, m1 + vperm2i128 m1, m2, m3, 0x12 + lea r3, [strideq*2] + vperm2i128 m2, m3, 0x31 + lea r4, [strideq*3] + vperm2i128 m3, m5, 0x31 + mov r2d, 8 +.loop: + mova [dstq+r3*0], m0 + palignr m0, m1, m0, 2 + mova [dstq+r3*4], m1 + palignr m1, m2, m1, 2 + mova [dstq+r3*8], m2 + palignr m2, m3, m2, 2 + mova [dstq+r4*8], m3 + pshufb m3, m4 + add dstq, strideq + dec r2d + jg .loop + RET + +cglobal vp9_ipred_vl_32x32, 2, 5, 6, dst, stride, l, a + movifnidn aq, amp + vpbroadcastb m4, [aq+31] + vinserti128 m0, m4, [aq+16], 0 + mova m1, [aq+ 0] + vpbroadcastd m5, [pb_1] + palignr m2, m0, m1, 2 + palignr m0, m1, 1 + LOWPASS 2, 0, 1, 3, m5 + pavgb m0, m1 + lea r3, [strideq*2] + vperm2i128 m1, m0, m4, 0x31 + lea r4, [strideq+r3*8] + vperm2i128 m3, m2, m4, 0x31 + mov r2d, 8 +.loop: + shufpd m4, m0, m1, 0x05 + mova [dstq+strideq*0], m0 + 
shufpd m5, m2, m3, 0x05 + mova [dstq+strideq*1], m2 + palignr m0, m1, m0, 1 + mova [dstq+r3*8 ], m4 + psrldq m1, 1 + mova [dstq+r4 ], m5 + palignr m2, m3, m2, 1 + add dstq, r3 + psrldq m3, 1 + dec r2d + jg .loop + RET + +cglobal vp9_ipred_vr_32x32, 4, 5, 7, dst, stride, l, a + mova m4, [lq+ 0] + movu m0, [aq- 1] + vpbroadcastd m6, [pb_1] + vperm2i128 m2, m4, m0, 0x21 + pslldq xm5, xm4, 1 + palignr m3, m2, m4, 1 + vinserti128 m5, [lq+15], 1 + LOWPASS 3, 4, 5, 1, m6 + mova m1, [aq+ 0] + vbroadcasti128 m4, [pb_02468ACE_13579BDF] + palignr m2, m0, m2, 15 + LOWPASS 2, 0, 1, 5, m6 + pshufb m3, m4 + lea r3, [strideq*2] + vpermq m3, m3, q2031 + pavgb m0, m1 + vinserti128 m1, m3, xm0, 1 + lea r4, [strideq+r3*8] + vperm2i128 m3, m2, 0x21 + mov r2d, 8 +.loop: + shufpd m4, m1, m0, 0x05 + mova [dstq+strideq*0], m0 + shufpd m5, m3, m2, 0x05 + mova [dstq+strideq*1], m2 + palignr m0, m1, 15 + mova [dstq+r3*8 ], m4 + pslldq m1, 1 + mova [dstq+r4 ], m5 + palignr m2, m3, 15 + add dstq, r3 + pslldq m3, 1 + dec r2d + jg .loop + RET +%endif + ; FIXME 127, 128, 129 ? -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-09-01 11:10 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=175672499026.25.13389439321243697441@463a07221176 \
    --to=ffmpeg-devel@ffmpeg.org \
    --cc=code@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.

AGPL code for this site: git clone https://public-inbox.org/public-inbox.git