Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] vp9: Add 8bpc intra prediction AVX2 asm (PR #20386)
@ 2025-09-01 11:10 Henrik Gramner via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: Henrik Gramner via ffmpeg-devel @ 2025-09-01 11:10 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Henrik Gramner

PR #20386 opened by Henrik Gramner (gramner)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20386
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20386.patch

A few of the most basic variants already had AVX2 implementations; those were rewritten to reduce code size.

Checkasm numbers on Zen 5 (Strix Halo):
```
vp9_dc_32x32_8bpp_ssse3:             24.2
vp9_dc_32x32_8bpp_avx2:              10.3

vp9_dc_left_32x32_8bpp_ssse3:        23.6
vp9_dc_left_32x32_8bpp_avx2:          9.9

vp9_dc_top_32x32_8bpp_ssse3:         22.9
vp9_dc_top_32x32_8bpp_avx2:          10.0

vp9_diag_downleft_32x32_8bpp_avx:    28.5
vp9_diag_downleft_32x32_8bpp_avx2:   13.5

vp9_diag_downright_32x32_8bpp_avx:   35.0
vp9_diag_downright_32x32_8bpp_avx2:  17.0

vp9_hor_32x32_8bpp_avx:              22.3
vp9_hor_32x32_8bpp_avx2:             11.1

vp9_hor_down_32x32_8bpp_avx:         27.5
vp9_hor_down_32x32_8bpp_avx2:        19.8

vp9_hor_up_32x32_8bpp_avx:           26.0
vp9_hor_up_32x32_8bpp_avx2:          16.0

vp9_tm_32x32_8bpp_avx:               97.9
vp9_tm_32x32_8bpp_avx2:              23.6

vp9_vert_32x32_8bpp_sse:             20.8
vp9_vert_32x32_8bpp_avx2:             8.9

vp9_vert_left_32x32_8bpp_avx:        28.1
vp9_vert_left_32x32_8bpp_avx2:       15.2

vp9_vert_right_32x32_8bpp_avx:       32.0
vp9_vert_right_32x32_8bpp_avx2:      21.3
```



From ce6ff1b6229f2346e3caee18efbe36e794a94c6d Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Mon, 1 Sep 2025 02:03:00 +0200
Subject: [PATCH] vp9: Add 8bpc intra prediction AVX2 asm

---
 libavcodec/x86/vp9dsp_init.c    |  13 +-
 libavcodec/x86/vp9intrapred.asm | 467 +++++++++++++++++++++-----------
 2 files changed, 309 insertions(+), 171 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 9836b3321c..bbabcf38c3 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -207,11 +207,8 @@ ipred_dir_tm_h_funcs(8, avx);
 ipred_dir_tm_h_funcs(16, avx);
 ipred_dir_tm_h_funcs(32, avx);
 
-ipred_func(32, v, avx);
-
-ipred_dc_funcs(32, avx2);
-ipred_func(32, h, avx2);
-ipred_func(32, tm, avx2);
+ipred_all_funcs(32, avx2);
+ipred_func(32, v, avx2);
 
 #undef ipred_func
 #undef ipred_dir_tm_h_funcs
@@ -388,7 +385,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         init_fpel_func(1, 0, 32, put, , avx);
         init_fpel_func(0, 0, 64, put, , avx);
-        init_ipred(32, avx, v, VERT);
     }
 
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
@@ -408,9 +404,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
             init_subpel3_32_64(1, avg, 8, avx2);
 #endif
         }
-        init_dc_ipred(32, avx2);
-        init_ipred(32, avx2, h,  HOR);
-        init_ipred(32, avx2, tm, TM_VP8);
+        init_all_ipred(32, avx2);
+        init_ipred(32, avx2, v, VERT);
     }
 
 #if ARCH_X86_64
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
index 31f7d449fd..b67addd7e3 100644
--- a/libavcodec/x86/vp9intrapred.asm
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -2,6 +2,7 @@
 ;* VP9 Intra prediction SIMD optimizations
 ;*
 ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;* Copyright (c) 2025 Two Orioles, LLC
 ;*
 ;* Parts based on:
 ;* H.264 intra prediction asm optimizations
@@ -230,40 +231,6 @@ DC_16to32_FUNCS
 INIT_XMM ssse3
 DC_16to32_FUNCS
 
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
-    mova                    m0, [lq]
-    mova                    m1, [aq]
-    DEFINE_ARGS dst, stride, stride3, cnt
-    lea               stride3q, [strideq*3]
-    pxor                    m2, m2
-    psadbw                  m0, m2
-    psadbw                  m1, m2
-    paddw                   m0, m1
-    vextracti128           xm1, m0, 1
-    paddw                  xm0, xm1
-    movhlps                xm1, xm0
-    paddw                  xm0, xm1
-    pmulhrsw               xm0, [pw_512]
-    vpbroadcastb            m0, xm0
-    mov                   cntd, 4
-.loop:
-    mova      [dstq+strideq*0], m0
-    mova      [dstq+strideq*1], m0
-    mova      [dstq+strideq*2], m0
-    mova      [dstq+stride3q ], m0
-    lea                   dstq, [dstq+strideq*4]
-    mova      [dstq+strideq*0], m0
-    mova      [dstq+strideq*1], m0
-    mova      [dstq+strideq*2], m0
-    mova      [dstq+stride3q ], m0
-    lea                   dstq, [dstq+strideq*4]
-    dec                   cntd
-    jg .loop
-    RET
-%endif
-
 ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
 
 %macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
@@ -395,44 +362,6 @@ INIT_XMM ssse3
 DC_1D_16to32_FUNCS top,  a
 DC_1D_16to32_FUNCS left, l
 
-%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
-%if HAVE_AVX2_EXTERNAL
-cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
-    mova                    m0, [%2q]
-    DEFINE_ARGS dst, stride, stride3, cnt
-    lea               stride3q, [strideq*3]
-    pxor                    m2, m2
-    psadbw                  m0, m2
-    vextracti128           xm1, m0, 1
-    paddw                  xm0, xm1
-    movhlps                xm1, xm0
-    paddw                  xm0, xm1
-    pmulhrsw               xm0, [pw_1024]
-    vpbroadcastb            m0, xm0
-    mov                   cntd, 4
-.loop:
-    mova      [dstq+strideq*0], m0
-    mova      [dstq+strideq*1], m0
-    mova      [dstq+strideq*2], m0
-    mova      [dstq+stride3q ], m0
-    lea                   dstq, [dstq+strideq*4]
-    mova      [dstq+strideq*0], m0
-    mova      [dstq+strideq*1], m0
-    mova      [dstq+strideq*2], m0
-    mova      [dstq+stride3q ], m0
-    lea                   dstq, [dstq+strideq*4]
-    dec                   cntd
-    jg .loop
-    RET
-%endif
-%endmacro
-
-INIT_YMM avx2
-DC_1D_AVX2_FUNCS top,  a
-DC_1D_AVX2_FUNCS left, l
-
-; v
-
 INIT_MMX mmx
 cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
     movq                    m0, [aq]
@@ -486,29 +415,6 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
     jg .loop
     RET
 
-INIT_YMM avx
-cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
-    mova                    m0, [aq]
-    DEFINE_ARGS dst, stride, stride3, cnt
-    lea               stride3q, [strideq*3]
-    mov                   cntd, 4
-.loop:
-    mova      [dstq+strideq*0], m0
-    mova      [dstq+strideq*1], m0
-    mova      [dstq+strideq*2], m0
-    mova      [dstq+stride3q ], m0
-    lea                   dstq, [dstq+strideq*4]
-    mova      [dstq+strideq*0], m0
-    mova      [dstq+strideq*1], m0
-    mova      [dstq+strideq*2], m0
-    mova      [dstq+stride3q ], m0
-    lea                   dstq, [dstq+strideq*4]
-    dec                   cntd
-    jg .loop
-    RET
-
-; h
-
 %macro H_XMM_FUNCS 2
 %if notcpuflag(avx)
 cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
@@ -642,34 +548,6 @@ H_XMM_FUNCS 4, 8
 INIT_XMM avx
 H_XMM_FUNCS 4, 8
 
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
-    mova                    m5, [pb_1]
-    mova                    m6, [pb_2]
-    mova                    m7, [pb_3]
-    pxor                    m4, m4
-    lea               stride3q, [strideq*3]
-    mov                   cntq, 7
-.loop:
-    movd                   xm3, [lq+cntq*4]
-    vinserti128             m3, m3, xm3, 1
-    pshufb                  m0, m3, m7
-    pshufb                  m1, m3, m6
-    mova      [dstq+strideq*0], m0
-    mova      [dstq+strideq*1], m1
-    pshufb                  m2, m3, m5
-    pshufb                  m3, m4
-    mova      [dstq+strideq*2], m2
-    mova      [dstq+stride3q ], m3
-    lea                   dstq, [dstq+strideq*4]
-    dec                   cntq
-    jge .loop
-    RET
-%endif
-
-; tm
-
 %macro TM_MMX_FUNCS 0
 cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
     pxor                    m1, m1
@@ -898,46 +776,9 @@ TM_XMM_FUNCS
 INIT_XMM avx
 TM_XMM_FUNCS
 
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
-    pxor                    m3, m3
-    pinsrw                 xm2, [aq-1], 0
-    vinserti128             m2, m2, xm2, 1
-    mova                    m0, [aq]
-    DEFINE_ARGS dst, stride, l, cnt
-    mova                    m4, [pw_m256]
-    mova                    m5, [pw_m255]
-    pshufb                  m2, m4
-    punpckhbw               m1, m0, m3
-    punpcklbw               m0, m3
-    psubw                   m1, m2
-    psubw                   m0, m2
-    mov                   cntq, 15
-.loop:
-    pinsrw                 xm7, [lq+cntq*2], 0
-    vinserti128             m7, m7, xm7, 1
-    pshufb                  m3, m7, m5
-    pshufb                  m7, m4
-    paddw                   m2, m3, m0
-    paddw                   m3, m1
-    paddw                   m6, m7, m0
-    paddw                   m7, m1
-    packuswb                m2, m3
-    packuswb                m6, m7
-    mova      [dstq+strideq*0], m2
-    mova      [dstq+strideq*1], m6
-    lea                   dstq, [dstq+strideq*2]
-    dec                   cntq
-    jge .loop
-    RET
-%endif
-
-; dl
-
-%macro LOWPASS 4 ; left [dst], center, right, tmp
+%macro LOWPASS 4-5 [pb_1] ; left [dst], center, right, tmp, pb_1
     pxor                   m%4, m%1, m%3
-    pand                   m%4, [pb_1]
+    pand                   m%4, %5
     pavgb                  m%1, m%3
     psubusb                m%1, m%4
     pavgb                  m%1, m%2
@@ -2041,4 +1882,306 @@ HU_XMM_FUNCS 7
 INIT_XMM avx
 HU_XMM_FUNCS 7
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+    pxor                    m1, m1
+    psadbw                  m0, m1, [lq]
+    psadbw                  m1, [aq]
+    movd                   xm2, [pw_512]
+    paddw                   m0, m1
+    vextracti128           xm1, m0, 1
+.main:
+    paddw                  xm0, xm1
+    punpckhqdq             xm1, xm0, xm0
+    paddw                  xm0, xm1
+    pmulhrsw               xm0, xm2
+    vpbroadcastb            m0, xm0
+.main2:
+    lea                     r2, [strideq*3]
+    mov                    r3d, 8
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+r2       ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                    r3d
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dc_top_32x32, 0, 4, 3, dst, stride, l, a
+    mov                     lq, amp
+%if ARCH_X86_32
+    jmp mangle(private_prefix %+ _vp9_ipred_dc_left_32x32 %+ SUFFIX).main
+%endif
+
+%assign function_align 1
+cglobal vp9_ipred_dc_left_32x32, 0, 4, 3, dst, stride, l, a
+    movifnidn               lq, lmp
+.main:
+    movifnidn             dstq, dstmp
+    movifnidn          strideq, stridemp
+    pxor                   xm1, xm1
+    psadbw                 xm0, xm1, [lq]
+    psadbw                 xm1, [lq+16]
+    movd                   xm2, [pw_1024]
+    jmp mangle(private_prefix %+ _vp9_ipred_dc_32x32 %+ SUFFIX).main
+
+cglobal vp9_ipred_v_32x32, 2, 4, 3, dst, stride, l, a
+    movifnidn               aq, amp
+    mova                    m0, [aq]
+    jmp mangle(private_prefix %+ _vp9_ipred_dc_32x32 %+ SUFFIX).main2
+
+%assign function_align 16
+cglobal vp9_ipred_h_32x32, 3, 5, 6, dst, stride, l
+    vpbroadcastd            m2, [pb_3]
+    mov                    r3d, 7
+    vpbroadcastd            m3, [pb_2]
+    pxor                    m5, m5
+    vpbroadcastd            m4, [pb_1]
+    lea                     r4, [strideq*3]
+.loop:
+    vpbroadcastd            m1, [lq+r3*4]
+    pshufb                  m0, m1, m2
+    mova      [dstq+strideq*0], m0
+    pshufb                  m0, m1, m3
+    mova      [dstq+strideq*1], m0
+    pshufb                  m0, m1, m4
+    mova      [dstq+strideq*2], m0
+    pshufb                  m1, m5
+    mova      [dstq+r4       ], m1
+    lea                   dstq, [dstq+strideq*4]
+    dec                    r3d
+    jge .loop
+    RET
+
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+    vpbroadcastd            m0, [aq-1]
+    mova                    m7, [aq]
+    pxor                    m1, m1
+    vpbroadcastd            m4, [pw_m255]
+    mov                    r3d, 15
+    vpbroadcastd            m5, [pw_m256]
+    pshufb                  m0, m5
+    punpcklbw               m6, m7, m1
+    punpckhbw               m7, m1
+    psubw                   m6, m0
+    psubw                   m7, m0
+.loop:
+    vpbroadcastd            m3, [lq+r3*2]
+    pshufb                  m2, m3, m4
+    pshufb                  m3, m5
+    paddw                   m0, m2, m6
+    paddw                   m2, m7
+    paddw                   m1, m3, m6
+    paddw                   m3, m7
+    packuswb                m0, m2
+    packuswb                m1, m3
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    lea                   dstq, [dstq+strideq*2]
+    dec                    r3d
+    jge .loop
+    RET
+
+cglobal vp9_ipred_dl_32x32, 2, 5, 6, dst, stride, l, a
+    movifnidn               aq, amp
+    vpbroadcastb            m2, [aq+31]
+    vinserti128             m3, m2, [aq+16], 0
+    mova                    m0, [aq+ 0]
+    vpbroadcastd            m5, [pb_1]
+    palignr                 m4, m3, m0, 2
+    lea                     r3, [strideq*2]
+    palignr                 m3, m0, 1
+    LOWPASS                  0, 3, 4, 1, m5
+    lea                     r4, [strideq*3]
+    vperm2i128              m1, m0, m2, 0x31
+    mov                    r2d, 8
+.loop:
+    shufpd                  m3, m0, m1, 0x05
+    mova           [dstq+r3*0], m0
+    punpckhqdq              m4, m1, m2
+    mova           [dstq+r3*4], m3
+    palignr                 m0, m1, m0, 1
+    mova           [dstq+r3*8], m1
+    palignr                 m1, m2, m1, 1
+    mova           [dstq+r4*8], m4
+    add                   dstq, strideq
+    dec                    r2d
+    jg .loop
+    RET
+
+cglobal vp9_ipred_dr_32x32, 4, 5, 7, dst, stride, l, a
+    mova                    m3, [lq+ 0]
+    movu                    m1, [aq- 1]
+    mova                    m0, [aq+ 0]
+    vpbroadcastd            m6, [pb_1]
+    vperm2i128              m2, m3, m1, 0x21
+    lea                     r3, [strideq*2]
+    palignr                 m4, m1, m2, 15
+    LOWPASS                  0, 1, 4, 5, m6
+    pslldq                 xm4, xm3, 1
+    palignr                 m2, m3, 1
+    vinserti128             m4, [lq+15], 1
+    LOWPASS                  2, 3, 4, 5, m6
+    lea                     r4, [strideq*3]
+    vperm2i128              m1, m2, m0, 0x21
+    mov                    r2d, 8
+.loop:
+    shufpd                  m3, m1, m0, 0x05
+    mova           [dstq+r3*0], m0
+    shufpd                  m4, m2, m1, 0x05
+    mova           [dstq+r3*4], m3
+    palignr                 m0, m1, 15
+    mova           [dstq+r3*8], m1
+    palignr                 m1, m2, 15
+    mova           [dstq+r4*8], m4
+    add                   dstq, strideq
+    pslldq                  m2, 1
+    dec                    r2d
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hd_32x32, 4, 6, 7, dst, stride, l, a
+    movu                    m1, [aq-1]
+    mova                    m0, [lq]
+    vpbroadcastd            m6, [pb_1]
+    vperm2i128              m4, m0, m1, 0x21
+    palignr                 m3, m4, m0, 1
+    palignr                 m4, m0, 2
+    LOWPASS                  4, 3, 0, 2, m6
+    pavgb                   m3, m0
+    movu                   xm0, [aq+15]
+    punpcklbw               m2, m3, m4
+    punpckhbw               m3, m4
+    palignr                 m4, m0, m1, 2
+    palignr                 m0, m1, 1
+    LOWPASS                  4, 0, 1, 5, m6
+    lea                     r2, [strideq*8]
+    vinserti128             m0, m2, xm3, 1
+    lea                     r3, [dstq+r2*1]
+    vpblendd                m1, m2, m3, 0x0f
+    lea                     r4, [dstq+r2*2]
+    vperm2i128              m2, m3, 0x31
+    lea                     r5, [r3  +r2*2]
+    vperm2i128              m3, m4, 0x21
+.loop:
+    sub                     r2, strideq
+    mova             [r5  +r2], m0
+    palignr                 m0, m1, m0, 2
+    mova             [r4  +r2], m1
+    palignr                 m1, m2, m1, 2
+    mova             [r3  +r2], m2
+    palignr                 m2, m3, m2, 2
+    mova             [dstq+r2], m3
+    palignr                 m3, m4, m3, 2
+    psrldq                  m4, 2
+    jg .loop
+    RET
+
+cglobal vp9_ipred_hu_32x32, 3, 5, 6, dst, stride, l, a
+    mova                    m0, [lq]
+    vpbroadcastb           xm3, [lq+31]
+    vpbroadcastd            m1, [pb_1]
+    vbroadcasti128          m4, [pb_2toE_3xF]
+    vperm2i128              m3, m0, 0x03
+    palignr                 m5, m3, m0, 2
+    palignr                 m3, m0, 1
+    LOWPASS                  5, 3, 0, 2, m1
+    vpbroadcastd            m1, [pb_15]
+    pavgb                   m3, m0
+    punpcklbw               m2, m3, m5
+    punpckhbw               m3, m5
+    vinserti128             m0, m2, xm3, 1
+    pshufb                  m5, m1
+    vperm2i128              m1, m2, m3, 0x12
+    lea                     r3, [strideq*2]
+    vperm2i128              m2, m3, 0x31
+    lea                     r4, [strideq*3]
+    vperm2i128              m3, m5, 0x31
+    mov                    r2d, 8
+.loop:
+    mova           [dstq+r3*0], m0
+    palignr                 m0, m1, m0, 2
+    mova           [dstq+r3*4], m1
+    palignr                 m1, m2, m1, 2
+    mova           [dstq+r3*8], m2
+    palignr                 m2, m3, m2, 2
+    mova           [dstq+r4*8], m3
+    pshufb                  m3, m4
+    add                   dstq, strideq
+    dec                    r2d
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vl_32x32, 2, 5, 6, dst, stride, l, a
+    movifnidn               aq, amp
+    vpbroadcastb            m4, [aq+31]
+    vinserti128             m0, m4, [aq+16], 0
+    mova                    m1, [aq+ 0]
+    vpbroadcastd            m5, [pb_1]
+    palignr                 m2, m0, m1, 2
+    palignr                 m0, m1, 1
+    LOWPASS                  2, 0, 1, 3, m5
+    pavgb                   m0, m1
+    lea                     r3, [strideq*2]
+    vperm2i128              m1, m0, m4, 0x31
+    lea                     r4, [strideq+r3*8]
+    vperm2i128              m3, m2, m4, 0x31
+    mov                    r2d, 8
+.loop:
+    shufpd                  m4, m0, m1, 0x05
+    mova      [dstq+strideq*0], m0
+    shufpd                  m5, m2, m3, 0x05
+    mova      [dstq+strideq*1], m2
+    palignr                 m0, m1, m0, 1
+    mova      [dstq+r3*8     ], m4
+    psrldq                  m1, 1
+    mova      [dstq+r4       ], m5
+    palignr                 m2, m3, m2, 1
+    add                   dstq, r3
+    psrldq                  m3, 1
+    dec                    r2d
+    jg .loop
+    RET
+
+cglobal vp9_ipred_vr_32x32, 4, 5, 7, dst, stride, l, a
+    mova                    m4, [lq+ 0]
+    movu                    m0, [aq- 1]
+    vpbroadcastd            m6, [pb_1]
+    vperm2i128              m2, m4, m0, 0x21
+    pslldq                 xm5, xm4, 1
+    palignr                 m3, m2, m4, 1
+    vinserti128             m5, [lq+15], 1
+    LOWPASS                  3, 4, 5, 1, m6
+    mova                    m1, [aq+ 0]
+    vbroadcasti128          m4, [pb_02468ACE_13579BDF]
+    palignr                 m2, m0, m2, 15
+    LOWPASS                  2, 0, 1, 5, m6
+    pshufb                  m3, m4
+    lea                     r3, [strideq*2]
+    vpermq                  m3, m3, q2031
+    pavgb                   m0, m1
+    vinserti128             m1, m3, xm0, 1
+    lea                     r4, [strideq+r3*8]
+    vperm2i128              m3, m2, 0x21
+    mov                    r2d, 8
+.loop:
+    shufpd                  m4, m1, m0, 0x05
+    mova      [dstq+strideq*0], m0
+    shufpd                  m5, m3, m2, 0x05
+    mova      [dstq+strideq*1], m2
+    palignr                 m0, m1, 15
+    mova      [dstq+r3*8     ], m4
+    pslldq                  m1, 1
+    mova      [dstq+r4       ], m5
+    palignr                 m2, m3, 15
+    add                   dstq, r3
+    pslldq                  m3, 1
+    dec                    r2d
+    jg .loop
+    RET
+%endif
+
 ; FIXME 127, 128, 129 ?
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2025-09-01 11:10 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-01 11:10 [FFmpeg-devel] [PATCH] vp9: Add 8bpc intra prediction AVX2 asm (PR #20386) Henrik Gramner via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git