[FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC

* [FFmpeg-devel] [PATCH v4 1/2][GSoC 2024] libavcodec/x86/vvc: Add AVX2 DMVR SAD functions for VVC
@ 2024-05-20  0:37 Stone Chen
  2024-05-20  0:37 ` [FFmpeg-devel] [PATCH v4 2/2][GSoC 2024] tests/checkasm: Add check_vvc_sad to vvc_mc.c Stone Chen
  0 siblings, 1 reply; 6+ messages in thread
From: Stone Chen @ 2024-05-20  0:37 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Stone Chen

Implements AVX2 DMVR (decoder-side motion vector refinement) SAD functions. DMVR SAD is only calculated if w >= 8, h >= 8, and w * h > 128. To reduce complexity, SAD is only calculated on even rows. This is calculated for all video bit depths, but the values passed to the function are always 16-bit (even if the original video bit depth is 8). The AVX2 implementation computes each absolute difference as max(a, b) - min(a, b), i.e. with min/max/sub.

Benchmarks ( AMD 7940HS )
Before:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 106.0 |
Chimera_8bit_1080P_1000_frames.vvc | 204.3 |
NovosobornayaSquare_1920x1080.bin | 197.3 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 174.0 |

After:
BQTerrace_1920x1080_60_10_420_22_RA.vvc | 109.3 |
Chimera_8bit_1080P_1000_frames.vvc | 216.0 |
NovosobornayaSquare_1920x1080.bin | 204.0 |
RitualDance_1920x1080_60_10_420_37_RA.266 | 181.7 |
---
 libavcodec/x86/vvc/Makefile      |   3 +-
 libavcodec/x86/vvc/vvc_sad.asm   | 138 +++++++++++++++++++++++++++++++
 libavcodec/x86/vvc/vvcdsp_init.c |   6 ++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/x86/vvc/vvc_sad.asm

diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
index d6a66f860a..7b2438ce17 100644
--- a/libavcodec/x86/vvc/Makefile
+++ b/libavcodec/x86/vvc/Makefile
@@ -5,4 +5,5 @@ OBJS-$(CONFIG_VVC_DECODER)             += x86/vvc/vvcdsp_init.o \
                                           x86/h26x/h2656dsp.o
 X86ASM-OBJS-$(CONFIG_VVC_DECODER)      += x86/vvc/vvc_alf.o      \
                                           x86/vvc/vvc_mc.o       \
-                                          x86/h26x/h2656_inter.o
+                                          x86/vvc/vvc_sad.o      \
+                                          x86/h26x/h2656_inter.o 
diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
new file mode 100644
index 0000000000..58a24635d2
--- /dev/null
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -0,0 +1,138 @@
+; /*
+; * Provide SIMD DMVR SAD functions for VVC decoding
+; *
+; * Copyright (c) 2024 Stone Chen
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+
+%include "libavutil/x86/x86util.asm"
+%define MAX_PB_SIZE 128 ; row stride of the sample buffers, in 16-bit samples
+%define ROWS 2    ; row step: DMVR SAD is only calculated on even rows to reduce complexity
+
+SECTION_RODATA
+
+pw_1: dw 1 ; word constant 1: broadcast to every lane and used as the pmaddwd multiplier
+
+; SAD kernel and the helper macros it expands
+SECTION .text
+
+%macro MIN_MAX_SAD 3 ; %1 = a (in) -> |a - b| per word (out), %2 = b (preserved), %3 = scratch
+    pminsw           %3, %2, %1     ; %3 = min(a, b); signed, the samples are int16_t
+    pmaxsw           %1, %2, %1     ; %1 = max(a, b)
+    psubw            %1, %1, %3     ; %1 = max - min; callers pmaddwd it, so it must stay < 0x8000
+%endmacro
+
+%macro HORIZ_ADD 3  ; %1 = xmm result (total in dword 0), %2 = xmm holding dwords 3..0 of %3 (clobbered), %3 = ymm with 8 dword sums
+    vextracti128      %1, %3, q0001  ; %1 = upper 128 bits of %3: dwords 7 6 5 4
+    paddd            %1, %2         ; %1 = (7 + 3) (6 + 2) (5 + 1) (4 + 0)
+    pshufd           %2, %1, q0032  ; %2 dword 1 = (7 + 3), dword 0 = (6 + 2); upper dwords unused
+    paddd            %1, %1, %2     ; %1 dword 1 = (5 + 1 + 7 + 3), dword 0 = (4 + 0 + 6 + 2)
+    pshufd           %2, %1, q0001  ; %2 dword 0 = (5 + 1 + 7 + 3)
+    paddd            %1, %1, %2     ; %1 dword 0 = sum of all eight dwords; upper dwords are garbage
+%endmacro
+
+; INIT_OFFSET src1, src2, dx, dy, off1, off2
+;
+; Moves both sample pointers to the positions selected by the search offset
+; (dx, dy). Offsets are in 16-bit samples with a row stride of MAX_PB_SIZE:
+;   off1 = (2 + (dy - 2)) * MAX_PB_SIZE + 2 + (dx - 2) = dy * MAX_PB_SIZE + dx
+;   off2 = (2 - (dy - 2)) * MAX_PB_SIZE + 2 - (dx - 2) = 4 * MAX_PB_SIZE + 4 - off1
+; so src2 is displaced in the opposite direction to src1.
+;
+; In:   %1, %2 = src1, src2 pointers; %3, %4 = dx, dy as 64-bit signed values
+; Out:  %1, %2 advanced; %5, %6 = off1, off2 (in samples)
+; Clobbers %5, %6 and flags; %3 and %4 are left untouched.
+; All six operands must be distinct registers.
+%macro INIT_OFFSET 6 ; src1, src2, dxq, dyq, off1, off2
+    imul            %5, %4, MAX_PB_SIZE     ; off1 = dy * MAX_PB_SIZE
+    add             %5, %3                  ; off1 += dx
+
+    mov             %6, 4 * MAX_PB_SIZE + 4 ; off2 = off1 mirrored about (2, 2)
+    sub             %6, %5
+
+    lea             %1, [%1 + %5 * 2]       ; * 2: each sample is 2 bytes
+    lea             %2, [%2 + %6 * 2]
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+
+INIT_YMM avx2
+
+; int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy,
+;                     int block_w, int block_h)
+; Returns (in eax) the sum of absolute differences of two blocks of 16-bit
+; samples, visiting every ROWS-th row; row stride is MAX_PB_SIZE samples.
+; block_w < 16 takes the 8-sample path, which consumes 2 * ROWS rows per pass;
+; otherwise rows are walked in block_w / 16 vectors of 16 samples, ROWS rows
+; per pass. off1/off2 hold the INIT_OFFSET results, then the row start pointers.
+cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+    movsxd            dxq, dxd          ; dx/dy are C ints: sign-extend in place
+    movsxd            dyq, dyd
+    INIT_OFFSET     src1q, src2q, dxq, dyq, off1q, off2q
+    pxor               m3, m3           ; m3 = eight 32-bit partial SADs
+    vpbroadcastw       m4, [pw_1]       ; m4 = word 1 in every lane, pmaddwd multiplier
+
+    cmp          block_wd, 16
+    jge              .w16
+
+    ; 8-sample path: low lane = row y, high lane = row y + ROWS
+.w8_loop:
+    movu              xm0, [src1q]
+    vinserti128        m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
+    movu              xm1, [src2q]
+    vinserti128        m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
+    MIN_MAX_SAD        m1, m0, m2       ; m1 = |src1 - src2| per word
+    pmaddwd            m1, m4           ; add adjacent word pairs into dwords
+    paddd              m3, m1
+    add             src1q, 2 * MAX_PB_SIZE * ROWS * 2   ; two sampled rows, in bytes
+    add             src2q, 2 * MAX_PB_SIZE * ROWS * 2
+    sub          block_hd, 2 * ROWS
+    jg           .w8_loop
+
+    HORIZ_ADD         xm0, xm3, m3
+    movd              eax, xm0
+    RET
+
+    ; 16-sample path: one sampled row per outer iteration
+.w16:
+    mov             off1q, src1q        ; remember where this row starts
+    mov             off2q, src2q
+    mov          row_idxd, block_wd
+    sar          row_idxd, 4            ; number of 16-sample vectors per row
+.w16_col:
+    movu               m0, [src1q]
+    movu               m1, [src2q]
+    MIN_MAX_SAD        m1, m0, m2
+    pmaddwd            m1, m4
+    paddd              m3, m1
+    add             src1q, mmsize
+    add             src2q, mmsize
+    dec          row_idxd
+    jg           .w16_col
+
+    lea             src1q, [off1q + ROWS * MAX_PB_SIZE * 2] ; next sampled row
+    lea             src2q, [off2q + ROWS * MAX_PB_SIZE * 2]
+    sub          block_hd, ROWS
+    jg               .w16
+    HORIZ_ADD         xm0, xm3, m3
+    movd              eax, xm0
+    RET
+
+%endif
+%endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 0e68971b2c..4b4a2aa937 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -311,6 +311,9 @@ ALF_FUNCS(16, 12, avx2)
     c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;    \
     c->alf.classify       = ff_vvc_alf_classify_##bd##_avx2;         \
 } while (0)
+
+int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
+#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 #endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -327,6 +330,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
+            SAD_INIT();
         }
         break;
     case 10:
@@ -338,6 +342,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
+            SAD_INIT();
         }
         break;
     case 12:
@@ -349,6 +354,7 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
+            SAD_INIT();
         }
         break;
     default:
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread