* [FFmpeg-devel] [PATCH 1/2] vvcdec: alf, add avx2 for luma and chroma filter
@ 2023-02-26 5:48 Nuo Mi
2023-02-26 6:05 ` Nuo Mi
0 siblings, 1 reply; 2+ messages in thread
From: Nuo Mi @ 2023-02-26 5:48 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Nuo Mi
Got an 11%~26% performance gain for 1080p and 4K video
clip before after delta
RitualDance_1920x1080_60_10_420_32_LD.266 35 43 22.8%
RitualDance_1920x1080_60_10_420_37_RA.266 43 48 11.6%
Tango2_3840x2160_60_10_420_27_LD.266 7.9 10 26.5%
---
libavcodec/vvcdsp.c | 3 +
libavcodec/x86/Makefile | 2 +
libavcodec/x86/vvc_alf.asm | 301 +++++++++++++++++++++++++++++++++++
libavcodec/x86/vvcdsp.h | 44 +++++
libavcodec/x86/vvcdsp_init.c | 81 ++++++++++
5 files changed, 431 insertions(+)
create mode 100644 libavcodec/x86/vvc_alf.asm
create mode 100644 libavcodec/x86/vvcdsp.h
create mode 100644 libavcodec/x86/vvcdsp_init.c
diff --git a/libavcodec/vvcdsp.c b/libavcodec/vvcdsp.c
index 801bd0189d..399631503f 100644
--- a/libavcodec/vvcdsp.c
+++ b/libavcodec/vvcdsp.c
@@ -313,4 +313,7 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
VVC_DSP(8);
break;
}
+#if ARCH_X86
+ ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
+#endif
}
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 118daca333..23b2fb42bb 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -82,6 +82,7 @@ OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o \
x86/vp9dsp_init_12bpp.o \
x86/vp9dsp_init_16bpp.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
+OBJS-$(CONFIG_VVC_DECODER) += x86/vvcdsp_init.o
# GCC inline assembly optimizations
@@ -202,4 +203,5 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9lpf_16bpp.o \
x86/vp9mc.o \
x86/vp9mc_16bpp.o
+X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc_alf.o
X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
diff --git a/libavcodec/x86/vvc_alf.asm b/libavcodec/x86/vvc_alf.asm
new file mode 100644
index 0000000000..c3e4074be7
--- /dev/null
+++ b/libavcodec/x86/vvc_alf.asm
@@ -0,0 +1,301 @@
+;******************************************************************************
+;* VVC Adaptive Loop Filter SIMD optimizations
+;*
+;* Copyright (c) 2023 Nuo Mi <nuomi2021@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+;PARAM_SHUFFE(n)
+;Emits the 32-byte table param_shuffe_<n>, used by FILTER as a pshufb mask.
+;Every word of the table is a pair of byte indices: j selects bytes
+;(2n, 2n + 1), i.e. 16-bit word n, and j + 0x0808 selects word n + 4.
+;Four copies of each are emitted, and the 8-word pattern is repeated twice.
+;Note: i and j are assigned with %assign and are therefore not macro-local.
+%macro PARAM_SHUFFE 1
+%assign i (%1 * 2)
+%assign j ((i + 1) << 8) + (i)
+param_shuffe_%+%1:
+%rep 2
+ times 4 dw j
+ times 4 dw (j + 0x0808)
+%endrep
+%endmacro
+
+;one table per parameter position inside a group of four parameters
+PARAM_SHUFFE 0
+PARAM_SHUFFE 1
+PARAM_SHUFFE 2
+PARAM_SHUFFE 3
+
+;32-bit constant 64 (despite the dw_ prefix it is stored with dd); it is
+;broadcast into the accumulators m0/m1 before filtering. 64 == 1 << (SHIFT - 1),
+;so it presumably acts as the rounding term for the final >> SHIFT.
+dw_64: dd 64
+
+SECTION .text
+
+;Everything below requests 15 general-purpose registers from cglobal and uses
+;vector registers up to m15. Those only exist in 64-bit mode, so the code is
+;assembled only for x86-64 builds whose assembler supports AVX2.
+%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+
+;LOAD_LUMA_PARAMS_W16(out0, out1, out2, clip|filter) - 4-argument form
+;Loads the luma parameters for one 16-pixel-wide step into three ymm
+;registers without reordering them. The fourth argument must be the literal
+;token "clip" or "filter"; it selects both the pointer register (clipq or
+;filterq) and the element size.
+;%1-%3 out
+;%4 clip or filter
+%macro LOAD_LUMA_PARAMS_W16 4
+ %ifidn clip, %4
+ ;clip: three unaligned 32-byte loads, used as they are
+ movu m%1, [%4q + 0 * 32]
+ movu m%2, [%4q + 1 * 32]
+ movu m%3, [%4q + 2 * 32]
+ %elifidn filter, %4
+ ;filter: three unaligned 16-byte loads ...
+ movu xm%1, [%4q + 0 * 16]
+ movu xm%2, [%4q + 1 * 16]
+ movu xm%3, [%4q + 2 * 16]
+ ;... each byte then sign-extended to a 16-bit word
+ pmovsxbw m%1, xm%1
+ pmovsxbw m%2, xm%2
+ pmovsxbw m%3, xm%3
+ %else
+ %error "need filter or clip for the fourth param"
+ %endif
+%endmacro
+
+;LOAD_LUMA_PARAMS_W16(out0, out1, out2, clip|filter, tmp0, tmp1) - 6-argument form
+;Loads the parameters with the 4-argument form, then rearranges them in units
+;of 64 bits (four 16-bit parameters). The annotations list the index of each
+;64-bit group from the highest to the lowest position of the register.
+;Loaded order is 00..11; the result is
+; m%1 = groups 09 06 03 00, m%2 = groups 10 07 04 01, m%3 = groups 11 08 05 02
+;i.e. every output register holds one group out of each run of three.
+;%5 and %6 are clobbered as temporaries.
+%macro LOAD_LUMA_PARAMS_W16 6
+ LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4
+ ;m%1 = 03 02 01 00
+ ;m%2 = 07 06 05 04
+ ;m%3 = 11 10 09 08
+
+ vshufpd m%5, m%1, m%2, 0b0011 ;06 02 05 01
+ vshufpd m%6, m%3, m%5, 0b1001 ;06 10 01 09
+
+ vshufpd m%1, m%1, m%6, 0b1100 ;06 03 09 00
+ vshufpd m%2, m%2, m%6, 0b0110 ;10 07 01 04
+ vshufpd m%3, m%3, m%5, 0b0110 ;02 11 05 08
+
+ ;final per-register reordering of the 64-bit groups
+ vpermpd m%1, m%1, 0b01_11_10_00 ;09 06 03 00
+ vshufpd m%2, m%2, m%2, 0b1001 ;10 07 04 01
+ vpermpd m%3, m%3, 0b10_00_01_11 ;11 08 05 02
+%endmacro
+
+;LOAD_LUMA_PARAMS_W4(out0, out1, out2, clip|filter, tmp0, tmp1)
+;Loads three groups of four parameters and replicates each group into all
+;four 64-bit positions of its ymm register.
+;%5 and %6 are accepted so the signature matches the W16 variant, but they
+;are not referenced here.
+%macro LOAD_LUMA_PARAMS_W4 6
+ %ifidn clip, %4
+ ;clip: three 8-byte loads (four 16-bit values each)
+ movq xm%1, [%4q + 0 * 8]
+ movq xm%2, [%4q + 1 * 8]
+ movq xm%3, [%4q + 2 * 8]
+ %elifidn filter, %4
+ ;filter: three 4-byte loads, bytes sign-extended to words
+ movd xm%1, [%4q + 0 * 4]
+ movd xm%2, [%4q + 1 * 4]
+ movd xm%3, [%4q + 2 * 4]
+ pmovsxbw xm%1, xm%1
+ pmovsxbw xm%2, xm%2
+ pmovsxbw xm%3, xm%3
+ %else
+ %error "need filter or clip for the fourth param"
+ %endif
+ ;copy the low 64 bits to every 64-bit position
+ vpbroadcastq m%1, xm%1
+ vpbroadcastq m%2, xm%2
+ vpbroadcastq m%3, xm%3
+%endmacro
+
+;LOAD_LUMA_PARAMS(out0, out1, out2, clip|filter, tmp0, tmp1)
+;Dispatches to LOAD_LUMA_PARAMS_W16 or LOAD_LUMA_PARAMS_W4 by pasting the
+;current value of WIDTH onto the macro name.
+;%1-%3 out
+;%4 clip or filter
+;%5, %6 tmp
+%macro LOAD_LUMA_PARAMS 6
+ LOAD_LUMA_PARAMS_W %+ WIDTH %1, %2, %3, %4, %5, %6
+%endmacro
+
+;LOAD_CHROMA_PARAMS(out0, out1, clip|filter, unused)
+;Loads the chroma parameters into two ymm registers and replicates the low
+;64 bits of each into all four 64-bit positions. The same registers are
+;produced regardless of WIDTH. %4 is accepted but never referenced.
+%macro LOAD_CHROMA_PARAMS 4
+ ;LOAD_CHROMA_PARAMS_W %+ WIDTH %1, %2, %3, %4
+ %ifidn clip, %3
+ ;clip: 8 bytes into out0, then 4 bytes from offset 8 into out1
+ movq xm%1, [%3q]
+ movd xm%2, [%3q + 8]
+ %elifidn filter, %3
+ ;filter: 4 bytes into out0, then a single 2-byte load from offset 4 into
+ ;word 0 of out1; the other words of out1 keep their previous contents
+ movd xm%1, [%3q + 0]
+ pinsrw xm%2, [%3q + 4], 0
+ ;sign-extend bytes to words
+ vpmovsxbw m%1, xm%1
+ vpmovsxbw m%2, xm%2
+ ;NOTE(review): after this only words 0-1 of out1 come from memory. The
+ ;chroma FILTER invocations use parameter indices 0-5, which appear to
+ ;read only those two words of out1 - confirm.
+ %else
+ %error "need filter or clip for the third param"
+ %endif
+ vpbroadcastq m%1, xm%1
+ vpbroadcastq m%2, xm%2
+%endmacro
+
+;LOAD_PARAMS()
+;Loads the filter coefficients and the clip values for the current block
+;row, depending on LUMA:
+; luma: filters in m3-m5 (tmp m6, m7), clips in m6-m8 (tmp m9, m10)
+; chroma: filters in m3-m4, clips in m6-m7
+;The filter load runs first, so its temporaries m6/m7 are overwritten by the
+;clip load afterwards.
+%macro LOAD_PARAMS 0
+ %if LUMA
+ LOAD_LUMA_PARAMS 3, 4, 5, filter, 6, 7
+ LOAD_LUMA_PARAMS 6, 7, 8, clip, 9, 10
+ %else
+ LOAD_CHROMA_PARAMS 3, 4, filter, 5
+ LOAD_CHROMA_PARAMS 6, 7, clip, 8
+ %endif
+%endmacro
+
+;FILTER(param_idx)
+;Accumulates one filter tap, applied to a pair of neighbour pixel vectors.
+;input: m2, m9, m10
+; m2 = centre pixels, m9/m10 = the two neighbour vectors for this tap
+; (both are modified)
+;output: m0, m1
+; 32-bit accumulators; m0 receives the products built from the low four
+; words of every 128-bit lane, m1 those from the high four words
+;m12 ~ m15: tmp
+;param_idx selects the parameter registers: filters = m(3 + idx / 4),
+;clips = m(6 + idx / 4), and idx % 4 picks the word inside each group of four
+;via the param_shuffe_<idx % 4> mask.
+;Note: i, j and k are set with %assign and are not macro-local.
+%macro FILTER 1
+ %assign i (%1 % 4)
+ %assign j (%1 / 4 + 3)
+ %assign k (%1 / 4 + 6)
+ %define filters m%+j
+ %define clips m%+k
+
+ ;m14 = clip value for this tap replicated per pixel, m13 = its negation
+ movu m12, [param_shuffe_%+i]
+ pshufb m14, clips, m12 ;clip
+ pxor m13, m13
+ psubw m13, m14 ;-clip
+
+ ;difference to the centre pixel, bounded by -clip and +clip
+ ;(CLIPW comes from the included x86util.asm)
+ vpsubw m9, m2
+ CLIPW m9, m13, m14
+
+ vpsubw m10, m2
+ CLIPW m10, m13, m14
+
+ ;interleave the two clipped differences word by word
+ vpunpckhwd m15, m9, m10
+ vpunpcklwd m9, m9, m10
+
+ ;replicate the coefficient the same way and duplicate each word so that
+ ;both differences of a pair are multiplied by the same coefficient
+ pshufb m14, filters, m12 ;filter
+ vpunpcklwd m10, m14, m14
+ vpunpckhwd m14, m14, m14
+
+ ;multiply and add adjacent pairs: coeff * diff_a + coeff * diff_b
+ vpmaddwd m9, m10
+ vpmaddwd m14, m15
+
+ paddd m0, m9
+ paddd m1, m14
+%endmacro
+
+;FILTER(param_start, off0~off2)
+;Applies three consecutive taps, param_start .. param_start + 2.
+;For each tap the byte offset is evaluated with lea, the neighbour vectors
+;are loaded from srcq - offset (into m9) and srcq + offset (into m10), and
+;the single-argument FILTER accumulates the result.
+;%rotate 1 shifts the argument list so that %2 names the next offset on the
+;following iteration; the tap index is kept in the macro-local %%i.
+;Clobbers offsetq, topq, bottomq, m9-m15.
+%macro FILTER 4
+ %assign %%i (%1)
+ %rep 3
+ lea offsetq, [%2]
+ mov topq, srcq
+ mov bottomq, srcq
+ sub topq, offsetq
+ add bottomq, offsetq
+ LOAD_PIXELS 9, topq, 11
+ LOAD_PIXELS 10, bottomq, 12
+ FILTER %%i
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+;filter pixels for luma and chroma
+;FILTER()
+;Runs all taps of the filter: 12 for luma (parameter indices 0-11) and 6 for
+;chroma (indices 0-5). Each argument after the start index is the byte offset
+;of one tap relative to srcq; ps is the pixel size in bytes.
+;src_stride0q holds zero and only serves as a base register so that the
+;purely horizontal offsets form a valid lea operand.
+%macro FILTER 0
+ %if LUMA
+ FILTER 0, src_stride3q , src_strideq * 2 + ps, src_strideq * 2
+ FILTER 3, src_strideq * 2 - ps, src_strideq + 2 * ps, src_strideq + ps
+ FILTER 6, src_strideq, src_strideq - ps, src_strideq + -2 * ps
+ FILTER 9, src_stride0q + 3 * ps, src_stride0q + 2 * ps, src_stride0q + ps
+ %else
+ FILTER 0, src_strideq * 2, src_strideq + ps, src_strideq
+ FILTER 3, src_strideq - ps, src_stride0q + 2 * ps, src_stride0q + ps
+ %endif
+%endmacro
+
+;right shift applied to the 32-bit accumulators after all taps were summed
+%define SHIFT 7
+
+;LOAD_PIXELS(dest, src, tmp)
+;Loads 32 bytes of pixels into m<dest>.
+; WIDTH == 16: one unaligned 32-byte load from [src]
+; otherwise: four 8-byte loads from [src], [src + stride], [src + 2 * stride]
+; and [src + 3 * stride]; the first two fill the low 128 bits, the
+; last two the high 128 bits
+;dest and tmp are vector register numbers, src is an address expression.
+;The row stride is always src_strideq / src_stride3q. m<tmp> is clobbered in
+;the second case only.
+%macro LOAD_PIXELS 3
+ %if WIDTH == 16
+ movu m%1, [%2]
+ %else
+ pinsrq xm%1, [%2], 0
+ pinsrq xm%1, [%2 + src_strideq], 1
+ pinsrq xm%3, [%2 + src_strideq * 2], 0
+ pinsrq xm%3, [%2 + src_stride3q], 1
+ vinsertf128 m%1, xm%3, 1
+ %endif
+%endmacro
+
+;STORE_PIXELS(dest, src, tmp)
+;Writes the 32 bytes of m<src> to the destination picture.
+;  WIDTH == 16: one unaligned 32-byte store to [dest]
+;  otherwise:   four 8-byte stores, one per row, to [dest + n * dst_stride]
+;               for n = 0..3 (the inverse of the LOAD_PIXELS packing)
+;dest is an address expression, src a vector register number; tmp is accepted
+;for symmetry with LOAD_PIXELS but not used.
+;In the 4-wide case the two 128-bit halves of m<src> are swapped and offsetq
+;is clobbered (it is only a per-tap scratch register inside FILTER).
+%macro STORE_PIXELS 3
+    %if WIDTH == 16
+        movu         [%1], m%2
+    %else
+        ;rows must be addressed with the destination stride: the caller passes
+        ;dst_stride and src_stride separately and they need not be equal
+        pextrq       [%1], xm%2, 0
+        pextrq       [%1 + dst_strideq], xm%2, 1
+        vperm2f128   m%2, m%2, 1                       ;bring rows 2 and 3 into the low half
+        pextrq       [%1 + dst_strideq * 2], xm%2, 0
+        lea          offsetq, [dst_strideq * 2 + dst_strideq]
+        pextrq       [%1 + offsetq], xm%2, 1
+    %endif
+%endmacro
+
+;ALF_FILTER_16BPP(luma|chroma, width)
+;
+;Instantiates one ALF filter function for 16-bit pixel storage; the C
+;prototype (see x86/vvcdsp.h) is
+;
+; void ff_vvc_alf_filter_<%1>_w<%2>_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+;     const uint8_t *src, ptrdiff_t src_stride, int height,
+;     const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max);
+;
+;One call filters a column of <width> pixels, four rows per loop iteration
+;(height >> 2 iterations, at least one). After every iteration filter is
+;advanced by stride bytes and clip by 2 * stride bytes.
+;
+;Register roles inside the loop:
+;  m0, m1   32-bit accumulators, m2 centre pixels (later the pixel_max vector)
+;  m3-m5    filter coefficients, m6-m8 clip values (see LOAD_PARAMS)
+;  m9-m15   temporaries of FILTER / LOAD_PIXELS
+;  top, bottom, offset   per-tap scratch pointers
+;  src_stride3 = 3 * src_stride, src_stride0 = 0
+%macro ALF_FILTER_16BPP 2
+%ifidn %1, luma
+    %xdefine LUMA 1
+%else
+    %xdefine LUMA 0
+%endif
+%xdefine WIDTH %2
+
+; see c code for p0 to p6
+
+INIT_YMM avx2
+;16 vector registers are requested because FILTER writes m15; x86inc uses this
+;count to decide which callee-saved xmm registers have to be preserved.
+cglobal vvc_alf_filter_%1_w%2_16bpp, 9, 15, 16, dst, dst_stride, src, src_stride, height, filter, clip, stride, pixel_max, \
+    top, bottom, offset, src_stride3, src_stride0
+%define ps 2                                    ;pixel size in bytes
+    lea             src_stride3q, [src_strideq * 2 + src_strideq]
+    xor             src_stride0d, src_stride0d  ;zero base register for horizontal offsets
+    ;height is a 32-bit int: only the low half of the register is defined,
+    ;so the loop counter is kept in heightd
+    shr             heightd, 2
+
+.loop:
+    LOAD_PARAMS
+
+;we need loop 4 times for a 16x4 block, 1 time for a 4x4 block
+%define rep_num (WIDTH / 4)
+%define lines (4 / rep_num)
+%rep rep_num
+    ;start both accumulators at 64
+    VPBROADCASTD    m0, [dw_64]
+    VPBROADCASTD    m1, [dw_64]
+
+    LOAD_PIXELS     2, srcq, 9                  ;p0
+
+    FILTER
+
+    vpsrad          m0, SHIFT
+    vpsrad          m1, SHIFT
+
+    ;back to 16 bits per pixel and add the correction to the centre pixel
+    vpackssdw       m0, m0, m1
+    paddw           m0, m2
+
+    ;clip to pixel
+    pinsrw          xm2, pixel_maxw, 0
+    vpbroadcastw    m2, xm2
+    pxor            m1, m1
+    CLIPW           m0, m1, m2
+
+    STORE_PIXELS    dstq, 0, 1
+
+    lea             srcq, [srcq + lines * src_strideq]
+    lea             dstq, [dstq + lines * dst_strideq]
+%endrep
+
+    ;next row of 4x4 blocks: filter entries are 1 byte, clip entries 2 bytes
+    lea             filterq, [filterq + strideq]
+    lea             clipq, [clipq + 2 * strideq]
+
+    dec             heightd
+    jg              .loop
+    RET
+%endmacro
+
+ALF_FILTER_16BPP luma, 16
+ALF_FILTER_16BPP luma, 4
+ALF_FILTER_16BPP chroma, 16
+ALF_FILTER_16BPP chroma, 4
+
+%endif
+
diff --git a/libavcodec/x86/vvcdsp.h b/libavcodec/x86/vvcdsp.h
new file mode 100644
index 0000000000..8589d4ae97
--- /dev/null
+++ b/libavcodec/x86/vvcdsp.h
@@ -0,0 +1,44 @@
+/*
+ * VVC DSP for x86
+ *
+ * Copyright (C) 2022 Nuo Mi
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VVCDSP_H
+#define AVCODEC_X86_VVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * AVX2 ALF filter kernels implemented in x86/vvc_alf.asm.
+ *
+ * All four functions share one signature:
+ *   dst, dst_stride   destination pointer and its row stride in bytes
+ *   src, src_stride   source pointer and its row stride in bytes
+ *   height            number of rows; the kernels work on groups of 4 rows
+ *   filter, clip      filter coefficients and clip values for the first block
+ *   stride            advance applied after every 4 rows: filter moves by
+ *                     stride bytes, clip by stride int16_t elements
+ *   pixel_max         largest pixel value the output is clipped to
+ *
+ * The wN suffix is the number of pixels processed per row by one call;
+ * pixels are stored as 16-bit values.
+ */
+
+void ff_vvc_alf_filter_luma_w16_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+    const uint8_t *src, ptrdiff_t src_stride, int height,
+    const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max);
+
+void ff_vvc_alf_filter_luma_w4_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+    const uint8_t *src, ptrdiff_t src_stride, int height,
+    const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max);
+
+void ff_vvc_alf_filter_chroma_w16_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+    const uint8_t *src, ptrdiff_t src_stride, int height,
+    const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max);
+
+void ff_vvc_alf_filter_chroma_w4_16bpp_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+    const uint8_t *src, ptrdiff_t src_stride, int height,
+    const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t pixel_max);
+
+#endif //AVCODEC_X86_VVCDSP_H
+
diff --git a/libavcodec/x86/vvcdsp_init.c b/libavcodec/x86/vvcdsp_init.c
new file mode 100644
index 0000000000..c595ed55fa
--- /dev/null
+++ b/libavcodec/x86/vvcdsp_init.c
@@ -0,0 +1,81 @@
+/*
+ * VVC DSP init for x86
+ *
+ * Copyright (C) 2022 Nuo Mi
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vvcdec.h"
+#include "libavcodec/vvcdsp.h"
+#include "libavcodec/x86/vvcdsp.h"
+
+static void alf_filter_luma_10_avx2(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride,
+ int width, int height, const int8_t *filter, const int16_t *clip)
+{
+ const int ps = 1; //pixel shift
+ const int pixel_max = (1 << 10) - 1;
+ const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA;
+ int w;
+
+ for (w = 0; w + 16 <= width; w += 16) {
+ const int param_offset = w * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE;
+ ff_vvc_alf_filter_luma_w16_16bpp_avx2(dst + (w << ps), dst_stride, src + (w << ps), src_stride,
+ height, filter + param_offset, clip + param_offset, param_stride, pixel_max);
+ }
+ for ( /* nothing */; w < width; w += 4) {
+ const int param_offset = w * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE;
+ ff_vvc_alf_filter_luma_w4_16bpp_avx2(dst + (w << ps), dst_stride, src + (w << ps), src_stride,
+ height, filter + param_offset, clip + param_offset, param_stride, pixel_max);
+ }
+}
+
+static void alf_filter_chroma_10_avx2(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride,
+ int width, int height, const int8_t *filter, const int16_t *clip)
+{
+ const int ps = 1; //pixel shift
+ const int pixel_max = (1 << 10) - 1;
+ int w;
+
+ for (w = 0; w + 16 <= width; w += 16) {
+ ff_vvc_alf_filter_chroma_w16_16bpp_avx2(dst + (w << ps), dst_stride, src + (w << ps), src_stride,
+ height, filter, clip, 0, pixel_max);
+ }
+ for ( /* nothing */; w < width; w += 4) {
+ ff_vvc_alf_filter_chroma_w4_16bpp_avx2(dst + (w << ps), dst_stride, src + (w << ps), src_stride,
+ height, filter, clip, 0, pixel_max);
+ }
+}
+
+void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth)
+{
+ const int cpu_flags = av_get_cpu_flags();
+
+ if (bit_depth == 10) {
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ c->alf.filter[LUMA] = alf_filter_luma_10_avx2;
+ c->alf.filter[CHROMA] = alf_filter_chroma_10_avx2;
+ }
+ }
+}
+
--
2.25.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/2] vvcdec: alf, add avx2 for luma and chroma filter
2023-02-26 5:48 [FFmpeg-devel] [PATCH 1/2] vvcdec: alf, add avx2 for luma and chroma filter Nuo Mi
@ 2023-02-26 6:05 ` Nuo Mi
0 siblings, 0 replies; 2+ messages in thread
From: Nuo Mi @ 2023-02-26 6:05 UTC (permalink / raw)
To: ffmpeg-devel
On Sun, Feb 26, 2023 at 1:48 PM Nuo Mi <nuomi2021@gmail.com> wrote:
> got 11%~26% performance for 1080P and 4k video
>
> clip before after delta
> RitualDance_1920x1080_60_10_420_32_LD.26 35 43 22.8%
> RitualDance_1920x1080_60_10_420_37_RA.266 43 48 11.6%
> Tango2_3840x2160_60_10_420_27_LD.266 7.9 10 26.5%
> ---
> libavcodec/vvcdsp.c | 3 +
> libavcodec/x86/Makefile | 2 +
> libavcodec/x86/vvc_alf.asm | 301 +++++++++++++++++++++++++++++++++++
> libavcodec/x86/vvcdsp.h | 44 +++++
> libavcodec/x86/vvcdsp_init.c | 81 ++++++++++
> 5 files changed, 431 insertions(+)
> create mode 100644 libavcodec/x86/vvc_alf.asm
> create mode 100644 libavcodec/x86/vvcdsp.h
> create mode 100644 libavcodec/x86/vvcdsp_init.c
>
> diff --git a/libavcodec/vvcdsp.c b/libavcodec/vvcdsp.c
> index 801bd0189d..399631503f 100644
> --- a/libavcodec/vvcdsp.c
> +++ b/libavcodec/vvcdsp.c
> @@ -313,4 +313,7 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int
> bit_depth)
> VVC_DSP(8);
> break;
> }
> +#if ARCH_X86
> + ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> +#endif
> }
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 118daca333..23b2fb42bb 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -82,6 +82,7 @@ OBJS-$(CONFIG_VP9_DECODER) +=
> x86/vp9dsp_init.o \
> x86/vp9dsp_init_12bpp.o \
> x86/vp9dsp_init_16bpp.o
> OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
> +OBJS-$(CONFIG_VVC_DECODER) += x86/vvcdsp_init.o
>
>
> # GCC inline assembly optimizations
> @@ -202,4 +203,5 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER) +=
> x86/vp9intrapred.o \
> x86/vp9lpf_16bpp.o \
> x86/vp9mc.o \
> x86/vp9mc_16bpp.o
> +X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc_alf.o
> X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
> diff --git a/libavcodec/x86/vvc_alf.asm b/libavcodec/x86/vvc_alf.asm
> new file mode 100644
> index 0000000000..c3e4074be7
> --- /dev/null
> +++ b/libavcodec/x86/vvc_alf.asm
> @@ -0,0 +1,301 @@
>
> +;******************************************************************************
> +;* VVC Adaptive Loop Filter SIMD optimizations
> +;*
> +;* Copyright (c) 2023 Nuo Mi <nuomi2021@gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
>
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +%macro PARAM_SHUFFE 1
> +%assign i (%1 * 2)
> +%assign j ((i + 1) << 8) + (i)
> +param_shuffe_%+%1:
> +%rep 2
> + times 4 dw j
> + times 4 dw (j + 0x0808)
> +%endrep
> +%endmacro
> +
> +PARAM_SHUFFE 0
> +PARAM_SHUFFE 1
> +PARAM_SHUFFE 2
> +PARAM_SHUFFE 3
> +
> +dw_64: dd 64
> +
> +SECTION .text
> +
> +%if HAVE_AVX2_EXTERNAL
> +
> +;%1-%3 out
> +;%4 clip or filter
> +%macro LOAD_LUMA_PARAMS_W16 4
> + %ifidn clip, %4
> + movu m%1, [%4q + 0 * 32]
> + movu m%2, [%4q + 1 * 32]
> + movu m%3, [%4q + 2 * 32]
> + %elifidn filter, %4
> + movu xm%1, [%4q + 0 * 16]
> + movu xm%2, [%4q + 1 * 16]
> + movu xm%3, [%4q + 2 * 16]
> + pmovsxbw m%1, xm%1
> + pmovsxbw m%2, xm%2
> + pmovsxbw m%3, xm%3
> + %else
> + %error "need filter or clip for the fourth param"
> + %endif
> +%endmacro
> +
> +%macro LOAD_LUMA_PARAMS_W16 6
> + LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4
> + ;m%1 = 03 02 01 00
> + ;m%2 = 07 06 05 04
> + ;m%3 = 11 10 09 08
> +
> + vshufpd m%5, m%1, m%2, 0b0011 ;06 02 05 01
> + vshufpd m%6, m%3, m%5, 0b1001 ;06 10 01 09
> +
> + vshufpd m%1, m%1, m%6, 0b1100 ;06 03 09 00
> + vshufpd m%2, m%2, m%6, 0b0110 ;10 07 01 04
> + vshufpd m%3, m%3, m%5, 0b0110 ;02 11 05 08
> +
> + vpermpd m%1, m%1, 0b01_11_10_00 ;09 06 03 00
> + vshufpd m%2, m%2, m%2, 0b1001 ;10 07 04 01
> + vpermpd m%3, m%3, 0b10_00_01_11 ;11 08 05 02
> +%endmacro
> +
> +%macro LOAD_LUMA_PARAMS_W4 6
> + %ifidn clip, %4
> + movq xm%1, [%4q + 0 * 8]
> + movq xm%2, [%4q + 1 * 8]
> + movq xm%3, [%4q + 2 * 8]
> + %elifidn filter, %4
> + movd xm%1, [%4q + 0 * 4]
> + movd xm%2, [%4q + 1 * 4]
> + movd xm%3, [%4q + 2 * 4]
> + pmovsxbw xm%1, xm%1
> + pmovsxbw xm%2, xm%2
> + pmovsxbw xm%3, xm%3
> + %else
> + %error "need filter or clip for the fourth param"
> + %endif
> + vpbroadcastq m%1, xm%1
> + vpbroadcastq m%2, xm%2
> + vpbroadcastq m%3, xm%3
> +%endmacro
> +
> +;%1-%3 out
> +;%4 clip or filter
> +;%5, %6 tmp
> +%macro LOAD_LUMA_PARAMS 6
> + LOAD_LUMA_PARAMS_W %+ WIDTH %1, %2, %3, %4, %5, %6
> +%endmacro
> +
> +%macro LOAD_CHROMA_PARAMS 4
> + ;LOAD_CHROMA_PARAMS_W %+ WIDTH %1, %2, %3, %4
> + %ifidn clip, %3
> + movq xm%1, [%3q]
> + movd xm%2, [%3q + 8]
> + %elifidn filter, %3
> + movd xm%1, [%3q + 0]
> + pinsrw xm%2, [%3q + 4], 0
> + vpmovsxbw m%1, xm%1
> + vpmovsxbw m%2, xm%2
> + %else
> + %error "need filter or clip for the third param"
> + %endif
> + vpbroadcastq m%1, xm%1
> + vpbroadcastq m%2, xm%2
> +%endmacro
> +
> +%macro LOAD_PARAMS 0
> + %if LUMA
> + LOAD_LUMA_PARAMS 3, 4, 5, filter, 6, 7
> + LOAD_LUMA_PARAMS 6, 7, 8, clip, 9, 10
> + %else
> + LOAD_CHROMA_PARAMS 3, 4, filter, 5
> + LOAD_CHROMA_PARAMS 6, 7, clip, 8
> + %endif
> +%endmacro
> +
> +;FILTER(param_idx)
> +;input: m2, m9, m10
> +;output: m0, m1
> +;m12 ~ m15: tmp
> +%macro FILTER 1
> + %assign i (%1 % 4)
> + %assign j (%1 / 4 + 3)
> + %assign k (%1 / 4 + 6)
> + %define filters m%+j
> + %define clips m%+k
> +
> + movu m12, [param_shuffe_%+i]
> + pshufb m14, clips, m12 ;clip
> + pxor m13, m13
> + psubw m13, m14 ;-clip
> +
> + vpsubw m9, m2
> + CLIPW m9, m13, m14
> +
> + vpsubw m10, m2
> + CLIPW m10, m13, m14
> +
> + vpunpckhwd m15, m9, m10
> + vpunpcklwd m9, m9, m10
> +
> + pshufb m14, filters, m12 ;filter
> + vpunpcklwd m10, m14, m14
> + vpunpckhwd m14, m14, m14
> +
> + vpmaddwd m9, m10
> + vpmaddwd m14, m15
> +
> + paddd m0, m9
> + paddd m1, m14
> +%endmacro
> +
> +;FILTER(param_start, off0~off2)
> +%macro FILTER 4
> + %assign %%i (%1)
> + %rep 3
> + lea offsetq, [%2]
> + mov topq, srcq
> + mov bottomq, srcq
> + sub topq, offsetq
> + add bottomq, offsetq
> + LOAD_PIXELS 9, topq, 11
> + LOAD_PIXELS 10, bottomq, 12
> + FILTER %%i
> + %assign %%i %%i+1
> + %rotate 1
> + %endrep
> +%endmacro
> +
> +;filter pixels for luma and chroma
> +%macro FILTER 0
> + %if LUMA
> + FILTER 0, src_stride3q , src_strideq * 2 +
> ps, src_strideq * 2
> + FILTER 3, src_strideq * 2 - ps, src_strideq + 2 *
> ps, src_strideq + ps
> + FILTER 6, src_strideq, src_strideq - ps,
> src_strideq + -2 * ps
> + FILTER 9, src_stride0q + 3 * ps, src_stride0q + 2 *
> ps, src_stride0q + ps
> + %else
> + FILTER 0, src_strideq * 2, src_strideq + ps,
> src_strideq
> + FILTER 3, src_strideq - ps, src_stride0q + 2 *
> ps, src_stride0q + ps
> + %endif
> +%endmacro
> +
> +%define SHIFT 7
> +
> +;LOAD_PIXELS(dest, src, tmp)
> +%macro LOAD_PIXELS 3
> + %if WIDTH == 16
> + movu m%1, [%2]
> + %else
> + pinsrq xm%1, [%2], 0
> + pinsrq xm%1, [%2 + src_strideq], 1
> + pinsrq xm%3, [%2 + src_strideq * 2], 0
> + pinsrq xm%3, [%2 + src_stride3q], 1
> + vinsertf128 m%1, xm%3, 1
> + %endif
> +%endmacro
> +
> +;STORE_PIXELS(dest, src, tmp)
> +%macro STORE_PIXELS 3
> + %if WIDTH == 16
> + movu [%1], m%2
> + %else
> + pextrq [%1], xm%2, 0
> + pextrq [%1 + src_strideq], xm%2, 1
> + vperm2f128 m%2, m%2, 1
> + pextrq [%1 + src_strideq * 2], xm%2, 0
> + pextrq [%1 + src_stride3q], xm%2, 1
> + %endif
> +%endmacro
> +
> +;FILTER_LUMA(width)
> +%macro ALF_FILTER_16BPP 2
> +%ifidn %1, luma
> + %xdefine LUMA 1
> +%else
> + %xdefine LUMA 0
> +%endif
> +%xdefine WIDTH %2
> +; void vvc_alf_filter_luma_w%1_16bpp_avx2(uint8_t *dst, ptrdiff_t
> dst_stride,
> +; const uint8_t *src, ptrdiff_t src_stride, int height,
> +; const int8_t *filter, const int16_t *clip, ptrdiff_t stride,
> uint16_t pixel_max);
> +
> +; see c code for p0 to p6
> +
> +INIT_YMM avx2
> +cglobal vvc_alf_filter_%1_w%2_16bpp, 9, 15, 15, dst, dst_stride, src,
> src_stride, height, filter, clip, stride, pixel_max, \
> + top, bottom, offset, src_stride3, src_stride0
> +%define ps 2
> + lea src_stride3q, [src_strideq * 2 + src_strideq]
> + mov src_stride0q, 0
> + shr heightq, 2
> +
> +.loop:
> + LOAD_PARAMS
> +
> +;we need loop 4 times for a 16x4 block, 1 time for a 4x4 block
> +%define rep_num (WIDTH / 4)
> +%define lines (4 / rep_num)
> +%rep rep_num
> + VPBROADCASTD m0, [dw_64]
> + VPBROADCASTD m1, [dw_64]
> +
> + LOAD_PIXELS 2, srcq, 9 ;p0
> +
> + FILTER
> +
> + vpsrad m0, SHIFT
> + vpsrad m1, SHIFT
> +
> + vpackssdw m0, m0, m1
> + paddw m0, m2
> +
> + ;clip to pixel
> + pinsrw xm2, pixel_maxw, 0
> + vpbroadcastw m2, xm2
> + pxor m1, m1
> + CLIPW m0, m1, m2
> +
> + STORE_PIXELS dstq, 0, 1
> +
> + lea srcq, [srcq + lines * src_strideq]
> + lea dstq, [dstq + lines * dst_strideq]
> +%endrep
> +
> + lea filterq, [filterq + strideq]
> + lea clipq, [clipq + 2 * strideq]
> +
> + dec heightq
> + jg .loop
> + RET
> +%endmacro
> +
> +ALF_FILTER_16BPP luma, 16
> +ALF_FILTER_16BPP luma, 4
> +ALF_FILTER_16BPP chroma, 16
> +ALF_FILTER_16BPP chroma, 4
> +
> +%endif
> +
> diff --git a/libavcodec/x86/vvcdsp.h b/libavcodec/x86/vvcdsp.h
> new file mode 100644
> index 0000000000..8589d4ae97
> --- /dev/null
> +++ b/libavcodec/x86/vvcdsp.h
> @@ -0,0 +1,44 @@
> +/*
> + * VVC DSP for x86
> + *
> + * Copyright (C) 2022 Nuo Mi
> + *
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_X86_VVCDSP_H
> +#define AVCODEC_X86_VVCDSP_H
> +
> +void ff_vvc_alf_filter_luma_w16_16bpp_avx2(uint8_t *dst, ptrdiff_t
> dst_stride,
> + const uint8_t *src, ptrdiff_t src_stride, int height,
> + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t
> pixel_max);
> +
> +void ff_vvc_alf_filter_luma_w4_16bpp_avx2(uint8_t *dst, ptrdiff_t
> dst_stride,
> + const uint8_t *src, ptrdiff_t src_stride, int height,
> + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t
> pixel_max);
> +
> +void ff_vvc_alf_filter_chroma_w16_16bpp_avx2(uint8_t *dst, ptrdiff_t
> dst_stride,
> + const uint8_t *src, ptrdiff_t src_stride, int height,
> + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t
> pixel_max);
> +
> +void ff_vvc_alf_filter_chroma_w4_16bpp_avx2(uint8_t *dst, ptrdiff_t
> dst_stride,
> + const uint8_t *src, ptrdiff_t src_stride, int height,
> + const int8_t *filter, const int16_t *clip, ptrdiff_t stride, uint16_t
> pixel_max);
> +
> +#endif //AVCODEC_X86_VVCDSP_H
> +
> diff --git a/libavcodec/x86/vvcdsp_init.c b/libavcodec/x86/vvcdsp_init.c
> new file mode 100644
> index 0000000000..c595ed55fa
> --- /dev/null
> +++ b/libavcodec/x86/vvcdsp_init.c
> @@ -0,0 +1,81 @@
> +/*
> + * VVC DSP init for x86
> + *
> + * Copyright (C) 2022 Nuo Mi
> + *
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/vvcdec.h"
> +#include "libavcodec/vvcdsp.h"
> +#include "libavcodec/x86/vvcdsp.h"
> +
> +static void alf_filter_luma_10_avx2(uint8_t *dst, ptrdiff_t dst_stride,
> const uint8_t *src, ptrdiff_t src_stride,
> + int width, int height, const int8_t *filter, const int16_t *clip)
> +{
> + const int ps = 1;
> //pixel shift
> + const int pixel_max = (1 << 10) - 1;
> + const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA;
> + int w;
> +
> + for (w = 0; w + 16 <= width; w += 16) {
> + const int param_offset = w * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE;
> + ff_vvc_alf_filter_luma_w16_16bpp_avx2(dst + (w << ps),
> dst_stride, src + (w << ps), src_stride,
> + height, filter + param_offset, clip + param_offset,
> param_stride, pixel_max);
> + }
> + for ( /* nothing */; w < width; w += 4) {
> + const int param_offset = w * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE;
> + ff_vvc_alf_filter_luma_w4_16bpp_avx2(dst + (w << ps), dst_stride,
> src + (w << ps), src_stride,
> + height, filter + param_offset, clip + param_offset,
> param_stride, pixel_max);
> + }
> +}
> +
> +static void alf_filter_chroma_10_avx2(uint8_t *dst, ptrdiff_t dst_stride,
> const uint8_t *src, ptrdiff_t src_stride,
> + int width, int height, const int8_t *filter, const int16_t *clip)
> +{
> + const int ps = 1;
> //pixel shift
> + const int pixel_max = (1 << 10) - 1;
> + int w;
> +
> + for (w = 0; w + 16 <= width; w += 16) {
> + ff_vvc_alf_filter_chroma_w16_16bpp_avx2(dst + (w << ps),
> dst_stride, src + (w << ps), src_stride,
> + height, filter, clip, 0, pixel_max);
> + }
> + for ( /* nothing */; w < width; w += 4) {
> + ff_vvc_alf_filter_chroma_w4_16bpp_avx2(dst + (w << ps),
> dst_stride, src + (w << ps), src_stride,
> + height, filter, clip, 0, pixel_max);
> + }
> +}
> +
> +void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth)
> +{
> + const int cpu_flags = av_get_cpu_flags();
> +
> + if (bit_depth == 10) {
> + if (EXTERNAL_AVX2(cpu_flags)) {
> + c->alf.filter[LUMA] = alf_filter_luma_10_avx2;
> + c->alf.filter[CHROMA] = alf_filter_chroma_10_avx2;
> + }
> + }
> +}
> +
> --
> 2.25.1
>
Hi experts,
Please help review the ALF filter for VVC. ALF is the most time-consuming
stage in VVC decoding: it accounts for about 30~60% of the time in the C
code, and the filter is one part of it. Please help review this patch and
suggest further performance improvements.
For each 4x4 pixel block there are 12 coefficient (int8_t) and 12 clip
(int16_t) parameters. For each pixel we need to subtract and clip 24 times
and multiply 12 times. The current AVX2 code processes one 16x4 or one 4x4
block per loop iteration.
Please check [1] for a working build and [2] for C code.
Thank you very much.
[1] https://github.com/ffvvc/FFmpeg/pull/42/commits
[2]
https://github.com/ffvvc/FFmpeg/blob/main/libavcodec/vvc_filter_template.c#L246
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2023-02-26 6:06 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-26 5:48 [FFmpeg-devel] [PATCH 1/2] vvcdec: alf, add avx2 for luma and chroma filter Nuo Mi
2023-02-26 6:05 ` Nuo Mi
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git