* [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI @ 2022-11-04 8:29 bin.wang-at-intel.com 2022-11-07 5:24 ` Xiang, Haihao ` (2 more replies) 0 siblings, 3 replies; 13+ messages in thread From: bin.wang-at-intel.com @ 2022-11-04 8:29 UTC (permalink / raw) To: ffmpeg-devel; +Cc: bwang30 From: bwang30 <bin.wang@intel.com> This commit enabled assembly code with intel AVX512 VNNI and added unit test for sobel filter sobel_c: 4537 sobel_avx512icl 2136 Signed-off-by: bwang30 <bin.wang@intel.com> --- libavfilter/convolution.h | 74 +++++++++++++ libavfilter/vf_convolution.c | 91 +++------------- libavfilter/x86/vf_convolution.asm | 147 ++++++++++++++++++++++++++ libavfilter/x86/vf_convolution_init.c | 18 ++++ tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 3 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vf_convolution.c | 104 ++++++++++++++++++ tests/fate/checkasm.mak | 1 + 9 files changed, 362 insertions(+), 78 deletions(-) create mode 100644 tests/checkasm/vf_convolution.c diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h index 88aabe9a20..e44bfb5da8 100644 --- a/libavfilter/convolution.h +++ b/libavfilter/convolution.h @@ -21,6 +21,7 @@ #ifndef AVFILTER_CONVOLUTION_H #define AVFILTER_CONVOLUTION_H #include "avfilter.h" +#include "libavutil/intreadwrite.h" enum MatrixMode { MATRIX_SQUARE, @@ -61,4 +62,77 @@ typedef struct ConvolutionContext { } ConvolutionContext; void ff_convolution_init_x86(ConvolutionContext *s); +void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes); + +static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int stride, + int x, int w, int y, int h, int bpc) +{ + int i; + + for (i = 0; i < 9; i++) { + int xoff = FFABS(x + ((i % 3) - 1)); + int yoff = FFABS(y + (i / 3) - 1); + + xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; + yoff = yoff >= h ? 
2 * h - 1 - yoff : yoff; + + c[i] = src + xoff * bpc + yoff * stride; + } +} + +static void filter_sobel(uint8_t *dst, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak, int radius, + int dstride, int stride, int size) +{ + const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; + const uint8_t *c3 = c[3], *c5 = c[5]; + const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; + int x; + + for (x = 0; x < width; x++) { + float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + + c6[x] * 1 + c7[x] * 2 + c8[x] * 1; + float sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + + c5[x] * 2 + c6[x] * -1 + c8[x] * 1; + + dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta); + } +} + +static void filter16_sobel(uint8_t *dstp, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak, int radius, + int dstride, int stride, int size) +{ + uint16_t *dst = (uint16_t *)dstp; + int x; + + for (x = 0; x < width; x++) { + float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * -2 + AV_RN16A(&c[2][2 * x]) * -1 + + AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) * 2 + AV_RN16A(&c[8][2 * x]) * 1; + float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) * 1 + AV_RN16A(&c[3][2 * x]) * -2 + + AV_RN16A(&c[5][2 * x]) * 2 + AV_RN16A(&c[6][2 * x]) * -1 + AV_RN16A(&c[8][2 * x]) * 1; + + dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, peak); + } +} + +static av_unused void ff_sobel_init(ConvolutionContext *s, int depth, int nb_planes) +{ + for (int i = 0; i < 4; i++) { + s->filter[i] = filter_sobel; + s->copy[i] = !((1 << i) & s->planes); + s->size[i] = 3; + s->setup[i] = setup_3x3; + s->rdiv[i] = s->scale; + s->bias[i] = s->delta; + } + if (s->depth > 8) + for (int i = 0; i < 4; i++) + s->filter[i] = filter16_sobel; +#if ARCH_X86_64 + ff_sobel_init_x86(s, depth, nb_planes); +#endif +} #endif diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c index 
9a9c099e6d..7762fa2a05 100644 --- a/libavfilter/vf_convolution.c +++ b/libavfilter/vf_convolution.c @@ -139,24 +139,6 @@ static void filter16_roberts(uint8_t *dstp, int width, } } -static void filter16_sobel(uint8_t *dstp, int width, - float scale, float delta, const int *const matrix, - const uint8_t *c[], int peak, int radius, - int dstride, int stride, int size) -{ - uint16_t *dst = (uint16_t *)dstp; - int x; - - for (x = 0; x < width; x++) { - float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * -2 + AV_RN16A(&c[2][2 * x]) * -1 + - AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) * 2 + AV_RN16A(&c[8][2 * x]) * 1; - float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) * 1 + AV_RN16A(&c[3][2 * x]) * -2 + - AV_RN16A(&c[5][2 * x]) * 2 + AV_RN16A(&c[6][2 * x]) * -1 + AV_RN16A(&c[8][2 * x]) * 1; - - dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, peak); - } -} - static void filter16_scharr(uint8_t *dstp, int width, float scale, float delta, const int *const matrix, const uint8_t *c[], int peak, int radius, @@ -261,26 +243,6 @@ static void filter_roberts(uint8_t *dst, int width, } } -static void filter_sobel(uint8_t *dst, int width, - float scale, float delta, const int *const matrix, - const uint8_t *c[], int peak, int radius, - int dstride, int stride, int size) -{ - const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; - const uint8_t *c3 = c[3], *c5 = c[5]; - const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; - int x; - - for (x = 0; x < width; x++) { - float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + - c6[x] * 1 + c7[x] * 2 + c8[x] * 1; - float sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + - c5[x] * 2 + c6[x] * -1 + c8[x] * 1; - - dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta); - } -} - static void filter_scharr(uint8_t *dst, int width, float scale, float delta, const int *const matrix, const uint8_t *c[], int peak, int radius, @@ -552,22 +514,6 @@ static void filter_column(uint8_t *dst, int 
height, } } -static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int stride, - int x, int w, int y, int h, int bpc) -{ - int i; - - for (i = 0; i < 9; i++) { - int xoff = FFABS(x + ((i % 3) - 1)); - int yoff = FFABS(y + (i / 3) - 1); - - xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; - yoff = yoff >= h ? 2 * h - 1 - yoff : yoff; - - c[i] = src + xoff * bpc + yoff * stride; - } -} - static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src, int stride, int x, int w, int y, int h, int bpc) { @@ -708,6 +654,18 @@ static int param_init(AVFilterContext *ctx) const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); int p, i; + s->depth = desc->comp[0].depth; + s->max = (1 << s->depth) - 1; + + s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w); + s->planewidth[0] = s->planewidth[3] = inlink->w; + s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h); + s->planeheight[0] = s->planeheight[3] = inlink->h; + + s->nb_planes = av_pix_fmt_count_planes(inlink->format); + s->nb_threads = ff_filter_get_nb_threads(ctx); + s->bpc = (s->depth + 7) / 8; + if (!strcmp(ctx->filter->name, "convolution")) { for (i = 0; i < 4; i++) { int *matrix = (int *)s->matrix[i]; @@ -804,14 +762,7 @@ static int param_init(AVFilterContext *ctx) s->bias[i] = s->delta; } } else if (!strcmp(ctx->filter->name, "sobel")) { - for (i = 0; i < 4; i++) { - s->filter[i] = filter_sobel; - s->copy[i] = !((1 << i) & s->planes); - s->size[i] = 3; - s->setup[i] = setup_3x3; - s->rdiv[i] = s->scale; - s->bias[i] = s->delta; - } + ff_sobel_init(s, s->depth, s->nb_planes); } else if (!strcmp(ctx->filter->name, "kirsch")) { for (i = 0; i < 4; i++) { s->filter[i] = filter_kirsch; @@ -832,18 +783,6 @@ static int param_init(AVFilterContext *ctx) } } - s->depth = desc->comp[0].depth; - s->max = (1 << s->depth) - 1; - - s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w); - 
s->planewidth[0] = s->planewidth[3] = inlink->w; - s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h); - s->planeheight[0] = s->planeheight[3] = inlink->h; - - s->nb_planes = av_pix_fmt_count_planes(inlink->format); - s->nb_threads = ff_filter_get_nb_threads(ctx); - s->bpc = (s->depth + 7) / 8; - if (!strcmp(ctx->filter->name, "convolution")) { if (s->depth > 8) { for (p = 0; p < s->nb_planes; p++) { @@ -870,10 +809,6 @@ static int param_init(AVFilterContext *ctx) if (s->depth > 8) for (p = 0; p < s->nb_planes; p++) s->filter[p] = filter16_roberts; - } else if (!strcmp(ctx->filter->name, "sobel")) { - if (s->depth > 8) - for (p = 0; p < s->nb_planes; p++) - s->filter[p] = filter16_sobel; } else if (!strcmp(ctx->filter->name, "kirsch")) { if (s->depth > 8) for (p = 0; p < s->nb_planes; p++) diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm index 754d4d1064..c912d56752 100644 --- a/libavfilter/x86/vf_convolution.asm +++ b/libavfilter/x86/vf_convolution.asm @@ -22,6 +22,18 @@ SECTION_RODATA half: dd 0.5 +data_p1: dd 1 +data_n1: dd -1 +data_p2: dd 2 +data_n2: dd -2 + +ALIGN 64 +sobel_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51 + db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55 + db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59 + db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63 +sobel_mulA: db -1, 1, -2, 2 +sobel_mulB: db 1, -1, 2, -2 SECTION .text @@ -154,3 +166,138 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c INIT_XMM sse4 FILTER_3X3 %endif + +%macro SOBEL_MUL 2 + movzx ptrd, byte [c%1q + xq] + imul ptrd, [%2] + add rd, ptrd +%endmacro + +%macro SOBEL_ADD 1 + movzx ptrd, byte [c%1q + xq] + add rd, ptrd +%endmacro + +; void filter_sobel_avx512(uint8_t *dst, int width, +; float scale, float delta, const int *const matrix, +; const uint8_t *c[], int peak, int radius, +; int dstride, int stride) 
+%macro FILTER_SOBEL 0 +%if UNIX64 +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x +%else +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x +%endif +%if WIN64 + SWAP xmm0, xmm2 + SWAP xmm1, xmm3 + mov r2q, matrixmp + mov r3q, ptrmp + DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x +%endif + movsxdifnidn widthq, widthd + VBROADCASTSS m0, xmm0 + VBROADCASTSS m1, xmm1 + pxor m6, m6 + mov c0q, [ptrq + 0*gprsize] + mov c1q, [ptrq + 1*gprsize] + mov c2q, [ptrq + 2*gprsize] + mov c3q, [ptrq + 3*gprsize] + mov c4q, [ptrq + 4*gprsize] + mov c5q, [ptrq + 5*gprsize] + mov c6q, [ptrq + 6*gprsize] + mov c7q, [ptrq + 7*gprsize] + mov c8q, [ptrq + 8*gprsize] + + xor xq, xq + cmp widthq, mmsize/4 + jl .loop2 + + mov rq, widthq + and rq, mmsize/4-1 + sub widthq, rq + + mova m6, [sobel_perm] +.loop1: + movu xm3, [c2q + xq] + pmovzxbd m5, [c0q + xq] + vinserti32x4 ym3, [c6q + xq], 1 + pmovzxbd m4, [c8q + xq] + vinserti32x4 m2, m3, [c1q + xq], 2 + vinserti32x4 m3, [c5q + xq], 2 + vinserti32x4 m2, [c7q + xq], 3 + vinserti32x4 m3, [c3q + xq], 3 + vpermb m2, m6, m2 + psubd m4, m5 + vpermb m3, m6, m3 + mova m5, m4 + vpdpbusd m4, m2, [sobel_mulA] {1to16} + vpdpbusd m5, m3, [sobel_mulB] {1to16} + + cvtdq2ps m4, m4 + mulps m4, m4 + + cvtdq2ps m5, m5 + VFMADD231PS m4, m5, m5 + + sqrtps m4, m4 + fmaddps m4, m4, m0, m1 + cvttps2dq m4, m4 + vpmovusdb [dstq + xq], m4 + + add xq, mmsize/4 + cmp xq, widthq + jl .loop1 + + add widthq, rq + cmp xq, widthq + jge .end + +.loop2: + xor rd, rd + pxor m4, m4 + + ;Gx + SOBEL_MUL 0, data_n1 + SOBEL_MUL 1, data_n2 + SOBEL_MUL 2, data_n1 + SOBEL_ADD 6 + SOBEL_MUL 7, data_p2 + SOBEL_ADD 8 + + cvtsi2ss xmm4, rd + mulss xmm4, xmm4 + + xor rd, rd + ;Gy + SOBEL_MUL 0, data_n1 + SOBEL_ADD 2 + SOBEL_MUL 3, data_n2 + SOBEL_MUL 5, data_p2 + SOBEL_MUL 6, data_n1 + SOBEL_ADD 8 + + cvtsi2ss xmm5, rd + fmaddss xmm4, xmm5, xmm5, xmm4 
+ + sqrtps xmm4, xmm4 + fmaddss xmm4, xmm4, xmm0, xmm1 ;sum = sum * rdiv + bias + cvttps2dq xmm4, xmm4 ; trunc to integer + packssdw xmm4, xmm4 + packuswb xmm4, xmm4 + movd rd, xmm4 + mov [dstq + xq], rb + + add xq, 1 + cmp xq, widthq + jl .loop2 +.end: + RET +%endmacro + +%if ARCH_X86_64 +%if HAVE_AVX512ICL_EXTERNAL +INIT_ZMM avx512icl +FILTER_SOBEL +%endif +%endif diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c index b78a47d02b..bff10ca1a4 100644 --- a/libavfilter/x86/vf_convolution_init.c +++ b/libavfilter/x86/vf_convolution_init.c @@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width, const uint8_t *c[], int peak, int radius, int dstride, int stride, int size); +void ff_filter_sobel_avx512icl(uint8_t *dst, int width, + float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak, int radius, + int dstride, int stride, int size); + av_cold void ff_convolution_init_x86(ConvolutionContext *s) { #if ARCH_X86_64 @@ -44,3 +49,16 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s) } #endif } + +av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes) +{ +#if ARCH_X86_64 + int cpu_flags = av_get_cpu_flags(); + for (int i = 0; i < nb_planes; i++) { + if (depth == 8) { + if (EXTERNAL_AVX512ICL(cpu_flags)) + s->filter[i] = ff_filter_sobel_avx512icl; + } + } +#endif +} diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 62d6616faf..a6f06c7007 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -46,6 +46,7 @@ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER) += vf_threshold.o AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o +AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 421bd096c5..3eb4780a64 100644 
--- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -197,6 +197,9 @@ static const struct { #if CONFIG_THRESHOLD_FILTER { "vf_threshold", checkasm_check_vf_threshold }, #endif + #if CONFIG_SOBEL_FILTER + { "vf_sobel", checkasm_check_vf_sobel }, + #endif #endif #if CONFIG_SWSCALE { "sw_gbrp", checkasm_check_sw_gbrp }, diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index ee9151410e..214918e7ea 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -86,6 +86,7 @@ void checkasm_check_vf_eq(void); void checkasm_check_vf_gblur(void); void checkasm_check_vf_hflip(void); void checkasm_check_vf_threshold(void); +void checkasm_check_vf_sobel(void); void checkasm_check_vp8dsp(void); void checkasm_check_vp9dsp(void); void checkasm_check_videodsp(void); diff --git a/tests/checkasm/vf_convolution.c b/tests/checkasm/vf_convolution.c new file mode 100644 index 0000000000..007865863e --- /dev/null +++ b/tests/checkasm/vf_convolution.c @@ -0,0 +1,104 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include <string.h> +#include "checkasm.h" +#include "libavfilter/avfilter.h" +#include "libavfilter/convolution.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" + +#define WIDTH 512 +#define HEIGHT 512 +#define SRC_STRIDE 512 +#define PIXELS (WIDTH * HEIGHT) + +#define randomize_buffers(buf, size) \ + do { \ + int j; \ + uint8_t *tmp_buf = (uint8_t *)buf;\ + for (j = 0; j< size; j++) \ + tmp_buf[j] = rnd() & 0xFF; \ + } while (0) + +static void check_sobel(const char * report_name) +{ + LOCAL_ALIGNED_32(uint8_t, src, [PIXELS]); + LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]); + LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]); + const int height = WIDTH; + const int width = HEIGHT; + const int stride = SRC_STRIDE; + const int dstride = SRC_STRIDE; + int mode = 0; + const uint8_t *c[49]; + const int radius = 1; + const int bpc = 1; + const int step = mode == MATRIX_COLUMN ? 16 : 1; + const int slice_start = 0; + const int slice_end = height; + int y; + const int sizew = mode == MATRIX_COLUMN ? height : width; + float scale = 2; + float delta = 10; + + ConvolutionContext s; + + declare_func(void, uint8_t *dst, int width, float scale, float delta, const int *const matrix, + const uint8_t *c[], int peak, int radius, int dstride, int stride, int size); + + s.scale = scale; + s.delta = delta; + s.depth = 8; + s.nb_planes = 3; + s.planes = 15; + ff_sobel_init(&s, s.depth, s.nb_planes); + + memset(dst_ref, 0, PIXELS); + memset(dst_new, 0, PIXELS); + randomize_buffers(src, PIXELS); + + if (check_func(s.filter[0], "%s", report_name)) { + for (y = slice_start; y < slice_end; y += step) { + const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc : radius * bpc; + const int yoff = mode == MATRIX_COLUMN ? 
radius * dstride : 0; + + s.setup[0](radius, c, src, stride, radius, width, y, height, bpc); + call_ref(dst_ref + yoff + xoff, sizew - 2 * radius, + scale, delta, NULL, c, 0, radius, + dstride, stride, slice_end - step); + call_new(dst_new + yoff + xoff, sizew - 2 * radius, + scale, delta, NULL, c, 0, radius, + dstride, stride, slice_end - step); + if (memcmp(dst_ref + yoff + xoff, dst_new + yoff + xoff, slice_end - step)) + fail(); + bench_new(dst_new + yoff + xoff, sizew - 2 * radius, + scale, delta, NULL, c, 0, radius, + dstride, stride, slice_end - step); + if (mode != MATRIX_COLUMN) + dst_ref += dstride; + } + } + +} + +void checkasm_check_vf_sobel(void) +{ + check_sobel("sobel"); + report("convolution:sobel"); +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index aa9b288e12..a4e95541f5 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -43,6 +43,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp \ fate-checkasm-vf_hflip \ fate-checkasm-vf_nlmeans \ fate-checkasm-vf_threshold \ + fate-checkasm-vf_sobel \ fate-checkasm-videodsp \ fate-checkasm-vorbisdsp \ fate-checkasm-vp8dsp \ -- 2.27.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-04 8:29 [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI bin.wang-at-intel.com @ 2022-11-07 5:24 ` Xiang, Haihao 2022-11-11 3:00 ` Xiang, Haihao 2022-11-14 2:42 ` James Almer 2022-11-14 12:54 ` James Almer 2 siblings, 1 reply; 13+ messages in thread From: Xiang, Haihao @ 2022-11-07 5:24 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Wang, Bin On Fri, 2022-11-04 at 16:29 +0800, bin.wang-at-intel.com@ffmpeg.org wrote: > From: bwang30 <bin.wang@intel.com> > > This commit enabled assembly code with intel AVX512 VNNI and added unit test > for sobel filter > > sobel_c: 4537 > sobel_avx512icl 2136 > > Signed-off-by: bwang30 <bin.wang@intel.com> > --- > libavfilter/convolution.h | 74 +++++++++++++ > libavfilter/vf_convolution.c | 91 +++------------- > libavfilter/x86/vf_convolution.asm | 147 ++++++++++++++++++++++++++ > libavfilter/x86/vf_convolution_init.c | 18 ++++ > tests/checkasm/Makefile | 1 + > tests/checkasm/checkasm.c | 3 + > tests/checkasm/checkasm.h | 1 + > tests/checkasm/vf_convolution.c | 104 ++++++++++++++++++ > tests/fate/checkasm.mak | 1 + > 9 files changed, 362 insertions(+), 78 deletions(-) > create mode 100644 tests/checkasm/vf_convolution.c > > diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h > index 88aabe9a20..e44bfb5da8 100644 > --- a/libavfilter/convolution.h > +++ b/libavfilter/convolution.h > @@ -21,6 +21,7 @@ > #ifndef AVFILTER_CONVOLUTION_H > #define AVFILTER_CONVOLUTION_H > #include "avfilter.h" > +#include "libavutil/intreadwrite.h" > > enum MatrixMode { > MATRIX_SQUARE, > @@ -61,4 +62,77 @@ typedef struct ConvolutionContext { > } ConvolutionContext; > > void ff_convolution_init_x86(ConvolutionContext *s); > +void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes); > + > +static void setup_3x3(int 
radius, const uint8_t *c[], const uint8_t *src, int > stride, > + int x, int w, int y, int h, int bpc) > +{ > + int i; > + > + for (i = 0; i < 9; i++) { > + int xoff = FFABS(x + ((i % 3) - 1)); > + int yoff = FFABS(y + (i / 3) - 1); > + > + xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; > + yoff = yoff >= h ? 2 * h - 1 - yoff : yoff; > + > + c[i] = src + xoff * bpc + yoff * stride; > + } > +} > + > +static void filter_sobel(uint8_t *dst, int width, > + float scale, float delta, const int *const matrix, > + const uint8_t *c[], int peak, int radius, > + int dstride, int stride, int size) > +{ > + const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; > + const uint8_t *c3 = c[3], *c5 = c[5]; > + const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; > + int x; > + > + for (x = 0; x < width; x++) { > + float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + > + c6[x] * 1 + c7[x] * 2 + c8[x] * 1; > + float sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + > + c5[x] * 2 + c6[x] * -1 + c8[x] * 1; > + > + dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta); > + } > +} > + > +static void filter16_sobel(uint8_t *dstp, int width, > + float scale, float delta, const int *const matrix, > + const uint8_t *c[], int peak, int radius, > + int dstride, int stride, int size) > +{ > + uint16_t *dst = (uint16_t *)dstp; > + int x; > + > + for (x = 0; x < width; x++) { > + float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * > -2 + AV_RN16A(&c[2][2 * x]) * -1 + > + AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) > * 2 + AV_RN16A(&c[8][2 * x]) * 1; > + float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) > * 1 + AV_RN16A(&c[3][2 * x]) * -2 + > + AV_RN16A(&c[5][2 * x]) * 2 + AV_RN16A(&c[6][2 * x]) * > -1 + AV_RN16A(&c[8][2 * x]) * 1; > + > + dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, > peak); > + } > +} > + > +static av_unused void ff_sobel_init(ConvolutionContext *s, int depth, int > nb_planes) > +{ > + for (int i = 0; i < 4; i++) { > + 
s->filter[i] = filter_sobel; > + s->copy[i] = !((1 << i) & s->planes); > + s->size[i] = 3; > + s->setup[i] = setup_3x3; > + s->rdiv[i] = s->scale; > + s->bias[i] = s->delta; > + } > + if (s->depth > 8) > + for (int i = 0; i < 4; i++) > + s->filter[i] = filter16_sobel; > +#if ARCH_X86_64 > + ff_sobel_init_x86(s, depth, nb_planes); > +#endif > +} > #endif > diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c > index 9a9c099e6d..7762fa2a05 100644 > --- a/libavfilter/vf_convolution.c > +++ b/libavfilter/vf_convolution.c > @@ -139,24 +139,6 @@ static void filter16_roberts(uint8_t *dstp, int width, > } > } > > -static void filter16_sobel(uint8_t *dstp, int width, > - float scale, float delta, const int *const matrix, > - const uint8_t *c[], int peak, int radius, > - int dstride, int stride, int size) > -{ > - uint16_t *dst = (uint16_t *)dstp; > - int x; > - > - for (x = 0; x < width; x++) { > - float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * > -2 + AV_RN16A(&c[2][2 * x]) * -1 + > - AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) > * 2 + AV_RN16A(&c[8][2 * x]) * 1; > - float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) > * 1 + AV_RN16A(&c[3][2 * x]) * -2 + > - AV_RN16A(&c[5][2 * x]) * 2 + AV_RN16A(&c[6][2 * x]) * > -1 + AV_RN16A(&c[8][2 * x]) * 1; > - > - dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, > peak); > - } > -} > - > static void filter16_scharr(uint8_t *dstp, int width, > float scale, float delta, const int *const > matrix, > const uint8_t *c[], int peak, int radius, > @@ -261,26 +243,6 @@ static void filter_roberts(uint8_t *dst, int width, > } > } > > -static void filter_sobel(uint8_t *dst, int width, > - float scale, float delta, const int *const matrix, > - const uint8_t *c[], int peak, int radius, > - int dstride, int stride, int size) > -{ > - const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; > - const uint8_t *c3 = c[3], *c5 = c[5]; > - const uint8_t *c6 = c[6], *c7 = c[7], *c8 
= c[8]; > - int x; > - > - for (x = 0; x < width; x++) { > - float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + > - c6[x] * 1 + c7[x] * 2 + c8[x] * 1; > - float sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + > - c5[x] * 2 + c6[x] * -1 + c8[x] * 1; > - > - dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + delta); > - } > -} > - > static void filter_scharr(uint8_t *dst, int width, > float scale, float delta, const int *const matrix, > const uint8_t *c[], int peak, int radius, > @@ -552,22 +514,6 @@ static void filter_column(uint8_t *dst, int height, > } > } > > -static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, int > stride, > - int x, int w, int y, int h, int bpc) > -{ > - int i; > - > - for (i = 0; i < 9; i++) { > - int xoff = FFABS(x + ((i % 3) - 1)); > - int yoff = FFABS(y + (i / 3) - 1); > - > - xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; > - yoff = yoff >= h ? 2 * h - 1 - yoff : yoff; > - > - c[i] = src + xoff * bpc + yoff * stride; > - } > -} > - > static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src, int > stride, > int x, int w, int y, int h, int bpc) > { > @@ -708,6 +654,18 @@ static int param_init(AVFilterContext *ctx) > const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); > int p, i; > > + s->depth = desc->comp[0].depth; > + s->max = (1 << s->depth) - 1; > + > + s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc- > >log2_chroma_w); > + s->planewidth[0] = s->planewidth[3] = inlink->w; > + s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc- > >log2_chroma_h); > + s->planeheight[0] = s->planeheight[3] = inlink->h; > + > + s->nb_planes = av_pix_fmt_count_planes(inlink->format); > + s->nb_threads = ff_filter_get_nb_threads(ctx); > + s->bpc = (s->depth + 7) / 8; > + > if (!strcmp(ctx->filter->name, "convolution")) { > for (i = 0; i < 4; i++) { > int *matrix = (int *)s->matrix[i]; > @@ -804,14 +762,7 @@ static int param_init(AVFilterContext *ctx) > 
s->bias[i] = s->delta; > } > } else if (!strcmp(ctx->filter->name, "sobel")) { > - for (i = 0; i < 4; i++) { > - s->filter[i] = filter_sobel; > - s->copy[i] = !((1 << i) & s->planes); > - s->size[i] = 3; > - s->setup[i] = setup_3x3; > - s->rdiv[i] = s->scale; > - s->bias[i] = s->delta; > - } > + ff_sobel_init(s, s->depth, s->nb_planes); > } else if (!strcmp(ctx->filter->name, "kirsch")) { > for (i = 0; i < 4; i++) { > s->filter[i] = filter_kirsch; > @@ -832,18 +783,6 @@ static int param_init(AVFilterContext *ctx) > } > } > > - s->depth = desc->comp[0].depth; > - s->max = (1 << s->depth) - 1; > - > - s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc- > >log2_chroma_w); > - s->planewidth[0] = s->planewidth[3] = inlink->w; > - s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc- > >log2_chroma_h); > - s->planeheight[0] = s->planeheight[3] = inlink->h; > - > - s->nb_planes = av_pix_fmt_count_planes(inlink->format); > - s->nb_threads = ff_filter_get_nb_threads(ctx); > - s->bpc = (s->depth + 7) / 8; > - > if (!strcmp(ctx->filter->name, "convolution")) { > if (s->depth > 8) { > for (p = 0; p < s->nb_planes; p++) { > @@ -870,10 +809,6 @@ static int param_init(AVFilterContext *ctx) > if (s->depth > 8) > for (p = 0; p < s->nb_planes; p++) > s->filter[p] = filter16_roberts; > - } else if (!strcmp(ctx->filter->name, "sobel")) { > - if (s->depth > 8) > - for (p = 0; p < s->nb_planes; p++) > - s->filter[p] = filter16_sobel; > } else if (!strcmp(ctx->filter->name, "kirsch")) { > if (s->depth > 8) > for (p = 0; p < s->nb_planes; p++) > diff --git a/libavfilter/x86/vf_convolution.asm > b/libavfilter/x86/vf_convolution.asm > index 754d4d1064..c912d56752 100644 > --- a/libavfilter/x86/vf_convolution.asm > +++ b/libavfilter/x86/vf_convolution.asm > @@ -22,6 +22,18 @@ > > SECTION_RODATA > half: dd 0.5 > +data_p1: dd 1 > +data_n1: dd -1 > +data_p2: dd 2 > +data_n2: dd -2 > + > +ALIGN 64 > +sobel_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 
3, 19, 35, 51 > + db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55 > + db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59 > + db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63 > +sobel_mulA: db -1, 1, -2, 2 > +sobel_mulB: db 1, -1, 2, -2 > > SECTION .text > > @@ -154,3 +166,138 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, > matrix, ptr, c0, c1, c2, c > INIT_XMM sse4 > FILTER_3X3 > %endif > + > +%macro SOBEL_MUL 2 > + movzx ptrd, byte [c%1q + xq] > + imul ptrd, [%2] > + add rd, ptrd > +%endmacro > + > +%macro SOBEL_ADD 1 > + movzx ptrd, byte [c%1q + xq] > + add rd, ptrd > +%endmacro > + > +; void filter_sobel_avx512(uint8_t *dst, int width, > +; float scale, float delta, const int *const matrix, > +; const uint8_t *c[], int peak, int radius, > +; int dstride, int stride) > +%macro FILTER_SOBEL 0 > +%if UNIX64 > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, > c5, c6, c7, c8, r, x > +%else > +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, > c2, c3, c4, c5, c6, c7, c8, r, x > +%endif > +%if WIN64 > + SWAP xmm0, xmm2 > + SWAP xmm1, xmm3 > + mov r2q, matrixmp > + mov r3q, ptrmp > + DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, > r, x > +%endif > + movsxdifnidn widthq, widthd > + VBROADCASTSS m0, xmm0 > + VBROADCASTSS m1, xmm1 > + pxor m6, m6 > + mov c0q, [ptrq + 0*gprsize] > + mov c1q, [ptrq + 1*gprsize] > + mov c2q, [ptrq + 2*gprsize] > + mov c3q, [ptrq + 3*gprsize] > + mov c4q, [ptrq + 4*gprsize] > + mov c5q, [ptrq + 5*gprsize] > + mov c6q, [ptrq + 6*gprsize] > + mov c7q, [ptrq + 7*gprsize] > + mov c8q, [ptrq + 8*gprsize] > + > + xor xq, xq > + cmp widthq, mmsize/4 > + jl .loop2 > + > + mov rq, widthq > + and rq, mmsize/4-1 > + sub widthq, rq > + > + mova m6, [sobel_perm] > +.loop1: > + movu xm3, [c2q + xq] > + pmovzxbd m5, [c0q + xq] > + vinserti32x4 ym3, [c6q + xq], 1 > + pmovzxbd m4, [c8q + xq] > + vinserti32x4 m2, m3, [c1q + 
xq], 2 > + vinserti32x4 m3, [c5q + xq], 2 > + vinserti32x4 m2, [c7q + xq], 3 > + vinserti32x4 m3, [c3q + xq], 3 > + vpermb m2, m6, m2 > + psubd m4, m5 > + vpermb m3, m6, m3 > + mova m5, m4 > + vpdpbusd m4, m2, [sobel_mulA] {1to16} > + vpdpbusd m5, m3, [sobel_mulB] {1to16} > + > + cvtdq2ps m4, m4 > + mulps m4, m4 > + > + cvtdq2ps m5, m5 > + VFMADD231PS m4, m5, m5 > + > + sqrtps m4, m4 > + fmaddps m4, m4, m0, m1 > + cvttps2dq m4, m4 > + vpmovusdb [dstq + xq], m4 > + > + add xq, mmsize/4 > + cmp xq, widthq > + jl .loop1 > + > + add widthq, rq > + cmp xq, widthq > + jge .end > + > +.loop2: > + xor rd, rd > + pxor m4, m4 > + > + ;Gx > + SOBEL_MUL 0, data_n1 > + SOBEL_MUL 1, data_n2 > + SOBEL_MUL 2, data_n1 > + SOBEL_ADD 6 > + SOBEL_MUL 7, data_p2 > + SOBEL_ADD 8 > + > + cvtsi2ss xmm4, rd > + mulss xmm4, xmm4 > + > + xor rd, rd > + ;Gy > + SOBEL_MUL 0, data_n1 > + SOBEL_ADD 2 > + SOBEL_MUL 3, data_n2 > + SOBEL_MUL 5, data_p2 > + SOBEL_MUL 6, data_n1 > + SOBEL_ADD 8 > + > + cvtsi2ss xmm5, rd > + fmaddss xmm4, xmm5, xmm5, xmm4 > + > + sqrtps xmm4, xmm4 > + fmaddss xmm4, xmm4, xmm0, xmm1 ;sum = sum * rdiv + bias > + cvttps2dq xmm4, xmm4 ; trunc to integer > + packssdw xmm4, xmm4 > + packuswb xmm4, xmm4 > + movd rd, xmm4 > + mov [dstq + xq], rb > + > + add xq, 1 > + cmp xq, widthq > + jl .loop2 > +.end: > + RET > +%endmacro > + > +%if ARCH_X86_64 > +%if HAVE_AVX512ICL_EXTERNAL > +INIT_ZMM avx512icl > +FILTER_SOBEL > +%endif > +%endif > diff --git a/libavfilter/x86/vf_convolution_init.c > b/libavfilter/x86/vf_convolution_init.c > index b78a47d02b..bff10ca1a4 100644 > --- a/libavfilter/x86/vf_convolution_init.c > +++ b/libavfilter/x86/vf_convolution_init.c > @@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width, > const uint8_t *c[], int peak, int radius, > int dstride, int stride, int size); > > +void ff_filter_sobel_avx512icl(uint8_t *dst, int width, > + float scale, float delta, const int *const matrix, > + const uint8_t *c[], int peak, int radius, > + int 
dstride, int stride, int size); > + > av_cold void ff_convolution_init_x86(ConvolutionContext *s) > { > #if ARCH_X86_64 > @@ -44,3 +49,16 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s) > } > #endif > } > + > +av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int > nb_planes) > +{ > +#if ARCH_X86_64 > + int cpu_flags = av_get_cpu_flags(); > + for (int i = 0; i < nb_planes; i++) { > + if (depth == 8) { > + if (EXTERNAL_AVX512ICL(cpu_flags)) > + s->filter[i] = ff_filter_sobel_avx512icl; > + } > + } > +#endif > +} > diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile > index 62d6616faf..a6f06c7007 100644 > --- a/tests/checkasm/Makefile > +++ b/tests/checkasm/Makefile > @@ -46,6 +46,7 @@ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o > AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o > AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER) += vf_threshold.o > AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o > +AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o > > CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes) > > diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c > index 421bd096c5..3eb4780a64 100644 > --- a/tests/checkasm/checkasm.c > +++ b/tests/checkasm/checkasm.c > @@ -197,6 +197,9 @@ static const struct { > #if CONFIG_THRESHOLD_FILTER > { "vf_threshold", checkasm_check_vf_threshold }, > #endif > + #if CONFIG_SOBEL_FILTER > + { "vf_sobel", checkasm_check_vf_sobel }, > + #endif > #endif > #if CONFIG_SWSCALE > { "sw_gbrp", checkasm_check_sw_gbrp }, > diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h > index ee9151410e..214918e7ea 100644 > --- a/tests/checkasm/checkasm.h > +++ b/tests/checkasm/checkasm.h > @@ -86,6 +86,7 @@ void checkasm_check_vf_eq(void); > void checkasm_check_vf_gblur(void); > void checkasm_check_vf_hflip(void); > void checkasm_check_vf_threshold(void); > +void checkasm_check_vf_sobel(void); > void checkasm_check_vp8dsp(void); > void checkasm_check_vp9dsp(void); > void 
checkasm_check_videodsp(void); > diff --git a/tests/checkasm/vf_convolution.c b/tests/checkasm/vf_convolution.c > new file mode 100644 > index 0000000000..007865863e > --- /dev/null > +++ b/tests/checkasm/vf_convolution.c > @@ -0,0 +1,104 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with FFmpeg; if not, write to the Free Software Foundation, Inc., > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
> + */ > + > +#include <string.h> > +#include "checkasm.h" > +#include "libavfilter/avfilter.h" > +#include "libavfilter/convolution.h" > +#include "libavutil/intreadwrite.h" > +#include "libavutil/mem_internal.h" > + > +#define WIDTH 512 > +#define HEIGHT 512 > +#define SRC_STRIDE 512 > +#define PIXELS (WIDTH * HEIGHT) > + > +#define randomize_buffers(buf, size) \ > + do { \ > + int j; \ > + uint8_t *tmp_buf = (uint8_t *)buf;\ > + for (j = 0; j< size; j++) \ > + tmp_buf[j] = rnd() & 0xFF; \ > + } while (0) > + > +static void check_sobel(const char * report_name) > +{ > + LOCAL_ALIGNED_32(uint8_t, src, [PIXELS]); > + LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]); > + LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]); > + const int height = WIDTH; > + const int width = HEIGHT; > + const int stride = SRC_STRIDE; > + const int dstride = SRC_STRIDE; > + int mode = 0; > + const uint8_t *c[49]; > + const int radius = 1; > + const int bpc = 1; > + const int step = mode == MATRIX_COLUMN ? 16 : 1; > + const int slice_start = 0; > + const int slice_end = height; > + int y; > + const int sizew = mode == MATRIX_COLUMN ? height : width; > + float scale = 2; > + float delta = 10; > + > + ConvolutionContext s; > + > + declare_func(void, uint8_t *dst, int width, float scale, float delta, > const int *const matrix, > + const uint8_t *c[], int peak, int radius, int dstride, int > stride, int size); > + > + s.scale = scale; > + s.delta = delta; > + s.depth = 8; > + s.nb_planes = 3; > + s.planes = 15; > + ff_sobel_init(&s, s.depth, s.nb_planes); > + > + memset(dst_ref, 0, PIXELS); > + memset(dst_new, 0, PIXELS); > + randomize_buffers(src, PIXELS); > + > + if (check_func(s.filter[0], "%s", report_name)) { > + for (y = slice_start; y < slice_end; y += step) { > + const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * bpc > : radius * bpc; > + const int yoff = mode == MATRIX_COLUMN ? 
radius * dstride : 0; > + > + s.setup[0](radius, c, src, stride, radius, width, y, height, > bpc); > + call_ref(dst_ref + yoff + xoff, sizew - 2 * radius, > + scale, delta, NULL, c, 0, radius, > + dstride, stride, slice_end - step); > + call_new(dst_new + yoff + xoff, sizew - 2 * radius, > + scale, delta, NULL, c, 0, radius, > + dstride, stride, slice_end - step); > + if (memcmp(dst_ref + yoff + xoff, dst_new + yoff + xoff, > slice_end - step)) > + fail(); > + bench_new(dst_new + yoff + xoff, sizew - 2 * radius, > + scale, delta, NULL, c, 0, radius, > + dstride, stride, slice_end - step); > + if (mode != MATRIX_COLUMN) > + dst_ref += dstride; > + } > + } > + > +} > + > +void checkasm_check_vf_sobel(void) > +{ > + check_sobel("sobel"); > + report("convolution:sobel"); > +} > diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak > index aa9b288e12..a4e95541f5 100644 > --- a/tests/fate/checkasm.mak > +++ b/tests/fate/checkasm.mak > @@ -43,6 +43,7 @@ FATE_CHECKASM = fate-checkasm- > aacpsdsp \ > fate-checkasm-vf_hflip \ > fate-checkasm-vf_nlmeans \ > fate-checkasm-vf_threshold \ > + fate-checkasm-vf_sobel \ > fate-checkasm-videodsp \ > fate-checkasm-vorbisdsp \ > fate-checkasm-vp8dsp \ LGTM and it works well for me, I saw a significant FPS improvement when running the command below. $ ffmpeg -i 1920x1080.mp4 -vf "sobel" -f null - Thanks Haihao _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-07 5:24 ` Xiang, Haihao @ 2022-11-11 3:00 ` Xiang, Haihao 2022-11-14 2:12 ` Xiang, Haihao 0 siblings, 1 reply; 13+ messages in thread From: Xiang, Haihao @ 2022-11-11 3:00 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Wang, Bin On Mon, 2022-11-07 at 05:24 +0000, Xiang, Haihao wrote: > On Fri, 2022-11-04 at 16:29 +0800, bin.wang-at-intel.com@ffmpeg.org wrote: > > From: bwang30 <bin.wang@intel.com> > > > > This commit enabled assembly code with intel AVX512 VNNI and added unit test > > for sobel filter > > > > sobel_c: 4537 > > sobel_avx512icl 2136 > > > > Signed-off-by: bwang30 <bin.wang@intel.com> > > --- > > libavfilter/convolution.h | 74 +++++++++++++ > > libavfilter/vf_convolution.c | 91 +++------------- > > libavfilter/x86/vf_convolution.asm | 147 ++++++++++++++++++++++++++ > > libavfilter/x86/vf_convolution_init.c | 18 ++++ > > tests/checkasm/Makefile | 1 + > > tests/checkasm/checkasm.c | 3 + > > tests/checkasm/checkasm.h | 1 + > > tests/checkasm/vf_convolution.c | 104 ++++++++++++++++++ > > tests/fate/checkasm.mak | 1 + > > 9 files changed, 362 insertions(+), 78 deletions(-) > > create mode 100644 tests/checkasm/vf_convolution.c > > > > diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h > > index 88aabe9a20..e44bfb5da8 100644 > > --- a/libavfilter/convolution.h > > +++ b/libavfilter/convolution.h > > @@ -21,6 +21,7 @@ > > #ifndef AVFILTER_CONVOLUTION_H > > #define AVFILTER_CONVOLUTION_H > > #include "avfilter.h" > > +#include "libavutil/intreadwrite.h" > > > > enum MatrixMode { > > MATRIX_SQUARE, > > @@ -61,4 +62,77 @@ typedef struct ConvolutionContext { > > } ConvolutionContext; > > > > void ff_convolution_init_x86(ConvolutionContext *s); > > +void ff_sobel_init_x86(ConvolutionContext *s, int depth, int nb_planes); > > + > > +static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, > > int 
> > stride, > > + int x, int w, int y, int h, int bpc) > > +{ > > + int i; > > + > > + for (i = 0; i < 9; i++) { > > + int xoff = FFABS(x + ((i % 3) - 1)); > > + int yoff = FFABS(y + (i / 3) - 1); > > + > > + xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; > > + yoff = yoff >= h ? 2 * h - 1 - yoff : yoff; > > + > > + c[i] = src + xoff * bpc + yoff * stride; > > + } > > +} > > + > > +static void filter_sobel(uint8_t *dst, int width, > > + float scale, float delta, const int *const matrix, > > + const uint8_t *c[], int peak, int radius, > > + int dstride, int stride, int size) > > +{ > > + const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; > > + const uint8_t *c3 = c[3], *c5 = c[5]; > > + const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; > > + int x; > > + > > + for (x = 0; x < width; x++) { > > + float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + > > + c6[x] * 1 + c7[x] * 2 + c8[x] * 1; > > + float sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + > > + c5[x] * 2 + c6[x] * -1 + c8[x] * 1; > > + > > + dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + > > delta); > > + } > > +} > > + > > +static void filter16_sobel(uint8_t *dstp, int width, > > + float scale, float delta, const int *const > > matrix, > > + const uint8_t *c[], int peak, int radius, > > + int dstride, int stride, int size) > > +{ > > + uint16_t *dst = (uint16_t *)dstp; > > + int x; > > + > > + for (x = 0; x < width; x++) { > > + float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * > > -2 + AV_RN16A(&c[2][2 * x]) * -1 + > > + AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) > > * 2 + AV_RN16A(&c[8][2 * x]) * 1; > > + float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) > > * 1 + AV_RN16A(&c[3][2 * x]) * -2 + > > + AV_RN16A(&c[5][2 * x]) * 2 + AV_RN16A(&c[6][2 * x]) * > > -1 + AV_RN16A(&c[8][2 * x]) * 1; > > + > > + dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, > > peak); > > + } > > +} > > + > > +static av_unused void ff_sobel_init(ConvolutionContext 
*s, int depth, int > > nb_planes) > > +{ > > + for (int i = 0; i < 4; i++) { > > + s->filter[i] = filter_sobel; > > + s->copy[i] = !((1 << i) & s->planes); > > + s->size[i] = 3; > > + s->setup[i] = setup_3x3; > > + s->rdiv[i] = s->scale; > > + s->bias[i] = s->delta; > > + } > > + if (s->depth > 8) > > + for (int i = 0; i < 4; i++) > > + s->filter[i] = filter16_sobel; > > +#if ARCH_X86_64 > > + ff_sobel_init_x86(s, depth, nb_planes); > > +#endif > > +} > > #endif > > diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c > > index 9a9c099e6d..7762fa2a05 100644 > > --- a/libavfilter/vf_convolution.c > > +++ b/libavfilter/vf_convolution.c > > @@ -139,24 +139,6 @@ static void filter16_roberts(uint8_t *dstp, int width, > > } > > } > > > > -static void filter16_sobel(uint8_t *dstp, int width, > > - float scale, float delta, const int *const > > matrix, > > - const uint8_t *c[], int peak, int radius, > > - int dstride, int stride, int size) > > -{ > > - uint16_t *dst = (uint16_t *)dstp; > > - int x; > > - > > - for (x = 0; x < width; x++) { > > - float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) * > > -2 + AV_RN16A(&c[2][2 * x]) * -1 + > > - AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) > > * 2 + AV_RN16A(&c[8][2 * x]) * 1; > > - float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) > > * 1 + AV_RN16A(&c[3][2 * x]) * -2 + > > - AV_RN16A(&c[5][2 * x]) * 2 + AV_RN16A(&c[6][2 * x]) * > > -1 + AV_RN16A(&c[8][2 * x]) * 1; > > - > > - dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, > > peak); > > - } > > -} > > - > > static void filter16_scharr(uint8_t *dstp, int width, > > float scale, float delta, const int *const > > matrix, > > const uint8_t *c[], int peak, int radius, > > @@ -261,26 +243,6 @@ static void filter_roberts(uint8_t *dst, int width, > > } > > } > > > > -static void filter_sobel(uint8_t *dst, int width, > > - float scale, float delta, const int *const matrix, > > - const uint8_t *c[], int peak, 
int radius, > > - int dstride, int stride, int size) > > -{ > > - const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; > > - const uint8_t *c3 = c[3], *c5 = c[5]; > > - const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; > > - int x; > > - > > - for (x = 0; x < width; x++) { > > - float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + > > - c6[x] * 1 + c7[x] * 2 + c8[x] * 1; > > - float sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + > > - c5[x] * 2 + c6[x] * -1 + c8[x] * 1; > > - > > - dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + > > delta); > > - } > > -} > > - > > static void filter_scharr(uint8_t *dst, int width, > > float scale, float delta, const int *const > > matrix, > > const uint8_t *c[], int peak, int radius, > > @@ -552,22 +514,6 @@ static void filter_column(uint8_t *dst, int height, > > } > > } > > > > -static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, > > int > > stride, > > - int x, int w, int y, int h, int bpc) > > -{ > > - int i; > > - > > - for (i = 0; i < 9; i++) { > > - int xoff = FFABS(x + ((i % 3) - 1)); > > - int yoff = FFABS(y + (i / 3) - 1); > > - > > - xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; > > - yoff = yoff >= h ? 
2 * h - 1 - yoff : yoff; > > - > > - c[i] = src + xoff * bpc + yoff * stride; > > - } > > -} > > - > > static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src, > > int > > stride, > > int x, int w, int y, int h, int bpc) > > { > > @@ -708,6 +654,18 @@ static int param_init(AVFilterContext *ctx) > > const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); > > int p, i; > > > > + s->depth = desc->comp[0].depth; > > + s->max = (1 << s->depth) - 1; > > + > > + s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc- > > > log2_chroma_w); > > + s->planewidth[0] = s->planewidth[3] = inlink->w; > > + s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc- > > > log2_chroma_h); > > + s->planeheight[0] = s->planeheight[3] = inlink->h; > > + > > + s->nb_planes = av_pix_fmt_count_planes(inlink->format); > > + s->nb_threads = ff_filter_get_nb_threads(ctx); > > + s->bpc = (s->depth + 7) / 8; > > + > > if (!strcmp(ctx->filter->name, "convolution")) { > > for (i = 0; i < 4; i++) { > > int *matrix = (int *)s->matrix[i]; > > @@ -804,14 +762,7 @@ static int param_init(AVFilterContext *ctx) > > s->bias[i] = s->delta; > > } > > } else if (!strcmp(ctx->filter->name, "sobel")) { > > - for (i = 0; i < 4; i++) { > > - s->filter[i] = filter_sobel; > > - s->copy[i] = !((1 << i) & s->planes); > > - s->size[i] = 3; > > - s->setup[i] = setup_3x3; > > - s->rdiv[i] = s->scale; > > - s->bias[i] = s->delta; > > - } > > + ff_sobel_init(s, s->depth, s->nb_planes); > > } else if (!strcmp(ctx->filter->name, "kirsch")) { > > for (i = 0; i < 4; i++) { > > s->filter[i] = filter_kirsch; > > @@ -832,18 +783,6 @@ static int param_init(AVFilterContext *ctx) > > } > > } > > > > - s->depth = desc->comp[0].depth; > > - s->max = (1 << s->depth) - 1; > > - > > - s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc- > > > log2_chroma_w); > > - s->planewidth[0] = s->planewidth[3] = inlink->w; > > - s->planeheight[1] = s->planeheight[2] = 
AV_CEIL_RSHIFT(inlink->h, desc- > > > log2_chroma_h); > > - s->planeheight[0] = s->planeheight[3] = inlink->h; > > - > > - s->nb_planes = av_pix_fmt_count_planes(inlink->format); > > - s->nb_threads = ff_filter_get_nb_threads(ctx); > > - s->bpc = (s->depth + 7) / 8; > > - > > if (!strcmp(ctx->filter->name, "convolution")) { > > if (s->depth > 8) { > > for (p = 0; p < s->nb_planes; p++) { > > @@ -870,10 +809,6 @@ static int param_init(AVFilterContext *ctx) > > if (s->depth > 8) > > for (p = 0; p < s->nb_planes; p++) > > s->filter[p] = filter16_roberts; > > - } else if (!strcmp(ctx->filter->name, "sobel")) { > > - if (s->depth > 8) > > - for (p = 0; p < s->nb_planes; p++) > > - s->filter[p] = filter16_sobel; > > } else if (!strcmp(ctx->filter->name, "kirsch")) { > > if (s->depth > 8) > > for (p = 0; p < s->nb_planes; p++) > > diff --git a/libavfilter/x86/vf_convolution.asm > > b/libavfilter/x86/vf_convolution.asm > > index 754d4d1064..c912d56752 100644 > > --- a/libavfilter/x86/vf_convolution.asm > > +++ b/libavfilter/x86/vf_convolution.asm > > @@ -22,6 +22,18 @@ > > > > SECTION_RODATA > > half: dd 0.5 > > +data_p1: dd 1 > > +data_n1: dd -1 > > +data_p2: dd 2 > > +data_n2: dd -2 > > + > > +ALIGN 64 > > +sobel_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, > > 51 > > + db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, > > 55 > > + db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, > > 59 > > + db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, > > 63 > > +sobel_mulA: db -1, 1, -2, 2 > > +sobel_mulB: db 1, -1, 2, -2 > > > > SECTION .text > > > > @@ -154,3 +166,138 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, > > matrix, ptr, c0, c1, c2, c > > INIT_XMM sse4 > > FILTER_3X3 > > %endif > > + > > +%macro SOBEL_MUL 2 > > + movzx ptrd, byte [c%1q + xq] > > + imul ptrd, [%2] > > + add rd, ptrd > > +%endmacro > > + > > +%macro SOBEL_ADD 1 > > + movzx ptrd, byte [c%1q + xq] > > + add rd, ptrd > > +%endmacro > > + > > 
+; void filter_sobel_avx512(uint8_t *dst, int width, > > +; float scale, float delta, const int *const matrix, > > +; const uint8_t *c[], int peak, int radius, > > +; int dstride, int stride) > > +%macro FILTER_SOBEL 0 > > +%if UNIX64 > > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, > > c4, > > c5, c6, c7, c8, r, x > > +%else > > +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, > > c1, > > c2, c3, c4, c5, c6, c7, c8, r, x > > +%endif > > +%if WIN64 > > + SWAP xmm0, xmm2 > > + SWAP xmm1, xmm3 > > + mov r2q, matrixmp > > + mov r3q, ptrmp > > + DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, > > c8, > > r, x > > +%endif > > + movsxdifnidn widthq, widthd > > + VBROADCASTSS m0, xmm0 > > + VBROADCASTSS m1, xmm1 > > + pxor m6, m6 > > + mov c0q, [ptrq + 0*gprsize] > > + mov c1q, [ptrq + 1*gprsize] > > + mov c2q, [ptrq + 2*gprsize] > > + mov c3q, [ptrq + 3*gprsize] > > + mov c4q, [ptrq + 4*gprsize] > > + mov c5q, [ptrq + 5*gprsize] > > + mov c6q, [ptrq + 6*gprsize] > > + mov c7q, [ptrq + 7*gprsize] > > + mov c8q, [ptrq + 8*gprsize] > > + > > + xor xq, xq > > + cmp widthq, mmsize/4 > > + jl .loop2 > > + > > + mov rq, widthq > > + and rq, mmsize/4-1 > > + sub widthq, rq > > + > > + mova m6, [sobel_perm] > > +.loop1: > > + movu xm3, [c2q + xq] > > + pmovzxbd m5, [c0q + xq] > > + vinserti32x4 ym3, [c6q + xq], 1 > > + pmovzxbd m4, [c8q + xq] > > + vinserti32x4 m2, m3, [c1q + xq], 2 > > + vinserti32x4 m3, [c5q + xq], 2 > > + vinserti32x4 m2, [c7q + xq], 3 > > + vinserti32x4 m3, [c3q + xq], 3 > > + vpermb m2, m6, m2 > > + psubd m4, m5 > > + vpermb m3, m6, m3 > > + mova m5, m4 > > + vpdpbusd m4, m2, [sobel_mulA] {1to16} > > + vpdpbusd m5, m3, [sobel_mulB] {1to16} > > + > > + cvtdq2ps m4, m4 > > + mulps m4, m4 > > + > > + cvtdq2ps m5, m5 > > + VFMADD231PS m4, m5, m5 > > + > > + sqrtps m4, m4 > > + fmaddps m4, m4, m0, m1 > > + cvttps2dq m4, m4 > > + vpmovusdb [dstq + xq], m4 > > + > > + add xq, mmsize/4 > > + 
cmp xq, widthq > > + jl .loop1 > > + > > + add widthq, rq > > + cmp xq, widthq > > + jge .end > > + > > +.loop2: > > + xor rd, rd > > + pxor m4, m4 > > + > > + ;Gx > > + SOBEL_MUL 0, data_n1 > > + SOBEL_MUL 1, data_n2 > > + SOBEL_MUL 2, data_n1 > > + SOBEL_ADD 6 > > + SOBEL_MUL 7, data_p2 > > + SOBEL_ADD 8 > > + > > + cvtsi2ss xmm4, rd > > + mulss xmm4, xmm4 > > + > > + xor rd, rd > > + ;Gy > > + SOBEL_MUL 0, data_n1 > > + SOBEL_ADD 2 > > + SOBEL_MUL 3, data_n2 > > + SOBEL_MUL 5, data_p2 > > + SOBEL_MUL 6, data_n1 > > + SOBEL_ADD 8 > > + > > + cvtsi2ss xmm5, rd > > + fmaddss xmm4, xmm5, xmm5, xmm4 > > + > > + sqrtps xmm4, xmm4 > > + fmaddss xmm4, xmm4, xmm0, xmm1 ;sum = sum * rdiv + bias > > + cvttps2dq xmm4, xmm4 ; trunc to integer > > + packssdw xmm4, xmm4 > > + packuswb xmm4, xmm4 > > + movd rd, xmm4 > > + mov [dstq + xq], rb > > + > > + add xq, 1 > > + cmp xq, widthq > > + jl .loop2 > > +.end: > > + RET > > +%endmacro > > + > > +%if ARCH_X86_64 > > +%if HAVE_AVX512ICL_EXTERNAL > > +INIT_ZMM avx512icl > > +FILTER_SOBEL > > +%endif > > +%endif > > diff --git a/libavfilter/x86/vf_convolution_init.c > > b/libavfilter/x86/vf_convolution_init.c > > index b78a47d02b..bff10ca1a4 100644 > > --- a/libavfilter/x86/vf_convolution_init.c > > +++ b/libavfilter/x86/vf_convolution_init.c > > @@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width, > > const uint8_t *c[], int peak, int radius, > > int dstride, int stride, int size); > > > > +void ff_filter_sobel_avx512icl(uint8_t *dst, int width, > > + float scale, float delta, const int *const matrix, > > + const uint8_t *c[], int peak, int radius, > > + int dstride, int stride, int size); > > + > > av_cold void ff_convolution_init_x86(ConvolutionContext *s) > > { > > #if ARCH_X86_64 > > @@ -44,3 +49,16 @@ av_cold void ff_convolution_init_x86(ConvolutionContext > > *s) > > } > > #endif > > } > > + > > +av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int > > nb_planes) > > +{ > > +#if ARCH_X86_64 > > 
+ int cpu_flags = av_get_cpu_flags(); > > + for (int i = 0; i < nb_planes; i++) { > > + if (depth == 8) { > > + if (EXTERNAL_AVX512ICL(cpu_flags)) > > + s->filter[i] = ff_filter_sobel_avx512icl; > > + } > > + } > > +#endif > > +} > > diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile > > index 62d6616faf..a6f06c7007 100644 > > --- a/tests/checkasm/Makefile > > +++ b/tests/checkasm/Makefile > > @@ -46,6 +46,7 @@ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o > > AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o > > AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER) += vf_threshold.o > > AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o > > +AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o > > > > CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes) > > > > diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c > > index 421bd096c5..3eb4780a64 100644 > > --- a/tests/checkasm/checkasm.c > > +++ b/tests/checkasm/checkasm.c > > @@ -197,6 +197,9 @@ static const struct { > > #if CONFIG_THRESHOLD_FILTER > > { "vf_threshold", checkasm_check_vf_threshold }, > > #endif > > + #if CONFIG_SOBEL_FILTER > > + { "vf_sobel", checkasm_check_vf_sobel }, > > + #endif > > #endif > > #if CONFIG_SWSCALE > > { "sw_gbrp", checkasm_check_sw_gbrp }, > > diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h > > index ee9151410e..214918e7ea 100644 > > --- a/tests/checkasm/checkasm.h > > +++ b/tests/checkasm/checkasm.h > > @@ -86,6 +86,7 @@ void checkasm_check_vf_eq(void); > > void checkasm_check_vf_gblur(void); > > void checkasm_check_vf_hflip(void); > > void checkasm_check_vf_threshold(void); > > +void checkasm_check_vf_sobel(void); > > void checkasm_check_vp8dsp(void); > > void checkasm_check_vp9dsp(void); > > void checkasm_check_videodsp(void); > > diff --git a/tests/checkasm/vf_convolution.c > > b/tests/checkasm/vf_convolution.c > > new file mode 100644 > > index 0000000000..007865863e > > --- /dev/null > > +++ b/tests/checkasm/vf_convolution.c > > 
@@ -0,0 +1,104 @@ > > +/* > > + * This file is part of FFmpeg. > > + * > > + * FFmpeg is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License as published by > > + * the Free Software Foundation; either version 2 of the License, or > > + * (at your option) any later version. > > + * > > + * FFmpeg is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > + * GNU General Public License for more details. > > + * > > + * You should have received a copy of the GNU General Public License along > > + * with FFmpeg; if not, write to the Free Software Foundation, Inc., > > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. > > + */ > > + > > +#include <string.h> > > +#include "checkasm.h" > > +#include "libavfilter/avfilter.h" > > +#include "libavfilter/convolution.h" > > +#include "libavutil/intreadwrite.h" > > +#include "libavutil/mem_internal.h" > > + > > +#define WIDTH 512 > > +#define HEIGHT 512 > > +#define SRC_STRIDE 512 > > +#define PIXELS (WIDTH * HEIGHT) > > + > > +#define randomize_buffers(buf, size) \ > > + do { \ > > + int j; \ > > + uint8_t *tmp_buf = (uint8_t *)buf;\ > > + for (j = 0; j< size; j++) \ > > + tmp_buf[j] = rnd() & 0xFF; \ > > + } while (0) > > + > > +static void check_sobel(const char * report_name) > > +{ > > + LOCAL_ALIGNED_32(uint8_t, src, [PIXELS]); > > + LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]); > > + LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]); > > + const int height = WIDTH; > > + const int width = HEIGHT; > > + const int stride = SRC_STRIDE; > > + const int dstride = SRC_STRIDE; > > + int mode = 0; > > + const uint8_t *c[49]; > > + const int radius = 1; > > + const int bpc = 1; > > + const int step = mode == MATRIX_COLUMN ? 
16 : 1; > > + const int slice_start = 0; > > + const int slice_end = height; > > + int y; > > + const int sizew = mode == MATRIX_COLUMN ? height : width; > > + float scale = 2; > > + float delta = 10; > > + > > + ConvolutionContext s; > > + > > + declare_func(void, uint8_t *dst, int width, float scale, float delta, > > const int *const matrix, > > + const uint8_t *c[], int peak, int radius, int dstride, int > > stride, int size); > > + > > + s.scale = scale; > > + s.delta = delta; > > + s.depth = 8; > > + s.nb_planes = 3; > > + s.planes = 15; > > + ff_sobel_init(&s, s.depth, s.nb_planes); > > + > > + memset(dst_ref, 0, PIXELS); > > + memset(dst_new, 0, PIXELS); > > + randomize_buffers(src, PIXELS); > > + > > + if (check_func(s.filter[0], "%s", report_name)) { > > + for (y = slice_start; y < slice_end; y += step) { > > + const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * > > bpc > > : radius * bpc; > > + const int yoff = mode == MATRIX_COLUMN ? radius * dstride : 0; > > + > > + s.setup[0](radius, c, src, stride, radius, width, y, height, > > bpc); > > + call_ref(dst_ref + yoff + xoff, sizew - 2 * radius, > > + scale, delta, NULL, c, 0, radius, > > + dstride, stride, slice_end - step); > > + call_new(dst_new + yoff + xoff, sizew - 2 * radius, > > + scale, delta, NULL, c, 0, radius, > > + dstride, stride, slice_end - step); > > + if (memcmp(dst_ref + yoff + xoff, dst_new + yoff + xoff, > > slice_end - step)) > > + fail(); > > + bench_new(dst_new + yoff + xoff, sizew - 2 * radius, > > + scale, delta, NULL, c, 0, radius, > > + dstride, stride, slice_end - step); > > + if (mode != MATRIX_COLUMN) > > + dst_ref += dstride; > > + } > > + } > > + > > +} > > + > > +void checkasm_check_vf_sobel(void) > > +{ > > + check_sobel("sobel"); > > + report("convolution:sobel"); > > +} > > diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak > > index aa9b288e12..a4e95541f5 100644 > > --- a/tests/fate/checkasm.mak > > +++ b/tests/fate/checkasm.mak > > @@ -43,6 +43,7 
@@ FATE_CHECKASM = fate-checkasm- > > aacpsdsp \ > > fate-checkasm-vf_hflip \ > > fate-checkasm-vf_nlmeans \ > > fate-checkasm-vf_threshold \ > > + fate-checkasm-vf_sobel \ > > fate-checkasm-videodsp \ > > fate-checkasm-vorbisdsp \ > > fate-checkasm-vp8dsp \ > > LGTM and it works well for me, I saw a significant FPS improvement when > running > the command below. > > $ ffmpeg -i 1920x1080.mp4 -vf "sobel" -f null - > Does anyone else have any thought on this patch? I will merge it if there are no more comments. Thanks Haihao _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-11 3:00 ` Xiang, Haihao @ 2022-11-14 2:12 ` Xiang, Haihao 0 siblings, 0 replies; 13+ messages in thread From: Xiang, Haihao @ 2022-11-14 2:12 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Wang, Bin On Fri, 2022-11-11 at 03:00 +0000, Xiang, Haihao wrote: > On Mon, 2022-11-07 at 05:24 +0000, Xiang, Haihao wrote: > > On Fri, 2022-11-04 at 16:29 +0800, bin.wang-at-intel.com@ffmpeg.org wrote: > > > From: bwang30 <bin.wang@intel.com> > > > > > > This commit enabled assembly code with intel AVX512 VNNI and added unit > > > test > > > for sobel filter > > > > > > sobel_c: 4537 > > > sobel_avx512icl 2136 > > > > > > Signed-off-by: bwang30 <bin.wang@intel.com> > > > --- > > > libavfilter/convolution.h | 74 +++++++++++++ > > > libavfilter/vf_convolution.c | 91 +++------------- > > > libavfilter/x86/vf_convolution.asm | 147 ++++++++++++++++++++++++++ > > > libavfilter/x86/vf_convolution_init.c | 18 ++++ > > > tests/checkasm/Makefile | 1 + > > > tests/checkasm/checkasm.c | 3 + > > > tests/checkasm/checkasm.h | 1 + > > > tests/checkasm/vf_convolution.c | 104 ++++++++++++++++++ > > > tests/fate/checkasm.mak | 1 + > > > 9 files changed, 362 insertions(+), 78 deletions(-) > > > create mode 100644 tests/checkasm/vf_convolution.c > > > > > > diff --git a/libavfilter/convolution.h b/libavfilter/convolution.h > > > index 88aabe9a20..e44bfb5da8 100644 > > > --- a/libavfilter/convolution.h > > > +++ b/libavfilter/convolution.h > > > @@ -21,6 +21,7 @@ > > > #ifndef AVFILTER_CONVOLUTION_H > > > #define AVFILTER_CONVOLUTION_H > > > #include "avfilter.h" > > > +#include "libavutil/intreadwrite.h" > > > > > > enum MatrixMode { > > > MATRIX_SQUARE, > > > @@ -61,4 +62,77 @@ typedef struct ConvolutionContext { > > > } ConvolutionContext; > > > > > > void ff_convolution_init_x86(ConvolutionContext *s); > > > +void ff_sobel_init_x86(ConvolutionContext *s, int 
depth, int nb_planes); > > > + > > > +static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, > > > int > > > stride, > > > + int x, int w, int y, int h, int bpc) > > > +{ > > > + int i; > > > + > > > + for (i = 0; i < 9; i++) { > > > + int xoff = FFABS(x + ((i % 3) - 1)); > > > + int yoff = FFABS(y + (i / 3) - 1); > > > + > > > + xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; > > > + yoff = yoff >= h ? 2 * h - 1 - yoff : yoff; > > > + > > > + c[i] = src + xoff * bpc + yoff * stride; > > > + } > > > +} > > > + > > > +static void filter_sobel(uint8_t *dst, int width, > > > + float scale, float delta, const int *const > > > matrix, > > > + const uint8_t *c[], int peak, int radius, > > > + int dstride, int stride, int size) > > > +{ > > > + const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; > > > + const uint8_t *c3 = c[3], *c5 = c[5]; > > > + const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; > > > + int x; > > > + > > > + for (x = 0; x < width; x++) { > > > + float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + > > > + c6[x] * 1 + c7[x] * 2 + c8[x] * 1; > > > + float sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + > > > + c5[x] * 2 + c6[x] * -1 + c8[x] * 1; > > > + > > > + dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + > > > delta); > > > + } > > > +} > > > + > > > +static void filter16_sobel(uint8_t *dstp, int width, > > > + float scale, float delta, const int *const > > > matrix, > > > + const uint8_t *c[], int peak, int radius, > > > + int dstride, int stride, int size) > > > +{ > > > + uint16_t *dst = (uint16_t *)dstp; > > > + int x; > > > + > > > + for (x = 0; x < width; x++) { > > > + float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) > > > * > > > -2 + AV_RN16A(&c[2][2 * x]) * -1 + > > > + AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) > > > * 2 + AV_RN16A(&c[8][2 * x]) * 1; > > > + float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) > > > * 1 + AV_RN16A(&c[3][2 * x]) * -2 + > > > + AV_RN16A(&c[5][2 * 
x]) * 2 + AV_RN16A(&c[6][2 * x]) > > > * > > > -1 + AV_RN16A(&c[8][2 * x]) * 1; > > > + > > > + dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, > > > peak); > > > + } > > > +} > > > + > > > +static av_unused void ff_sobel_init(ConvolutionContext *s, int depth, int > > > nb_planes) > > > +{ > > > + for (int i = 0; i < 4; i++) { > > > + s->filter[i] = filter_sobel; > > > + s->copy[i] = !((1 << i) & s->planes); > > > + s->size[i] = 3; > > > + s->setup[i] = setup_3x3; > > > + s->rdiv[i] = s->scale; > > > + s->bias[i] = s->delta; > > > + } > > > + if (s->depth > 8) > > > + for (int i = 0; i < 4; i++) > > > + s->filter[i] = filter16_sobel; > > > +#if ARCH_X86_64 > > > + ff_sobel_init_x86(s, depth, nb_planes); > > > +#endif > > > +} > > > #endif > > > diff --git a/libavfilter/vf_convolution.c b/libavfilter/vf_convolution.c > > > index 9a9c099e6d..7762fa2a05 100644 > > > --- a/libavfilter/vf_convolution.c > > > +++ b/libavfilter/vf_convolution.c > > > @@ -139,24 +139,6 @@ static void filter16_roberts(uint8_t *dstp, int > > > width, > > > } > > > } > > > > > > -static void filter16_sobel(uint8_t *dstp, int width, > > > - float scale, float delta, const int *const > > > matrix, > > > - const uint8_t *c[], int peak, int radius, > > > - int dstride, int stride, int size) > > > -{ > > > - uint16_t *dst = (uint16_t *)dstp; > > > - int x; > > > - > > > - for (x = 0; x < width; x++) { > > > - float suma = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[1][2 * x]) > > > * > > > -2 + AV_RN16A(&c[2][2 * x]) * -1 + > > > - AV_RN16A(&c[6][2 * x]) * 1 + AV_RN16A(&c[7][2 * x]) > > > * 2 + AV_RN16A(&c[8][2 * x]) * 1; > > > - float sumb = AV_RN16A(&c[0][2 * x]) * -1 + AV_RN16A(&c[2][2 * x]) > > > * 1 + AV_RN16A(&c[3][2 * x]) * -2 + > > > - AV_RN16A(&c[5][2 * x]) * 2 + AV_RN16A(&c[6][2 * x]) > > > * > > > -1 + AV_RN16A(&c[8][2 * x]) * 1; > > > - > > > - dst[x] = av_clip(sqrtf(suma*suma + sumb*sumb) * scale + delta, 0, > > > peak); > > > - } > > > -} > > > - > > > static void 
filter16_scharr(uint8_t *dstp, int width, > > > float scale, float delta, const int *const > > > matrix, > > > const uint8_t *c[], int peak, int radius, > > > @@ -261,26 +243,6 @@ static void filter_roberts(uint8_t *dst, int width, > > > } > > > } > > > > > > -static void filter_sobel(uint8_t *dst, int width, > > > - float scale, float delta, const int *const > > > matrix, > > > - const uint8_t *c[], int peak, int radius, > > > - int dstride, int stride, int size) > > > -{ > > > - const uint8_t *c0 = c[0], *c1 = c[1], *c2 = c[2]; > > > - const uint8_t *c3 = c[3], *c5 = c[5]; > > > - const uint8_t *c6 = c[6], *c7 = c[7], *c8 = c[8]; > > > - int x; > > > - > > > - for (x = 0; x < width; x++) { > > > - float suma = c0[x] * -1 + c1[x] * -2 + c2[x] * -1 + > > > - c6[x] * 1 + c7[x] * 2 + c8[x] * 1; > > > - float sumb = c0[x] * -1 + c2[x] * 1 + c3[x] * -2 + > > > - c5[x] * 2 + c6[x] * -1 + c8[x] * 1; > > > - > > > - dst[x] = av_clip_uint8(sqrtf(suma*suma + sumb*sumb) * scale + > > > delta); > > > - } > > > -} > > > - > > > static void filter_scharr(uint8_t *dst, int width, > > > float scale, float delta, const int *const > > > matrix, > > > const uint8_t *c[], int peak, int radius, > > > @@ -552,22 +514,6 @@ static void filter_column(uint8_t *dst, int height, > > > } > > > } > > > > > > -static void setup_3x3(int radius, const uint8_t *c[], const uint8_t *src, > > > int > > > stride, > > > - int x, int w, int y, int h, int bpc) > > > -{ > > > - int i; > > > - > > > - for (i = 0; i < 9; i++) { > > > - int xoff = FFABS(x + ((i % 3) - 1)); > > > - int yoff = FFABS(y + (i / 3) - 1); > > > - > > > - xoff = xoff >= w ? 2 * w - 1 - xoff : xoff; > > > - yoff = yoff >= h ? 
2 * h - 1 - yoff : yoff; > > > - > > > - c[i] = src + xoff * bpc + yoff * stride; > > > - } > > > -} > > > - > > > static void setup_5x5(int radius, const uint8_t *c[], const uint8_t *src, > > > int > > > stride, > > > int x, int w, int y, int h, int bpc) > > > { > > > @@ -708,6 +654,18 @@ static int param_init(AVFilterContext *ctx) > > > const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); > > > int p, i; > > > > > > + s->depth = desc->comp[0].depth; > > > + s->max = (1 << s->depth) - 1; > > > + > > > + s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc- > > > > log2_chroma_w); > > > + s->planewidth[0] = s->planewidth[3] = inlink->w; > > > + s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, > > > desc- > > > > log2_chroma_h); > > > + s->planeheight[0] = s->planeheight[3] = inlink->h; > > > + > > > + s->nb_planes = av_pix_fmt_count_planes(inlink->format); > > > + s->nb_threads = ff_filter_get_nb_threads(ctx); > > > + s->bpc = (s->depth + 7) / 8; > > > + > > > if (!strcmp(ctx->filter->name, "convolution")) { > > > for (i = 0; i < 4; i++) { > > > int *matrix = (int *)s->matrix[i]; > > > @@ -804,14 +762,7 @@ static int param_init(AVFilterContext *ctx) > > > s->bias[i] = s->delta; > > > } > > > } else if (!strcmp(ctx->filter->name, "sobel")) { > > > - for (i = 0; i < 4; i++) { > > > - s->filter[i] = filter_sobel; > > > - s->copy[i] = !((1 << i) & s->planes); > > > - s->size[i] = 3; > > > - s->setup[i] = setup_3x3; > > > - s->rdiv[i] = s->scale; > > > - s->bias[i] = s->delta; > > > - } > > > + ff_sobel_init(s, s->depth, s->nb_planes); > > > } else if (!strcmp(ctx->filter->name, "kirsch")) { > > > for (i = 0; i < 4; i++) { > > > s->filter[i] = filter_kirsch; > > > @@ -832,18 +783,6 @@ static int param_init(AVFilterContext *ctx) > > > } > > > } > > > > > > - s->depth = desc->comp[0].depth; > > > - s->max = (1 << s->depth) - 1; > > > - > > > - s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc- > > > > 
log2_chroma_w); > > > - s->planewidth[0] = s->planewidth[3] = inlink->w; > > > - s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, > > > desc- > > > > log2_chroma_h); > > > - s->planeheight[0] = s->planeheight[3] = inlink->h; > > > - > > > - s->nb_planes = av_pix_fmt_count_planes(inlink->format); > > > - s->nb_threads = ff_filter_get_nb_threads(ctx); > > > - s->bpc = (s->depth + 7) / 8; > > > - > > > if (!strcmp(ctx->filter->name, "convolution")) { > > > if (s->depth > 8) { > > > for (p = 0; p < s->nb_planes; p++) { > > > @@ -870,10 +809,6 @@ static int param_init(AVFilterContext *ctx) > > > if (s->depth > 8) > > > for (p = 0; p < s->nb_planes; p++) > > > s->filter[p] = filter16_roberts; > > > - } else if (!strcmp(ctx->filter->name, "sobel")) { > > > - if (s->depth > 8) > > > - for (p = 0; p < s->nb_planes; p++) > > > - s->filter[p] = filter16_sobel; > > > } else if (!strcmp(ctx->filter->name, "kirsch")) { > > > if (s->depth > 8) > > > for (p = 0; p < s->nb_planes; p++) > > > diff --git a/libavfilter/x86/vf_convolution.asm > > > b/libavfilter/x86/vf_convolution.asm > > > index 754d4d1064..c912d56752 100644 > > > --- a/libavfilter/x86/vf_convolution.asm > > > +++ b/libavfilter/x86/vf_convolution.asm > > > @@ -22,6 +22,18 @@ > > > > > > SECTION_RODATA > > > half: dd 0.5 > > > +data_p1: dd 1 > > > +data_n1: dd -1 > > > +data_p2: dd 2 > > > +data_n2: dd -2 > > > + > > > +ALIGN 64 > > > +sobel_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, > > > 35, > > > 51 > > > + db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, > > > 39, > > > 55 > > > + db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, > > > 43, > > > 59 > > > + db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, > > > 47, > > > 63 > > > +sobel_mulA: db -1, 1, -2, 2 > > > +sobel_mulB: db 1, -1, 2, -2 > > > > > > SECTION .text > > > > > > @@ -154,3 +166,138 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, > > > bias, > > > matrix, ptr, c0, c1, c2, c > > > INIT_XMM 
sse4 > > > FILTER_3X3 > > > %endif > > > + > > > +%macro SOBEL_MUL 2 > > > + movzx ptrd, byte [c%1q + xq] > > > + imul ptrd, [%2] > > > + add rd, ptrd > > > +%endmacro > > > + > > > +%macro SOBEL_ADD 1 > > > + movzx ptrd, byte [c%1q + xq] > > > + add rd, ptrd > > > +%endmacro > > > + > > > +; void filter_sobel_avx512(uint8_t *dst, int width, > > > +; float scale, float delta, const int *const matrix, > > > +; const uint8_t *c[], int peak, int radius, > > > +; int dstride, int stride) > > > +%macro FILTER_SOBEL 0 > > > +%if UNIX64 > > > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, > > > c4, > > > c5, c6, c7, c8, r, x > > > +%else > > > +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, > > > c1, > > > c2, c3, c4, c5, c6, c7, c8, r, x > > > +%endif > > > +%if WIN64 > > > + SWAP xmm0, xmm2 > > > + SWAP xmm1, xmm3 > > > + mov r2q, matrixmp > > > + mov r3q, ptrmp > > > + DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, > > > c8, > > > r, x > > > +%endif > > > + movsxdifnidn widthq, widthd > > > + VBROADCASTSS m0, xmm0 > > > + VBROADCASTSS m1, xmm1 > > > + pxor m6, m6 > > > + mov c0q, [ptrq + 0*gprsize] > > > + mov c1q, [ptrq + 1*gprsize] > > > + mov c2q, [ptrq + 2*gprsize] > > > + mov c3q, [ptrq + 3*gprsize] > > > + mov c4q, [ptrq + 4*gprsize] > > > + mov c5q, [ptrq + 5*gprsize] > > > + mov c6q, [ptrq + 6*gprsize] > > > + mov c7q, [ptrq + 7*gprsize] > > > + mov c8q, [ptrq + 8*gprsize] > > > + > > > + xor xq, xq > > > + cmp widthq, mmsize/4 > > > + jl .loop2 > > > + > > > + mov rq, widthq > > > + and rq, mmsize/4-1 > > > + sub widthq, rq > > > + > > > + mova m6, [sobel_perm] > > > +.loop1: > > > + movu xm3, [c2q + xq] > > > + pmovzxbd m5, [c0q + xq] > > > + vinserti32x4 ym3, [c6q + xq], 1 > > > + pmovzxbd m4, [c8q + xq] > > > + vinserti32x4 m2, m3, [c1q + xq], 2 > > > + vinserti32x4 m3, [c5q + xq], 2 > > > + vinserti32x4 m2, [c7q + xq], 3 > > > + vinserti32x4 m3, [c3q + xq], 3 > > > + vpermb m2, m6, 
m2 > > > + psubd m4, m5 > > > + vpermb m3, m6, m3 > > > + mova m5, m4 > > > + vpdpbusd m4, m2, [sobel_mulA] {1to16} > > > + vpdpbusd m5, m3, [sobel_mulB] {1to16} > > > + > > > + cvtdq2ps m4, m4 > > > + mulps m4, m4 > > > + > > > + cvtdq2ps m5, m5 > > > + VFMADD231PS m4, m5, m5 > > > + > > > + sqrtps m4, m4 > > > + fmaddps m4, m4, m0, m1 > > > + cvttps2dq m4, m4 > > > + vpmovusdb [dstq + xq], m4 > > > + > > > + add xq, mmsize/4 > > > + cmp xq, widthq > > > + jl .loop1 > > > + > > > + add widthq, rq > > > + cmp xq, widthq > > > + jge .end > > > + > > > +.loop2: > > > + xor rd, rd > > > + pxor m4, m4 > > > + > > > + ;Gx > > > + SOBEL_MUL 0, data_n1 > > > + SOBEL_MUL 1, data_n2 > > > + SOBEL_MUL 2, data_n1 > > > + SOBEL_ADD 6 > > > + SOBEL_MUL 7, data_p2 > > > + SOBEL_ADD 8 > > > + > > > + cvtsi2ss xmm4, rd > > > + mulss xmm4, xmm4 > > > + > > > + xor rd, rd > > > + ;Gy > > > + SOBEL_MUL 0, data_n1 > > > + SOBEL_ADD 2 > > > + SOBEL_MUL 3, data_n2 > > > + SOBEL_MUL 5, data_p2 > > > + SOBEL_MUL 6, data_n1 > > > + SOBEL_ADD 8 > > > + > > > + cvtsi2ss xmm5, rd > > > + fmaddss xmm4, xmm5, xmm5, xmm4 > > > + > > > + sqrtps xmm4, xmm4 > > > + fmaddss xmm4, xmm4, xmm0, xmm1 ;sum = sum * rdiv + bias > > > + cvttps2dq xmm4, xmm4 ; trunc to integer > > > + packssdw xmm4, xmm4 > > > + packuswb xmm4, xmm4 > > > + movd rd, xmm4 > > > + mov [dstq + xq], rb > > > + > > > + add xq, 1 > > > + cmp xq, widthq > > > + jl .loop2 > > > +.end: > > > + RET > > > +%endmacro > > > + > > > +%if ARCH_X86_64 > > > +%if HAVE_AVX512ICL_EXTERNAL > > > +INIT_ZMM avx512icl > > > +FILTER_SOBEL > > > +%endif > > > +%endif > > > diff --git a/libavfilter/x86/vf_convolution_init.c > > > b/libavfilter/x86/vf_convolution_init.c > > > index b78a47d02b..bff10ca1a4 100644 > > > --- a/libavfilter/x86/vf_convolution_init.c > > > +++ b/libavfilter/x86/vf_convolution_init.c > > > @@ -29,6 +29,11 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width, > > > const uint8_t *c[], int peak, int radius, > > > int dstride, int 
stride, int size); > > > > > > +void ff_filter_sobel_avx512icl(uint8_t *dst, int width, > > > + float scale, float delta, const int *const > > > matrix, > > > + const uint8_t *c[], int peak, int radius, > > > + int dstride, int stride, int size); > > > + > > > av_cold void ff_convolution_init_x86(ConvolutionContext *s) > > > { > > > #if ARCH_X86_64 > > > @@ -44,3 +49,16 @@ av_cold void ff_convolution_init_x86(ConvolutionContext > > > *s) > > > } > > > #endif > > > } > > > + > > > +av_cold void ff_sobel_init_x86(ConvolutionContext *s, int depth, int > > > nb_planes) > > > +{ > > > +#if ARCH_X86_64 > > > + int cpu_flags = av_get_cpu_flags(); > > > + for (int i = 0; i < nb_planes; i++) { > > > + if (depth == 8) { > > > + if (EXTERNAL_AVX512ICL(cpu_flags)) > > > + s->filter[i] = ff_filter_sobel_avx512icl; > > > + } > > > + } > > > +#endif > > > +} > > > diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile > > > index 62d6616faf..a6f06c7007 100644 > > > --- a/tests/checkasm/Makefile > > > +++ b/tests/checkasm/Makefile > > > @@ -46,6 +46,7 @@ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o > > > AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o > > > AVFILTEROBJS-$(CONFIG_THRESHOLD_FILTER) += vf_threshold.o > > > AVFILTEROBJS-$(CONFIG_NLMEANS_FILTER) += vf_nlmeans.o > > > +AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o > > > > > > CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes) > > > > > > diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c > > > index 421bd096c5..3eb4780a64 100644 > > > --- a/tests/checkasm/checkasm.c > > > +++ b/tests/checkasm/checkasm.c > > > @@ -197,6 +197,9 @@ static const struct { > > > #if CONFIG_THRESHOLD_FILTER > > > { "vf_threshold", checkasm_check_vf_threshold }, > > > #endif > > > + #if CONFIG_SOBEL_FILTER > > > + { "vf_sobel", checkasm_check_vf_sobel }, > > > + #endif > > > #endif > > > #if CONFIG_SWSCALE > > > { "sw_gbrp", checkasm_check_sw_gbrp }, > > > diff --git a/tests/checkasm/checkasm.h 
b/tests/checkasm/checkasm.h > > > index ee9151410e..214918e7ea 100644 > > > --- a/tests/checkasm/checkasm.h > > > +++ b/tests/checkasm/checkasm.h > > > @@ -86,6 +86,7 @@ void checkasm_check_vf_eq(void); > > > void checkasm_check_vf_gblur(void); > > > void checkasm_check_vf_hflip(void); > > > void checkasm_check_vf_threshold(void); > > > +void checkasm_check_vf_sobel(void); > > > void checkasm_check_vp8dsp(void); > > > void checkasm_check_vp9dsp(void); > > > void checkasm_check_videodsp(void); > > > diff --git a/tests/checkasm/vf_convolution.c > > > b/tests/checkasm/vf_convolution.c > > > new file mode 100644 > > > index 0000000000..007865863e > > > --- /dev/null > > > +++ b/tests/checkasm/vf_convolution.c > > > @@ -0,0 +1,104 @@ > > > +/* > > > + * This file is part of FFmpeg. > > > + * > > > + * FFmpeg is free software; you can redistribute it and/or modify > > > + * it under the terms of the GNU General Public License as published by > > > + * the Free Software Foundation; either version 2 of the License, or > > > + * (at your option) any later version. > > > + * > > > + * FFmpeg is distributed in the hope that it will be useful, > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > > + * GNU General Public License for more details. > > > + * > > > + * You should have received a copy of the GNU General Public License > > > along > > > + * with FFmpeg; if not, write to the Free Software Foundation, Inc., > > > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
> > > + */ > > > + > > > +#include <string.h> > > > +#include "checkasm.h" > > > +#include "libavfilter/avfilter.h" > > > +#include "libavfilter/convolution.h" > > > +#include "libavutil/intreadwrite.h" > > > +#include "libavutil/mem_internal.h" > > > + > > > +#define WIDTH 512 > > > +#define HEIGHT 512 > > > +#define SRC_STRIDE 512 > > > +#define PIXELS (WIDTH * HEIGHT) > > > + > > > +#define randomize_buffers(buf, size) \ > > > + do { \ > > > + int j; \ > > > + uint8_t *tmp_buf = (uint8_t *)buf;\ > > > + for (j = 0; j< size; j++) \ > > > + tmp_buf[j] = rnd() & 0xFF; \ > > > + } while (0) > > > + > > > +static void check_sobel(const char * report_name) > > > +{ > > > + LOCAL_ALIGNED_32(uint8_t, src, [PIXELS]); > > > + LOCAL_ALIGNED_32(uint8_t, dst_ref, [PIXELS]); > > > + LOCAL_ALIGNED_32(uint8_t, dst_new, [PIXELS]); > > > + const int height = WIDTH; > > > + const int width = HEIGHT; > > > + const int stride = SRC_STRIDE; > > > + const int dstride = SRC_STRIDE; > > > + int mode = 0; > > > + const uint8_t *c[49]; > > > + const int radius = 1; > > > + const int bpc = 1; > > > + const int step = mode == MATRIX_COLUMN ? 16 : 1; > > > + const int slice_start = 0; > > > + const int slice_end = height; > > > + int y; > > > + const int sizew = mode == MATRIX_COLUMN ? 
height : width; > > > + float scale = 2; > > > + float delta = 10; > > > + > > > + ConvolutionContext s; > > > + > > > + declare_func(void, uint8_t *dst, int width, float scale, float delta, > > > const int *const matrix, > > > + const uint8_t *c[], int peak, int radius, int dstride, > > > int > > > stride, int size); > > > + > > > + s.scale = scale; > > > + s.delta = delta; > > > + s.depth = 8; > > > + s.nb_planes = 3; > > > + s.planes = 15; > > > + ff_sobel_init(&s, s.depth, s.nb_planes); > > > + > > > + memset(dst_ref, 0, PIXELS); > > > + memset(dst_new, 0, PIXELS); > > > + randomize_buffers(src, PIXELS); > > > + > > > + if (check_func(s.filter[0], "%s", report_name)) { > > > + for (y = slice_start; y < slice_end; y += step) { > > > + const int xoff = mode == MATRIX_COLUMN ? (y - slice_start) * > > > bpc > > > : radius * bpc; > > > + const int yoff = mode == MATRIX_COLUMN ? radius * dstride : > > > 0; > > > + > > > + s.setup[0](radius, c, src, stride, radius, width, y, height, > > > bpc); > > > + call_ref(dst_ref + yoff + xoff, sizew - 2 * radius, > > > + scale, delta, NULL, c, 0, radius, > > > + dstride, stride, slice_end - step); > > > + call_new(dst_new + yoff + xoff, sizew - 2 * radius, > > > + scale, delta, NULL, c, 0, radius, > > > + dstride, stride, slice_end - step); > > > + if (memcmp(dst_ref + yoff + xoff, dst_new + yoff + xoff, > > > slice_end - step)) > > > + fail(); > > > + bench_new(dst_new + yoff + xoff, sizew - 2 * radius, > > > + scale, delta, NULL, c, 0, radius, > > > + dstride, stride, slice_end - step); > > > + if (mode != MATRIX_COLUMN) > > > + dst_ref += dstride; > > > + } > > > + } > > > + > > > +} > > > + > > > +void checkasm_check_vf_sobel(void) > > > +{ > > > + check_sobel("sobel"); > > > + report("convolution:sobel"); > > > +} > > > diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak > > > index aa9b288e12..a4e95541f5 100644 > > > --- a/tests/fate/checkasm.mak > > > +++ b/tests/fate/checkasm.mak > > > @@ -43,6 +43,7 @@ 
FATE_CHECKASM = fate-checkasm- > > > aacpsdsp \ > > > fate-checkasm-vf_hflip \ > > > fate-checkasm-vf_nlmeans \ > > > fate-checkasm-vf_threshold \ > > > + fate-checkasm-vf_sobel \ > > > fate-checkasm-videodsp \ > > > fate-checkasm-vorbisdsp \ > > > fate-checkasm-vp8dsp \ > > > > LGTM and it works well for me, I saw a significant FPS improvement when > > running > > the command below. > > > > $ ffmpeg -i 1920x1080.mp4 -vf "sobel" -f null - > > > > Does anyone else have any thought on this patch? I will merge it if there are > no > more comments. Pushed, -Haihao _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-04 8:29 [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI bin.wang-at-intel.com 2022-11-07 5:24 ` Xiang, Haihao @ 2022-11-14 2:42 ` James Almer 2022-11-14 5:58 ` Wang, Bin 2022-11-14 12:54 ` James Almer 2 siblings, 1 reply; 13+ messages in thread From: James Almer @ 2022-11-14 2:42 UTC (permalink / raw) To: ffmpeg-devel On 11/4/2022 5:29 AM, bin.wang-at-intel.com@ffmpeg.org wrote: > +%macro FILTER_SOBEL 0 > +%if UNIX64 > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x > +%else > +cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x > +%endif > +%if WIN64 > + SWAP xmm0, xmm2 > + SWAP xmm1, xmm3 > + mov r2q, matrixmp > + mov r3q, ptrmp > + DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x > +%endif > + movsxdifnidn widthq, widthd > + VBROADCASTSS m0, xmm0 > + VBROADCASTSS m1, xmm1 This and every other xmm# case should instead be xm#, to ensure the swapping is taken into account. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-14 2:42 ` James Almer @ 2022-11-14 5:58 ` Wang, Bin 2022-11-14 11:34 ` James Almer 0 siblings, 1 reply; 13+ messages in thread From: Wang, Bin @ 2022-11-14 5:58 UTC (permalink / raw) To: FFmpeg development discussions and patches -----Original Message----- From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of James Almer Sent: Monday, November 14, 2022 10:43 AM To: ffmpeg-devel@ffmpeg.org Subject: Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI On 11/4/2022 5:29 AM, bin.wang-at-intel.com@ffmpeg.org wrote: > +%macro FILTER_SOBEL 0 > +%if UNIX64 > +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, > +c3, c4, c5, c6, c7, c8, r, x %else cglobal filter_sobel, 4, 15, 7, > +dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, > +c8, r, x %endif %if WIN64 > + SWAP xmm0, xmm2 > + SWAP xmm1, xmm3 > + mov r2q, matrixmp > + mov r3q, ptrmp > + DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, > +c7, c8, r, x %endif > + movsxdifnidn widthq, widthd > + VBROADCASTSS m0, xmm0 > + VBROADCASTSS m1, xmm1 > + This and every other xmm# case should instead be xm#, to ensure the swapping is taken into account. Sorry, I don't get your point. Could you please explain why I have to use xm# to ensure the swapping operation works (SWAP with xmm# can't work in WIN64 asm)? And how should I do it? _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-14 5:58 ` Wang, Bin @ 2022-11-14 11:34 ` James Almer 0 siblings, 0 replies; 13+ messages in thread From: James Almer @ 2022-11-14 11:34 UTC (permalink / raw) To: ffmpeg-devel On 11/14/2022 2:58 AM, Wang, Bin wrote: > -----Original Message----- > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of James Almer > Sent: Monday, November 14, 2022 10:43 AM > To: ffmpeg-devel@ffmpeg.org > Subject: Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI > > On 11/4/2022 5:29 AM, bin.wang-at-intel.com@ffmpeg.org wrote: >> +%macro FILTER_SOBEL 0 >> +%if UNIX64 >> +cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, >> +c3, c4, c5, c6, c7, c8, r, x %else cglobal filter_sobel, 4, 15, 7, >> +dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, >> +c8, r, x %endif %if WIN64 >> + SWAP xmm0, xmm2 >> + SWAP xmm1, xmm3 >> + mov r2q, matrixmp >> + mov r3q, ptrmp >> + DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, >> +c7, c8, r, x %endif >> + movsxdifnidn widthq, widthd >> + VBROADCASTSS m0, xmm0 >> + VBROADCASTSS m1, xmm1 > >> + This and every other xmm# case should instead be xm#, to ensure the swapping is taken into account. > > Sorry, I can't get your point, could you please help to explain why I have to use xm# to ensure the swapping operation(swap xmm# can't work in WIN64 asm)? And How to do it ? SWAP only affects the x86inc defined macros m#, xm#, ym#, and zm#, so those instructions above end up encoded as vbroadcastss zmm2, xmm0 and vbroadcastss zmm3, xmm1 on WIN64. In fact, now that I check it, they end up as vbroadcastss zmm18, xmm0 and vbroadcastss zmm19, xmm1 because x86inc is purposely using the higher 16 regs with these macros on all targets to avoid having to call vzeroupper at the end. 
This works on unix64 by pure chance because the floats were effectively in xmm0 and xmm1 and all calculations then happen on m#, xm# and ym#. So you'll have to duplicate the VBROADCASTSS lines to broadcast xmm2 and xmm3 to m0 and m1 on WIN64 instead of using SWAP. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-04 8:29 [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI bin.wang-at-intel.com 2022-11-07 5:24 ` Xiang, Haihao 2022-11-14 2:42 ` James Almer @ 2022-11-14 12:54 ` James Almer 2022-11-14 13:30 ` Wang, Bin 2 siblings, 1 reply; 13+ messages in thread From: James Almer @ 2022-11-14 12:54 UTC (permalink / raw) To: ffmpeg-devel On 11/4/2022 5:29 AM, bin.wang-at-intel.com@ffmpeg.org wrote: > +.loop2: > + xor rd, rd > + pxor m4, m4 > + > + ;Gx > + SOBEL_MUL 0, data_n1 > + SOBEL_MUL 1, data_n2 > + SOBEL_MUL 2, data_n1 > + SOBEL_ADD 6 > + SOBEL_MUL 7, data_p2 > + SOBEL_ADD 8 > + > + cvtsi2ss xmm4, rd > + mulss xmm4, xmm4 > + > + xor rd, rd > + ;Gy > + SOBEL_MUL 0, data_n1 > + SOBEL_ADD 2 > + SOBEL_MUL 3, data_n2 > + SOBEL_MUL 5, data_p2 > + SOBEL_MUL 6, data_n1 > + SOBEL_ADD 8 > + > + cvtsi2ss xmm5, rd > + fmaddss xmm4, xmm5, xmm5, xmm4 > + > + sqrtps xmm4, xmm4 > + fmaddss xmm4, xmm4, xmm0, xmm1 ;sum = sum * rdiv + bias By using xmm# you're not taking into account any x86inc SWAPing, so this is using xmm0 and xmm1 where the single scalar float input arguments reside (at least on unix64), instead of xm0 and xm1 (xmm16 and xmm17) where the broadcasted scalars were stored. This, again, only worked by chance on unix64 because you're using scalar fmadd, and shouldn't work at all on win64. Also, all of these are currently being encoded as VEX, not EVEX, but it should be fine leaving them untouched instead of using xm#, since they will be shorter (five bytes instead of six for some) by using the lower, non callee-saved regs. 
> + cvttps2dq xmm4, xmm4 ; trunc to integer > + packssdw xmm4, xmm4 > + packuswb xmm4, xmm4 > + movd rd, xmm4 > + mov [dstq + xq], rb > + > + add xq, 1 > + cmp xq, widthq > + jl .loop2 > +.end: > + RET _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-14 12:54 ` James Almer @ 2022-11-14 13:30 ` Wang, Bin 2022-11-14 13:35 ` James Almer 0 siblings, 1 reply; 13+ messages in thread From: Wang, Bin @ 2022-11-14 13:30 UTC (permalink / raw) To: FFmpeg development discussions and patches > By using xmm# you're not taking into account any x86inc SWAPing, so this is > using xmm0 and xmm1 where the single scalar float input arguments reside (at > least on unix64), instead of xm0 and xm1 (xmm16 and xmm17) where the > broadcasted scalars were stored. > This, again, only worked by chance on unix64 because you're using scalar fmadd, > and shouldn't work at all on win64. > > Also, all these as is are being encoded as VEX, not EVEX, but it should be fine > leaving them untouched instead of using xm#, since they will be shorter (five > bytes instead of six for some) by using the lower, non callee-saved regs. Thanks for the help. I'm not familiar with WIN64 asm. So what I need to do is change the WIN64 swap from: SWAP xmm0, xmm2 SWAP xmm1, xmm3 To: VBROADCASTSS m0, xmm2 VBROADCASTSS m1, xmm3 Is that correct? _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-14 13:30 ` Wang, Bin @ 2022-11-14 13:35 ` James Almer 2022-11-14 13:54 ` Wang, Bin 0 siblings, 1 reply; 13+ messages in thread From: James Almer @ 2022-11-14 13:35 UTC (permalink / raw) To: ffmpeg-devel On 11/14/2022 10:30 AM, Wang, Bin wrote: >> By using xmm# you're not taking into account any x86inc SWAPing, so this is >> using xmm0 and xmm1 where the single scalar float input arguments reside (at >> least on unix64), instead of xm0 and xm1 (xmm16 and xmm17) where the >> broadcasted scalars were stored. >> This, again, only worked by chance on unix64 because you're using scalar fmadd, >> and shouldn't work at all on win64. >> >> Also, all these as is are being encoded as VEX, not EVEX, but it should be fine >> leaving them untouched instead of using xm#, since they will be shorter (five >> bytes instead of six for some) by using the lower, non callee-saved regs. > > Thanks for the help. I'm not familiar with WIN64 asm. So what I need to do is change the WIN64 swap from: > SWAP xmm0, xmm2 > SWAP xmm1, xmm3 > To: > VBROADCASTSS m0, xmm2 > VBROADCASTSS m1, xmm3 > > Is that correct? Yes, that will ultimately broadcast the two scalars in xmm2 and xmm3 to zmm16 and zmm17. After that what you need to do is either change the fmaddss instruction to use xm0 and xm1 macros instead of xmm0 and xmm1 (so xmm16 and xmm17 with EVEX encoding is used), or much like the broadcast above use xmm2 and xmm3 explicitly on win64, so it remains VEX encoded. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-14 13:35 ` James Almer @ 2022-11-14 13:54 ` Wang, Bin 2022-11-14 14:31 ` James Almer 0 siblings, 1 reply; 13+ messages in thread From: Wang, Bin @ 2022-11-14 13:54 UTC (permalink / raw) To: FFmpeg development discussions and patches > -----Original Message----- > From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of James > Almer > Sent: Monday, November 14, 2022 9:36 PM > To: ffmpeg-devel@ffmpeg.org > Subject: Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add > sobel filter optimization and unit test with intel AVX512 VNNI > > On 11/14/2022 10:30 AM, Wang, Bin wrote: > >> By using xmm# you're not taking into account any x86inc SWAPing, so > >> this is using xmm0 and xmm1 where the single scalar float input > >> arguments reside (at least on unix64), instead of xm0 and xm1 (xmm16 > >> and xmm17) where the broadcasted scalars were stored. > >> This, again, only worked by chance on unix64 because you're using > >> scalar fmadd, and shouldn't work at all on win64. > >> > >> Also, all these as is are being encoded as VEX, not EVEX, but it > >> should be fine leaving them untouched instead of using xm#, since > >> they will be shorter (five bytes instead of six for some) by using the lower, > non callee-saved regs. > > > > Thanks for the help. I'm not familiar with WIN64 asm. So what I need to do is > change the WIN64 swap from: > > SWAP xmm0, xmm2 > > SWAP xmm1, xmm3 > > To: > > VBROADCASTSS m0, xmm2 > > VBROADCASTSS m1, xmm3 > > > > Is that correct? > > Yes, that will ultimately broadcast the two scalars in xmm2 and xmm3 to > zmm16 and zmm17. 
> After that what you need to do is either change the fmaddss instruction to use > xm0 and xm1 macros instead of xmm0 and xmm1 (so xmm16 and xmm17 with > EVEX encoding is used), or much like the broadcast above use xmm2 and xmm3 > explicitly on win64, so it remains VEX encoded. So, to fix the issue, do these 2 changes look good to you? First, change the WIN64 swap from: SWAP xmm0, xmm2 SWAP xmm1, xmm3 To: VBROADCASTSS m0, xmm2 VBROADCASTSS m1, xmm3 Second, change the fmaddss from: fmaddss xmm4, xmm4, xmm0, xmm1 To: fmaddss xmm4, xmm4, xm0, xm1 > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org > with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-14 13:54 ` Wang, Bin @ 2022-11-14 14:31 ` James Almer 2022-11-14 15:18 ` Wang, Bin 0 siblings, 1 reply; 13+ messages in thread From: James Almer @ 2022-11-14 14:31 UTC (permalink / raw) To: ffmpeg-devel On 11/14/2022 10:54 AM, Wang, Bin wrote: > > >> -----Original Message----- >> From: ffmpeg-devel <ffmpeg-devel-bounces@ffmpeg.org> On Behalf Of James >> Almer >> Sent: Monday, November 14, 2022 9:36 PM >> To: ffmpeg-devel@ffmpeg.org >> Subject: Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add >> sobel filter optimization and unit test with intel AVX512 VNNI >> >> On 11/14/2022 10:30 AM, Wang, Bin wrote: >>>> By using xmm# you're not taking into account any x86inc SWAPing, so >>>> this is using xmm0 and xmm1 where the single scalar float input >>>> arguments reside (at least on unix64), instead of xm0 and xm1 (xmm16 >>>> and xmm17) where the broadcasted scalars were stored. >>>> This, again, only worked by chance on unix64 because you're using >>>> scalar fmadd, and shouldn't work at all on win64. >>>> >>>> Also, all these as is are being encoded as VEX, not EVEX, but it >>>> should be fine leaving them untouched instead of using xm#, since >>>> they will be shorter (five bytes instead of six for some) by using the lower, >> non callee-saved regs. >>> >>> Thanks for the help. I'm not familiar with WIN64 asm. So what I need to do is >> change the WIN64 swap from: >>> SWAP xmm0, xmm2 >>> SWAP xmm1, xmm3 >>> To: >>> VBROADCASTSS m0, xmm2 >>> VBROADCASTSS m1, xmm3 >>> >>> Is that correct? >> >> Yes, that will ultimately broadcast the two scalars in xmm2 and xmm3 to >> zmm16 and zmm17. 
>> After that what you need to do is either change the fmaddss instruction to use >> xm0 and xm1 macros instead of xmm0 and xmm1 (so xmm16 and xmm17 with >> EVEX encoding is used), or much like the broadcast above use xmm2 and xmm3 >> explicitly on win64, so it remains VEX encoded. > > So, to fix the issue, does this 2 changes looks good for you? > First change the WIN64 swap from: > SWAP xmm0, xmm2 > SWAP xmm1, xmm3 > To: > VBROADCASTSS m0, xmm2 > VBROADCASTSS m1, xmm3 > > Second change the fmaddss from: > fmaddss xmm4, xmm4, xmm0, xmm1 > To: > fmaddss xmm4, xmm4, xm0, xm1 Yes. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI 2022-11-14 14:31 ` James Almer @ 2022-11-14 15:18 ` Wang, Bin 0 siblings, 0 replies; 13+ messages in thread From: Wang, Bin @ 2022-11-14 15:18 UTC (permalink / raw) To: FFmpeg development discussions and patches > >> On 11/14/2022 10:30 AM, Wang, Bin wrote: > >>>> By using xmm# you're not taking into account any x86inc SWAPing, so > >>>> this is using xmm0 and xmm1 where the single scalar float input > >>>> arguments reside (at least on unix64), instead of xm0 and xm1 > >>>> (xmm16 and xmm17) where the broadcasted scalars were stored. > >>>> This, again, only worked by chance on unix64 because you're using > >>>> scalar fmadd, and shouldn't work at all on win64. > >>>> > >>>> Also, all these as is are being encoded as VEX, not EVEX, but it > >>>> should be fine leaving them untouched instead of using xm#, since > >>>> they will be shorter (five bytes instead of six for some) by using > >>>> the lower, > >> non callee-saved regs. > >>> > >>> Thanks for the help. I'm not familiar with WIN64 asm. So what I need > >>> to do is > >> change the WIN64 swap from: > >>> SWAP xmm0, xmm2 > >>> SWAP xmm1, xmm3 > >>> To: > >>> VBROADCASTSS m0, xmm2 > >>> VBROADCASTSS m1, xmm3 > >>> > >>> Is that correct? > >> > >> Yes, that will ultimately broadcast the two scalars in xmm2 and xmm3 > >> to > >> zmm16 and zmm17. > >> After that what you need to do is either change the fmaddss > >> instruction to use > >> xm0 and xm1 macros instead of xmm0 and xmm1 (so xmm16 and xmm17 > with > >> EVEX encoding is used), or much like the broadcast above use xmm2 and > >> xmm3 explicitly on win64, so it remains VEX encoded. > > > > So, to fix the issue, does this 2 changes looks good for you? 
> > First change the WIN64 swap from: > > SWAP xmm0, xmm2 > > SWAP xmm1, xmm3 > > To: > > VBROADCASTSS m0, xmm2 > > VBROADCASTSS m1, xmm3 > > > > Second change the fmaddss from: > > fmaddss xmm4, xmm4, xmm0, xmm1 > > To: > > fmaddss xmm4, xmm4, xm0, xm1 > > Yes. I appreciate your help. I have submitted a new patch here: https://patchwork.ffmpeg.org/project/ffmpeg/patch/20221114143551.9740-1-bin.wang@intel.com/ > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org > with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2022-11-14 15:22 UTC | newest] Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-11-04 8:29 [FFmpeg-devel] [PATCH v7] libavfilter/x86/vf_convolution: add sobel filter optimization and unit test with intel AVX512 VNNI bin.wang-at-intel.com 2022-11-07 5:24 ` Xiang, Haihao 2022-11-11 3:00 ` Xiang, Haihao 2022-11-14 2:12 ` Xiang, Haihao 2022-11-14 2:42 ` James Almer 2022-11-14 5:58 ` Wang, Bin 2022-11-14 11:34 ` James Almer 2022-11-14 12:54 ` James Almer 2022-11-14 13:30 ` Wang, Bin 2022-11-14 13:35 ` James Almer 2022-11-14 13:54 ` Wang, Bin 2022-11-14 14:31 ` James Almer 2022-11-14 15:18 ` Wang, Bin
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git