* [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_deblock: add luma and chroma full @ 2024-02-21 11:10 J. Dekker 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON J. Dekker 0 siblings, 2 replies; 10+ messages in thread From: J. Dekker @ 2024-02-21 11:10 UTC (permalink / raw) To: ffmpeg-devel Signed-off-by: J. Dekker <jdek@itanimul.li> --- tests/checkasm/hevc_deblock.c | 246 +++++++++++++++++++++++++++++----- 1 file changed, 215 insertions(+), 31 deletions(-) diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c index 66fc8d5646..91e57f5cf5 100644 --- a/tests/checkasm/hevc_deblock.c +++ b/tests/checkasm/hevc_deblock.c @@ -19,9 +19,9 @@ #include <string.h> #include "libavutil/intreadwrite.h" +#include "libavutil/macros.h" #include "libavutil/mem_internal.h" -#include "libavcodec/avcodec.h" #include "libavcodec/hevcdsp.h" #include "checkasm.h" @@ -29,10 +29,11 @@ static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff }; #define SIZEOF_PIXEL ((bit_depth + 7) / 8) -#define BUF_STRIDE (8 * 2) -#define BUF_LINES (8) -#define BUF_OFFSET (BUF_STRIDE * BUF_LINES) -#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2) +#define BUF_STRIDE (16 * 2) +#define BUF_LINES (16) +// large buffer sizes based on high bit depth +#define BUF_OFFSET (2 * BUF_STRIDE * BUF_LINES) +#define BUF_SIZE (2 * BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2) #define randomize_buffers(buf0, buf1, size) \ do { \ @@ -45,57 +46,240 @@ static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff }; } \ } while (0) -static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth) +static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth, int c) { - int32_t tc[2] = { 0, 0 }; + // see tctable[] in hevc_filter.c, we check full range + int32_t tc[2] = { rnd() % 25, rnd() % 25 }; // no_p, no_q can only be { 0,0 } for the simpler assembly (non *_c // variant) functions, see deblocking_filter_CTB() in hevc_filter.c - uint8_t no_p[2] = { 0, 0 }; - uint8_t no_q[2] = { 0, 0 }; + uint8_t no_p[2] = { rnd() & c, rnd() & c }; + uint8_t no_q[2] = { rnd() & c, rnd() & c }; LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); declare_func(void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); - if (check_func(h->hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma%d", bit_depth)) { - for (int i = 0; i < 4; i++) { - randomize_buffers(buf0, buf1, BUF_SIZE); - // see betatable[] in hevc_filter.c - tc[0] = (rnd() & 63) + (rnd() & 1); - tc[1] = (rnd() & 63) + (rnd() & 1); + if (check_func(c ? h->hevc_h_loop_filter_chroma_c : h->hevc_h_loop_filter_chroma, + "hevc_h_loop_filter_chroma%d%s", bit_depth, c ? "_full" : "")) + { + randomize_buffers(buf0, buf1, BUF_SIZE); - call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); - call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); + call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); + call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); + if (memcmp(buf0, buf1, BUF_SIZE)) + fail(); + bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); + } + + if (check_func(c ? h->hevc_v_loop_filter_chroma_c : h->hevc_v_loop_filter_chroma, + "hevc_v_loop_filter_chroma%d%s", bit_depth, c ? "_full" : "")) + { + randomize_buffers(buf0, buf1, BUF_SIZE); + + call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); + call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); + if (memcmp(buf0, buf1, BUF_SIZE)) + fail(); + bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); + } +} + +#define P3 buf[-4 * xstride] +#define P2 buf[-3 * xstride] +#define P1 buf[-2 * xstride] +#define P0 buf[-1 * xstride] +#define Q0 buf[0 * xstride] +#define Q1 buf[1 * xstride] +#define Q2 buf[2 * xstride] +#define Q3 buf[3 * xstride] + +#define TC25(x) ((tc[x] * 5 + 1) >> 1) +#define MASK(x) (uint16_t)(x & ((1 << (bit_depth)) - 1)) +#define GET(x) ((SIZEOF_PIXEL == 1) ? *(uint8_t*)(&x) : *(uint16_t*)(&x)) +#define SET(x, y) do { \ + uint16_t z = MASK(y); \ + if (SIZEOF_PIXEL == 1) \ + *(uint8_t*)(&x) = z; \ + else \ + *(uint16_t*)(&x) = z; \ +} while (0) +#define RANDCLIP(x, diff) av_clip(GET(x) - (diff), 0, \ + (1 << (bit_depth)) - 1) + rnd() % FFMAX(2 * (diff), 1) + +// NOTE: this function doesn't work 'correctly' in that it won't always choose +// strong/strong or weak/weak, in most cases it tends to but will sometimes mix +// weak/strong or even skip sometimes. This is more useful to test correctness +// for these functions, though it does make benching them difficult. The easiest +// way to bench these functions is to check an overall decode since there are too +// many paths and ways to trigger the deblock: we would have to bench all +// permutations of weak/strong/skip/nd_q/nd_p/no_q/no_p and it quickly becomes +// too much. +static void randomize_luma_buffers(int type, int *beta, int32_t tc[2], + uint8_t *buf, ptrdiff_t xstride, ptrdiff_t ystride, int bit_depth) +{ + int i, j, b3, tc25, tc25diff, b3diff; + // both tc & beta are unscaled inputs + // minimum useful value is 1, full range 0-24 + tc[0] = (rnd() % 25) + 1; + tc[1] = (rnd() % 25) + 1; + // minimum useful value for 8bit is 8 + *beta = (rnd() % 57) + 8; + + switch (type) { + case 0: // strong + for (j = 0; j < 2; j++) { + tc25 = TC25(j) << (bit_depth - 8); + tc25diff = FFMAX(tc25 - 1, 0); + // 4 lines per tc + for (i = 0; i < 4; i++) { + b3 = (*beta << (bit_depth - 8)) >> 3; + + SET(P0, rnd() % (1 << bit_depth)); + SET(Q0, RANDCLIP(P0, tc25diff)); + + // p3 - p0 up to beta3 budget + b3diff = rnd() % b3; + SET(P3, RANDCLIP(P0, b3diff)); + // q3 - q0, reduced budget + b3diff = rnd() % FFMAX(b3 - b3diff, 1); + SET(Q3, RANDCLIP(Q0, b3diff)); + + // same concept, budget across 4 pixels + b3 -= b3diff = rnd() % FFMAX(b3, 1); + SET(P2, RANDCLIP(P0, b3diff)); + b3 -= b3diff = rnd() % FFMAX(b3, 1); + SET(Q2, RANDCLIP(Q0, b3diff)); + + // extra reduced budget for weighted pixels + b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1); + SET(P1, RANDCLIP(P0, b3diff)); + b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1); + SET(Q1, RANDCLIP(Q0, b3diff)); + + buf += ystride; + } + } + break; + case 1: // weak + for (j = 0; j < 2; j++) { + tc25 = TC25(j) << (bit_depth - 8); + tc25diff = FFMAX(tc25 - 1, 0); + // 4 lines per tc + for (i = 0; i < 4; i++) { + // Weak filtering is signficantly simpler to activate as + // we only need to satisfy d0 + d3 < beta, which + // can be simplified to d0 + d0 < beta. Using the above + // derivations but substiuting b3 for b1 and ensuring + // that P0/Q0 are at least 1/2 tc25diff apart (tending + // towards 1/2 range). + b3 = (*beta << (bit_depth - 8)) >> 1; + + SET(P0, rnd() % (1 << bit_depth)); + SET(Q0, RANDCLIP(P0, tc25diff >> 1) + + (tc25diff >> 1) * (P0 < (1 << (bit_depth - 1))) ? 1 : -1); + + // p3 - p0 up to beta3 budget + b3diff = rnd() % b3; + SET(P3, RANDCLIP(P0, b3diff)); + // q3 - q0, reduced budget + b3diff = rnd() % FFMAX(b3 - b3diff, 1); + SET(Q3, RANDCLIP(Q0, b3diff)); + + // same concept, budget across 4 pixels + b3 -= b3diff = rnd() % FFMAX(b3, 1); + SET(P2, RANDCLIP(P0, b3diff)); + b3 -= b3diff = rnd() % FFMAX(b3, 1); + SET(Q2, RANDCLIP(Q0, b3diff)); + + // extra reduced budget for weighted pixels + b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1); + SET(P1, RANDCLIP(P0, b3diff)); + b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1); + SET(Q1, RANDCLIP(Q0, b3diff)); + + buf += ystride; + } + } + break; + case 2: // none + *beta = 0; // ensure skip + for (i = 0; i < 8; i++) { + // we can just fill with completely random data, nothing should be touched. + SET(P3, rnd()); SET(P2, rnd()); SET(P1, rnd()); SET(P0, rnd()); + SET(Q0, rnd()); SET(Q1, rnd()); SET(Q2, rnd()); SET(Q3, rnd()); + buf += ystride; + } + break; + } +} + +static void check_deblock_luma(HEVCDSPContext *h, int bit_depth, int c) +{ + const char *type; + const char *types[3] = { "strong", "weak", "skip" }; + int beta; + int32_t tc[2] = {0}; + uint8_t no_p[2] = { rnd() & c, rnd() & c }; + uint8_t no_q[2] = { rnd() & c, rnd() & c }; + LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); + LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); + uint8_t *ptr0 = buf0 + BUF_OFFSET, + *ptr1 = buf1 + BUF_OFFSET; + + declare_func(void, uint8_t *pix, ptrdiff_t stride, int beta, int32_t *tc, uint8_t *no_p, uint8_t *no_q); + + for (int j = 0; j < 3; j++) { + type = types[j]; + if (check_func(c ? h->hevc_h_loop_filter_luma_c : h->hevc_h_loop_filter_luma, + "hevc_h_loop_filter_luma%d_%s%s", bit_depth, type, c ? "_full" : "")) + { + randomize_luma_buffers(j, &beta, tc, buf0 + BUF_OFFSET, 16 * SIZEOF_PIXEL, SIZEOF_PIXEL, bit_depth); + memcpy(buf1, buf0, BUF_SIZE); + + call_ref(ptr0, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q); + call_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q); if (memcmp(buf0, buf1, BUF_SIZE)) fail(); + bench_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q); } - bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); - } - if (check_func(h->hevc_v_loop_filter_chroma, "hevc_v_loop_filter_chroma%d", bit_depth)) { - for (int i = 0; i < 4; i++) { - randomize_buffers(buf0, buf1, BUF_SIZE); - // see betatable[] in hevc_filter.c - tc[0] = (rnd() & 63) + (rnd() & 1); - tc[1] = (rnd() & 63) + (rnd() & 1); + if (check_func(c ? h->hevc_v_loop_filter_luma_c : h->hevc_v_loop_filter_luma, + "hevc_v_loop_filter_luma%d_%s%s", bit_depth, type, c ? "_full" : "")) + { + randomize_luma_buffers(j, &beta, tc, buf0 + BUF_OFFSET, SIZEOF_PIXEL, 16 * SIZEOF_PIXEL, bit_depth); + memcpy(buf1, buf0, BUF_SIZE); - call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); - call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); + call_ref(ptr0, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q); + call_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q); if (memcmp(buf0, buf1, BUF_SIZE)) fail(); + bench_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q); } - bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); } } void checkasm_check_hevc_deblock(void) { + HEVCDSPContext h; int bit_depth; - for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { - HEVCDSPContext h; ff_hevc_dsp_init(&h, bit_depth); - check_deblock_chroma(&h, bit_depth); + check_deblock_chroma(&h, bit_depth, 0); } report("chroma"); + for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { + ff_hevc_dsp_init(&h, bit_depth); + check_deblock_chroma(&h, bit_depth, 1); + } + report("chroma_full"); + for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { + ff_hevc_dsp_init(&h, bit_depth); + check_deblock_luma(&h, bit_depth, 0); + } + report("luma"); + for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) { + ff_hevc_dsp_init(&h, bit_depth); + check_deblock_luma(&h, bit_depth, 1); + } + report("luma_full"); } -- 2.43.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock 2024-02-21 11:10 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_deblock: add luma and chroma full J. Dekker @ 2024-02-21 11:10 ` J. Dekker 2024-02-23 3:14 ` Andreas Rheinhardt 2024-02-23 12:45 ` Nuo Mi 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON J. Dekker 1 sibling, 2 replies; 10+ messages in thread From: J. Dekker @ 2024-02-21 11:10 UTC (permalink / raw) To: ffmpeg-devel Over/underflow in some cases. Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/x86/hevcdsp_init.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index 31e81eb11f..11cb1b3bfd 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2; - if (ARCH_X86_64) { - c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2; - c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2; - } + // FIXME: 12-bit luma deblock over/underflows in some cases + // if (ARCH_X86_64) { + // c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2; + // c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2; + // } SAO_BAND_INIT(12, sse2); SAO_EDGE_INIT(12, sse2); -- 2.43.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker @ 2024-02-23 3:14 ` Andreas Rheinhardt 2024-02-24 9:54 ` J. Dekker 2024-02-23 12:45 ` Nuo Mi 1 sibling, 1 reply; 10+ messages in thread From: Andreas Rheinhardt @ 2024-02-23 3:14 UTC (permalink / raw) To: ffmpeg-devel J. Dekker: > Over/underflow in some cases. > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/x86/hevcdsp_init.c | 9 +++++---- > 1 file changed, 5 insertions(+), 4 deletions(-) > > diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c > index 31e81eb11f..11cb1b3bfd 100644 > --- a/libavcodec/x86/hevcdsp_init.c > +++ b/libavcodec/x86/hevcdsp_init.c > @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) > if (EXTERNAL_SSE2(cpu_flags)) { > c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2; > c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2; > - if (ARCH_X86_64) { > - c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2; > - c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2; > - } > + // FIXME: 12-bit luma deblock over/underflows in some cases > + // if (ARCH_X86_64) { > + // c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2; > + // c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2; > + // } > SAO_BAND_INIT(12, sse2); > SAO_EDGE_INIT(12, sse2); > If you disable them here, you should also ensure that they are not assembled at all. - Andreas _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock 2024-02-23 3:14 ` Andreas Rheinhardt @ 2024-02-24 9:54 ` J. Dekker 0 siblings, 0 replies; 10+ messages in thread From: J. Dekker @ 2024-02-24 9:54 UTC (permalink / raw) To: ffmpeg-devel Andreas Rheinhardt <andreas.rheinhardt@outlook.com> writes: > J. Dekker: >> SAO_BAND_INIT(12, sse2); >> SAO_EDGE_INIT(12, sse2); >> > > If you disable them here, you should also ensure that they are not > assembled at all. > > - Andreas Sure, will do on push if no other things to resolve in the latest set. -- jd _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker 2024-02-23 3:14 ` Andreas Rheinhardt @ 2024-02-23 12:45 ` Nuo Mi 2024-02-24 9:49 ` J. Dekker 1 sibling, 1 reply; 10+ messages in thread From: Nuo Mi @ 2024-02-23 12:45 UTC (permalink / raw) To: FFmpeg development discussions and patches On Wed, Feb 21, 2024 at 7:10 PM J. Dekker <jdek@itanimul.li> wrote: > Over/underflow in some cases. > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/x86/hevcdsp_init.c | 9 +++++---- > 1 file changed, 5 insertions(+), 4 deletions(-) > > diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c > index 31e81eb11f..11cb1b3bfd 100644 > --- a/libavcodec/x86/hevcdsp_init.c > +++ b/libavcodec/x86/hevcdsp_init.c > @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const > int bit_depth) > if (EXTERNAL_SSE2(cpu_flags)) { > c->hevc_v_loop_filter_chroma = > ff_hevc_v_loop_filter_chroma_12_sse2; > c->hevc_h_loop_filter_chroma = > ff_hevc_h_loop_filter_chroma_12_sse2; > - if (ARCH_X86_64) { > - c->hevc_v_loop_filter_luma = > ff_hevc_v_loop_filter_luma_12_sse2; > - c->hevc_h_loop_filter_luma = > ff_hevc_h_loop_filter_luma_12_sse2; > - } > + // FIXME: 12-bit luma deblock over/underflows in some cases > + // if (ARCH_X86_64) { > + // c->hevc_v_loop_filter_luma = > ff_hevc_v_loop_filter_luma_12_sse2; > + // c->hevc_h_loop_filter_luma = > ff_hevc_h_loop_filter_luma_12_sse2; > + // } > SAO_BAND_INIT(12, sse2); > SAO_EDGE_INIT(12, sse2); > Hi Dekker, VVC will utilize this function as well. Could you please share the HEVC clip or data that caused the overflow? We'll make efforts to address it during the VVC porting Thank you. > > -- > 2.43.2 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock 2024-02-23 12:45 ` Nuo Mi @ 2024-02-24 9:49 ` J. Dekker 2024-02-24 10:46 ` Martin Storsjö 0 siblings, 1 reply; 10+ messages in thread From: J. Dekker @ 2024-02-24 9:49 UTC (permalink / raw) To: ffmpeg-devel Nuo Mi <nuomi2021@gmail.com> writes: > On Wed, Feb 21, 2024 at 7:10 PM J. Dekker <jdek@itanimul.li> wrote: > >> Over/underflow in some cases. >> >> Signed-off-by: J. Dekker <jdek@itanimul.li> >> --- >> libavcodec/x86/hevcdsp_init.c | 9 +++++---- >> 1 file changed, 5 insertions(+), 4 deletions(-) >> >> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c >> index 31e81eb11f..11cb1b3bfd 100644 >> --- a/libavcodec/x86/hevcdsp_init.c >> +++ b/libavcodec/x86/hevcdsp_init.c >> @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const >> int bit_depth) >> if (EXTERNAL_SSE2(cpu_flags)) { >> c->hevc_v_loop_filter_chroma = >> ff_hevc_v_loop_filter_chroma_12_sse2; >> c->hevc_h_loop_filter_chroma = >> ff_hevc_h_loop_filter_chroma_12_sse2; >> - if (ARCH_X86_64) { >> - c->hevc_v_loop_filter_luma = >> ff_hevc_v_loop_filter_luma_12_sse2; >> - c->hevc_h_loop_filter_luma = >> ff_hevc_h_loop_filter_luma_12_sse2; >> - } >> + // FIXME: 12-bit luma deblock over/underflows in some cases >> + // if (ARCH_X86_64) { >> + // c->hevc_v_loop_filter_luma = >> ff_hevc_v_loop_filter_luma_12_sse2; >> + // c->hevc_h_loop_filter_luma = >> ff_hevc_h_loop_filter_luma_12_sse2; >> + // } >> SAO_BAND_INIT(12, sse2); >> SAO_EDGE_INIT(12, sse2); >> > Hi Dekker, > VVC will utilize this function as well. > Could you please share the HEVC clip or data that caused the overflow? > We'll make efforts to address it during the VVC porting > You can just run ./tests/checkasm/checkasm --test=hevc_deblock to find a failing case. My guess is that delta0 overflows before the right shift, see the ARM64 asm which specfically widens this calculation on 12 bit variant but I'm not 100%, I don't know x86 asm. -- jd _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock 2024-02-24 9:49 ` J. Dekker @ 2024-02-24 10:46 ` Martin Storsjö 2024-02-24 18:32 ` J. Dekker 0 siblings, 1 reply; 10+ messages in thread From: Martin Storsjö @ 2024-02-24 10:46 UTC (permalink / raw) To: FFmpeg development discussions and patches On Sat, 24 Feb 2024, J. Dekker wrote: > > Nuo Mi <nuomi2021@gmail.com> writes: > >> On Wed, Feb 21, 2024 at 7:10 PM J. Dekker <jdek@itanimul.li> wrote: >> >>> Over/underflow in some cases. >>> >>> Signed-off-by: J. Dekker <jdek@itanimul.li> >>> --- >>> libavcodec/x86/hevcdsp_init.c | 9 +++++---- >>> 1 file changed, 5 insertions(+), 4 deletions(-) >>> >>> diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c >>> index 31e81eb11f..11cb1b3bfd 100644 >>> --- a/libavcodec/x86/hevcdsp_init.c >>> +++ b/libavcodec/x86/hevcdsp_init.c >>> @@ -1205,10 +1205,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const >>> int bit_depth) >>> if (EXTERNAL_SSE2(cpu_flags)) { >>> c->hevc_v_loop_filter_chroma = >>> ff_hevc_v_loop_filter_chroma_12_sse2; >>> c->hevc_h_loop_filter_chroma = >>> ff_hevc_h_loop_filter_chroma_12_sse2; >>> - if (ARCH_X86_64) { >>> - c->hevc_v_loop_filter_luma = >>> ff_hevc_v_loop_filter_luma_12_sse2; >>> - c->hevc_h_loop_filter_luma = >>> ff_hevc_h_loop_filter_luma_12_sse2; >>> - } >>> + // FIXME: 12-bit luma deblock over/underflows in some cases >>> + // if (ARCH_X86_64) { >>> + // c->hevc_v_loop_filter_luma = >>> ff_hevc_v_loop_filter_luma_12_sse2; >>> + // c->hevc_h_loop_filter_luma = >>> ff_hevc_h_loop_filter_luma_12_sse2; >>> + // } >>> SAO_BAND_INIT(12, sse2); >>> SAO_EDGE_INIT(12, sse2); >>> >> Hi Dekker, >> VVC will utilize this function as well. >> Could you please share the HEVC clip or data that caused the overflow? >> We'll make efforts to address it during the VVC porting >> > > You can just run ./tests/checkasm/checkasm --test=hevc_deblock to > find a failing case. To clarify, this is with the new checkasm test added in this patchset, not currently in git master - otherwise fate would be failing for everybody on x86. > My guess is that delta0 overflows before the right > shift, see the ARM64 asm which specfically widens this calculation on 12 > bit variant but I'm not 100%, I don't know x86 asm. Are you sure the input is within valid range? It's always possible that checkasm produces inputs that the real decoder wouldn't - but it's also possible that this is a real decoder bug that just hasn't been triggered by any other test yet. // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock 2024-02-24 10:46 ` Martin Storsjö @ 2024-02-24 18:32 ` J. Dekker 0 siblings, 0 replies; 10+ messages in thread From: J. Dekker @ 2024-02-24 18:32 UTC (permalink / raw) To: ffmpeg-devel Martin Storsjö <martin@martin.st> writes: > [...] > > Are you sure the input is within valid range? It's always possible that > checkasm produces inputs that the real decoder wouldn't - but it's also > possible that this is a real decoder bug that just hasn't been triggered by any > other test yet. > > // Martin The checkasm was just written to just to trigger all the theoretical edgecases. I know there is a decent range of values which pass the d0 + d3 < beta check and overflow in (9 * (q0 - p0) - 3 * (q1 - p1) + 8) for int16_t. I'm not 100% sure that these values can be output by the decoder, and even if so they're rare. -- jd _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON 2024-02-21 11:10 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_deblock: add luma and chroma full J. Dekker 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker @ 2024-02-21 11:10 ` J. Dekker 2024-02-21 12:08 ` Martin Storsjö 1 sibling, 1 reply; 10+ messages in thread From: J. Dekker @ 2024-02-21 11:10 UTC (permalink / raw) To: ffmpeg-devel Benched using single-threaded full decode on an Ampere Altra. Bpp Before After Speedup 8 73,3s 65,2s 1.124x 10 114,2s 104,0s 1.098x 12 125,8s 115,7s 1.087x Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/aarch64/hevcdsp_deblock_neon.S | 421 ++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 + 2 files changed, 439 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S index 8227f65649..1995f21807 100644 --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S @@ -181,3 +181,424 @@ hevc_h_loop_filter_chroma 12 hevc_v_loop_filter_chroma 8 hevc_v_loop_filter_chroma 10 hevc_v_loop_filter_chroma 12 + +.macro hevc_loop_filter_luma_body bitdepth +function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0 +.if \bitdepth > 8 + lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8 +.else + uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + uxtl v4.8h, v4.8b + uxtl v5.8h, v5.8b + uxtl v6.8h, v6.8b + uxtl v7.8h, v7.8b +.endif + ldr w7, [x3] // tc[0] + ldr w8, [x3, #4] // tc[1] + dup v18.4h, w7 + dup v19.4h, w8 + trn1 v18.2d, v18.2d, v19.2d +.if \bitdepth > 8 + shl v18.8h, v18.8h, #(\bitdepth - 8) +.endif + dup v27.8h, w2 // beta + // tc25 + shl v19.8h, v18.8h, #2 // * 4 + add v19.8h, v19.8h, v18.8h // (tc * 5) + srshr v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1 + sshr v17.8h, v27.8h, #2 // beta2 + + ////// beta_2 check + // dp0 = abs(P2 - 2 * P1 + P0) + add v22.8h, v3.8h, v1.8h + shl v23.8h, v2.8h, #1 + sabd v30.8h, v22.8h, v23.8h + // dq0 = abs(Q2 - 2 * Q1 + Q0) + add v21.8h, v6.8h, v4.8h + shl v26.8h, v5.8h, #1 + sabd v31.8h, v21.8h, v26.8h + // d0 = dp0 + dq0 + add v20.8h, v30.8h, v31.8h + shl v25.8h, v20.8h, #1 + // (d0 << 1) < beta_2 + cmgt v23.8h, v17.8h, v25.8h + + ////// beta check + // d0 + d3 < beta + mov x9, #0xFFFF00000000FFFF + dup v24.2d, x9 + and v25.16b, v24.16b, v20.16b + addp v25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1 + addp v25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1 + cmgt v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1] + mov w9, v25.s[0] + cmp w9, #0 + sxtl v26.4s, v25.4h + sxtl v16.2d, v26.2s // full skip mask + b.eq 3f // skip both blocks + + // TODO: we can check the full skip mask with the weak/strong mask to + // potentially skip weak or strong calculation entirely if we only have one + + ////// beta_3 check + // abs(P3 - P0) + abs(Q3 - Q0) < beta_3 + sshr v17.8h, v17.8h, #1 // beta_3 + sabd v20.8h, v0.8h, v3.8h + saba v20.8h, v7.8h, v4.8h + cmgt v21.8h, v17.8h, v20.8h + + and v23.16b, v23.16b, v21.16b + + ////// tc25 check + // abs(P0 - Q0) < tc25 + sabd v20.8h, v3.8h, v4.8h + cmgt v21.8h, v19.8h, v20.8h + + and v23.16b, v23.16b, v21.16b + + ////// Generate low/high line max from lines 0/3/4/7 + // mask out lines 2/3/5/6 + not v20.16b, v24.16b // 0x0000FFFFFFFF0000 + orr v23.16b, v23.16b, v20.16b + + // generate weak/strong mask + uminp v23.8h, v23.8h, v23.8h // extend to singles + sxtl v23.4s, v23.4h + uminp v26.4s, v23.4s, v23.4s // check lines + // extract to gpr + ext v25.16b, v26.16b, v26.16b, #2 + zip1 v17.4s, v26.4s, v26.4s + mov w12, v25.s[0] + mov w11, #0x0000FFFF + mov w13, #0xFFFF0000 + // FFFF FFFF -> strong strong + // FFFF 0000 -> strong weak + // 0000 FFFF -> weak strong + // 0000 0000 -> weak weak + cmp w12, w13 + b.hi 0f // only strong/strong, skip weak nd_p/nd_q calc + + ////// weak nd_p/nd_q + // d0+d3 + and v30.16b, v30.16b, v24.16b // d0 __ __ d3 d4 __ __ d7 + and v31.16b, v31.16b, v24.16b + addp v30.8h, v30.8h, v30.8h // [d0+__ __+d3 d4+__ __+d7] [ ... ] + addp v31.8h, v31.8h, v31.8h // [d0+d3 d4+d7] + addp v30.4h, v30.4h, v30.4h + addp v31.4h, v31.4h, v31.4h + + // ((beta + (beta >> 1)) >> 3) + sshr v21.8h, v27.8h, #1 + add v21.8h, v21.8h, v27.8h + sshr v21.8h, v21.8h, #3 + + // nd_p = dp0 + dp3 < ((beta + (beta >> 1)) >> 3) + cmgt v30.8h, v21.8h, v30.8h + // nd_q = dq0 + dq3 < ((beta + (beta >> 1)) >> 3) + cmgt v31.8h, v21.8h, v31.8h + + sxtl v30.4s, v30.4h + sxtl v31.4s, v31.4h + sxtl v28.2d, v30.2s + sxtl v29.2d, v31.2s + + cmp w12, w11 + b.lo 1f // can only be weak weak, skip strong + +0: // STRONG FILTER + + // P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc3, tc3); + add v21.8h, v2.8h, v3.8h // (p1 + p0 + add v21.8h, v4.8h, v21.8h // + q0) + shl v21.8h, v21.8h, #1 // * 2 + add v22.8h, v1.8h, v5.8h // (p2 + q1) + add v21.8h, v22.8h, v21.8h // + + srshr v21.8h, v21.8h, #3 // >> 3 + sub v21.8h, v21.8h, v3.8h // - p0 + + // P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); + + add v22.8h, v1.8h, v2.8h + add v23.8h, v3.8h, v4.8h + add v22.8h, v22.8h, v23.8h + srshr v22.8h, v22.8h, #2 + sub v22.8h, v22.8h, v2.8h + + // P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc, tc); + + add v23.8h, v0.8h, v1.8h // p3 + p2 + add v24.8h, v3.8h, v4.8h // p0 + q0 + shl v23.8h, v23.8h, #1 // * 2 + add v23.8h, v23.8h, v24.8h + add v24.8h, v1.8h, v2.8h // p2 + p1 + add v23.8h, v23.8h, v24.8h + srshr v23.8h, v23.8h, #3 + sub v23.8h, v23.8h, v1.8h + + // Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc3, tc3); + add v24.8h, v3.8h, v4.8h // (p0 + q0 + add v24.8h, v5.8h, v24.8h // + q1) + shl v24.8h, v24.8h, #1 // * 2 + add v25.8h, v2.8h, v6.8h // (p1 + q2) + add v24.8h, v25.8h, v24.8h // + + srshr v24.8h, v24.8h, #3 // >> 3 + sub v24.8h, v24.8h, v4.8h // - q0 + + // Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); + + add v25.8h, v6.8h, v5.8h + add v26.8h, v3.8h, v4.8h + add v25.8h, v25.8h, v26.8h + srshr v25.8h, v25.8h, #2 + sub v25.8h, v25.8h, v5.8h + + // Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc, tc); + + add v26.8h, v7.8h, v6.8h + add v27.8h, v6.8h, v5.8h + shl v26.8h, v26.8h, #1 + add v26.8h, v26.8h, v27.8h + add v27.8h, v3.8h, v4.8h + add v26.8h, v26.8h, v27.8h + srshr v26.8h, v26.8h, #3 + sub v26.8h, v26.8h, v6.8h + + // this clip should work properly + shl v30.8h, v18.8h, #1 // tc2 + neg v31.8h, v30.8h // -tc2 + clip v31.8h, v30.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h + + and v21.16b, v21.16b, v16.16b + and v22.16b, v22.16b, v16.16b + and v23.16b, v23.16b, v16.16b + and v24.16b, v24.16b, v16.16b + and v25.16b, v25.16b, v16.16b + and v26.16b, v26.16b, v16.16b + + add v23.8h, v23.8h, v1.8h // careful + add v22.8h, v22.8h, v2.8h + add v21.8h, v21.8h, v3.8h + add v24.8h, v24.8h, v4.8h + add v25.8h, v25.8h, v5.8h + add v26.8h, v26.8h, v6.8h + + cmp w12, w13 + b.hi 2f // only strong/strong, skip weak + +1: // WEAK FILTER + + // delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 +.if \bitdepth < 12 + sub v27.8h, v4.8h, v3.8h // q0 - p0 + shl v30.8h, v27.8h, #3 // * 8 + add v27.8h, v27.8h, v30.8h // 9 * (q0 - p0) + + sub v30.8h, v5.8h, v2.8h // q1 - p1 + shl v31.8h, v30.8h, #1 // * 2 + + sub v27.8h, v27.8h, v31.8h + sub v27.8h, v27.8h, v30.8h // - 3 * (q1 - p1) + srshr v27.8h, v27.8h, #4 +.else + ssubl v19.4s, v4.4h, v3.4h // q0 - p0 + ssubl2 v20.4s, v4.8h, v3.8h + + shl v30.4s, v19.4s, #3 // * 8 + shl v31.4s, v20.4s, #3 + + add v30.4s, v30.4s, v19.4s // 9 * (q0 - p0) + add v31.4s, v31.4s, v20.4s + + ssubl v19.4s, v5.4h, v2.4h // q1 - p1 + ssubl2 v20.4s, v5.8h, v2.8h + + sub v30.4s, v30.4s, v19.4s + sub v31.4s, v31.4s, v20.4s + + shl v19.4s, v19.4s, #1 + shl v20.4s, v20.4s, #1 + + sub v30.4s, v30.4s, v19.4s + sub v31.4s, v31.4s, v20.4s + + sqrshrn v27.4h, v30.4s, #4 + sqrshrn2 v27.8h, v31.4s, #4 +.endif + + // delta0 10tc check mask + shl v30.8h, v18.8h, #1 // * 2 + shl v31.8h, v18.8h, #3 // * 8 + add v30.8h, v30.8h, v31.8h // 10 * tc + abs v31.8h, v27.8h + cmgt v20.8h, v30.8h, v31.8h // abs(delta0) < 10 * tc + + and v20.16b, v20.16b, v16.16b // combine with full mask + + neg v31.8h, v18.8h // -tc + clip v31.8h, v18.8h, v27.8h // delta0 = av_clip(delta0, -tc, tc) + + // deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2) + add v30.8h, v1.8h, v3.8h + srshr v30.8h, v30.8h, #1 + sub v30.8h, v30.8h, v2.8h + add v30.8h, v30.8h, v27.8h + sshr v30.8h, v30.8h, #1 + + // p3 p2 p1 p0 q0 q1 q2 q3 + // v0 v1 v2 v3 v4 v5 v6 v7 + + // deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); + add v31.8h, v6.8h, v4.8h + srshr v31.8h, v31.8h, #1 + sub v31.8h, v31.8h, v5.8h + sub v31.8h, v31.8h, v27.8h + sshr v31.8h, v31.8h, #1 + + // apply nd_p nd_q mask to deltap1/deltaq1 + and v30.16b, v30.16b, v28.16b + and v31.16b, v31.16b, v29.16b + + // apply full skip mask to deltap1/deltaq1/delta0 + and v30.16b, v30.16b, v20.16b + and v27.16b, v27.16b, v20.16b + and v31.16b, v31.16b, v20.16b + + // clip P1/Q1 to -tc_2, tc_2 + sshr v18.8h, v18.8h, #1 // tc2 + neg v28.8h, v18.8h + clip v28.8h, v18.8h, v30.8h, v31.8h + + // P0 = av_clip_pixel(p0 + delta0) + // Q0 = av_clip_pixel(q0 - delta0) + add v29.8h, v3.8h, v27.8h // P0 + sub v27.8h, v4.8h, v27.8h // Q0 + + // P1 = av_clip_pixel(p1 + deltap1) + // Q1 = av_clip_pixel(q1 + deltaq1) + add v30.8h, v2.8h, v30.8h // P1 + add v31.8h, v5.8h, v31.8h // Q1 + +2: // MIX WEAK/STRONG + + mov v19.16b, v1.16b + mov v20.16b, v6.16b + // copy selection mask + mov v1.16b, v17.16b + mov v2.16b, v17.16b + mov v3.16b, v17.16b + mov v4.16b, v17.16b + mov v5.16b, v17.16b + mov v6.16b, v17.16b + // select + bsl v1.16b, v23.16b, v19.16b // P2 strong/orig + bsl v2.16b, v22.16b, v30.16b // P1 strong/weak + bsl v3.16b, v21.16b, v29.16b // P0 strong/weak + bsl v4.16b, v24.16b, v27.16b // Q0 strong/weak + bsl v5.16b, v25.16b, v31.16b // Q1 strong/weak + bsl v6.16b, v26.16b, v20.16b // Q2 strong/orig + // NOTE: Q3/P3 are unchanged + +.if \bitdepth > 8 + movi v19.8h, #0 + dup v20.8h, w14 + clip v19.8h, v20.8h, v1.8h, v2.8h, v3.8h, v4.8h, v5.8h, v6.8h +.else + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + sqxtun v5.8b, v5.8h + sqxtun v6.8b, v6.8h + sqxtun v7.8b, v7.8h +.endif + ret +3: ret x6 +endfunc +.endm + +hevc_loop_filter_luma_body 8 +hevc_loop_filter_luma_body 10 +hevc_loop_filter_luma_body 12 + +// hevc_v_loop_filter_luma(uint8_t *pix, ptrdiff_t stride, int beta, const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q) + +.macro hevc_loop_filter_luma dir bitdepth +function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1 + mov x6, x30 +.if \dir == v +.if \bitdepth > 8 + sub x0, x0, #8 +.else + sub x0, x0, #4 +.endif +.else + sub x0, x0, x1, lsl #2 // -4 * xstride, idk +.endif + mov x10, x0 +.if \bitdepth > 8 + ld1 {v0.8h}, [x0], x1 + ld1 {v1.8h}, [x0], x1 + ld1 {v2.8h}, [x0], x1 + ld1 {v3.8h}, [x0], x1 + ld1 {v4.8h}, [x0], x1 + ld1 {v5.8h}, [x0], x1 + ld1 {v6.8h}, [x0], x1 + ld1 {v7.8h}, [x0] + mov w14, #((1 << \bitdepth) - 1) +.if \dir == v + transpose_8x8H v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 +.endif +.else + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x0], x1 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x0], x1 + ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x0] +.if \dir == v + transpose_8x8B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 +.endif +.endif + bl hevc_loop_filter_luma_body_\bitdepth\()_neon +.if \bitdepth > 8 +.if \dir == v + transpose_8x8H v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 +.endif + st1 {v0.8h}, [x10], x1 + st1 {v1.8h}, [x10], x1 + st1 {v2.8h}, [x10], x1 + st1 {v3.8h}, [x10], x1 + st1 {v4.8h}, [x10], x1 + st1 {v5.8h}, [x10], x1 + st1 {v6.8h}, [x10], x1 + st1 {v7.8h}, [x10] +.else +.if \dir == v + transpose_8x8B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 +.endif + st1 {v0.8b}, [x10], x1 + st1 {v1.8b}, [x10], x1 + st1 {v2.8b}, [x10], x1 + st1 {v3.8b}, [x10], x1 + st1 {v4.8b}, [x10], x1 + st1 {v5.8b}, [x10], x1 + st1 {v6.8b}, [x10], x1 + st1 {v7.8b}, [x10] +.endif + ret x6 +endfunc +.endm + +hevc_loop_filter_luma h 8 +hevc_loop_filter_luma h 10 +hevc_loop_filter_luma h 12 + +hevc_loop_filter_luma v 8 +hevc_loop_filter_luma v 10 +hevc_loop_filter_luma v 12 diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 687b6cc5c3..04692aa98e 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -38,6 +38,18 @@ void ff_hevc_h_loop_filter_chroma_10_neon(uint8_t *_pix, ptrdiff_t _stride, const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); void ff_hevc_h_loop_filter_chroma_12_neon(uint8_t *_pix, ptrdiff_t _stride, const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_v_loop_filter_luma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_v_loop_filter_luma_10_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_v_loop_filter_luma_12_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_luma_8_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_luma_10_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); +void ff_hevc_h_loop_filter_luma_12_neon(uint8_t *_pix, ptrdiff_t _stride, int beta, + const int *_tc, const uint8_t *_no_p, const uint8_t *_no_q); void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs, @@ -291,6 +303,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) if (!have_neon(cpu_flags)) return; if (bit_depth == 8) { + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_neon; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_neon; c->add_residual[0] = ff_hevc_add_residual_4x4_8_neon; @@ -379,6 +393,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) } if (bit_depth == 10) { + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_neon; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_neon; c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; @@ -395,6 +411,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; } if (bit_depth == 12) { + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_neon; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_neon; c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon; -- 2.43.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON J. Dekker @ 2024-02-21 12:08 ` Martin Storsjö 0 siblings, 0 replies; 10+ messages in thread From: Martin Storsjö @ 2024-02-21 12:08 UTC (permalink / raw) To: FFmpeg development discussions and patches On Wed, 21 Feb 2024, J. Dekker wrote: > Benched using single-threaded full decode on an Ampere Altra. > > Bpp Before After Speedup > 8 73,3s 65,2s 1.124x > 10 114,2s 104,0s 1.098x > 12 125,8s 115,7s 1.087x > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/aarch64/hevcdsp_deblock_neon.S | 421 ++++++++++++++++++++++ > libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 + > 2 files changed, 439 insertions(+) > +0: // STRONG FILTER > + > + // P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc3, tc3); > + add v21.8h, v2.8h, v3.8h // (p1 + p0 > + add v21.8h, v4.8h, v21.8h // + q0) > + shl v21.8h, v21.8h, #1 // * 2 > + add v22.8h, v1.8h, v5.8h // (p2 + q1) > + add v21.8h, v22.8h, v21.8h // + > + srshr v21.8h, v21.8h, #3 // >> 3 > + sub v21.8h, v21.8h, v3.8h // - p0 > + The srshr line is incorrectly indented here (and elsewhere) > + sqxtun v4.8b, v4.8h > + sqxtun v5.8b, v5.8h > + sqxtun v6.8b, v6.8h > + sqxtun v7.8b, v7.8h > +.endif > + ret > +3: ret x6 Please indent the "x6" here like other operands > +.macro hevc_loop_filter_luma dir bitdepth > +function ff_hevc_\dir\()_loop_filter_luma_\bitdepth\()_neon, export=1 > + mov x6, x30 > +.if \dir == v In GAS assembler, .if does a numerical comparison - it can't do string comparisons. The right way to do this is to do ".ifc \dir, v", which does a string comparison. (If you really do need to do this like a numerical comparison, it's possible to define e.g. "v" as a numeric symbol as well, see e.g. https://code.videolan.org/videolan/dav1d/-/merge_requests/1603/diffs?commit_id=d4746c908c56cb2e8545efd348b8cdc13f2f2253 but that's not really the nicest way to do it.) This issue breaks compilation with Clang. With gas-preprocessor (for MSVC), it manages to build correctly, but does the wrong thing. To avoid me having to test all these build configurations manually, remembering to check all these corner case build configurations and check indentation and all, I've set up a PoC for testing such things on Github Actions. If you have a repo on github, grab my commits from https://github.com/mstorsjo/FFmpeg/commits/gha-aarch64 (there are a couple of them), add your changes on top of these, and push it as a branch to your own github repo, then check the output from the actions. Here's the output of a run with the patches you just posted: https://github.com/mstorsjo/FFmpeg/actions/runs/7988312683 // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2024-02-24 19:04 UTC | newest] Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2024-02-21 11:10 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_deblock: add luma and chroma full J. Dekker 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 2/3] avcodec/x86: disable hevc 12b luma deblock J. Dekker 2024-02-23 3:14 ` Andreas Rheinhardt 2024-02-24 9:54 ` J. Dekker 2024-02-23 12:45 ` Nuo Mi 2024-02-24 9:49 ` J. Dekker 2024-02-24 10:46 ` Martin Storsjö 2024-02-24 18:32 ` J. Dekker 2024-02-21 11:10 ` [FFmpeg-devel] [PATCH 3/3] avcodec/aarch64: add hevc deblock NEON J. Dekker 2024-02-21 12:08 ` Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git