* [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test @ 2022-06-23 18:04 J. Dekker 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker ` (2 more replies) 0 siblings, 3 replies; 12+ messages in thread From: J. Dekker @ 2022-06-23 18:04 UTC (permalink / raw) To: ffmpeg-devel Signed-off-by: J. Dekker <jdek@itanimul.li> --- tests/checkasm/hevc_add_res.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/checkasm/hevc_add_res.c b/tests/checkasm/hevc_add_res.c index 0c896adaca..f17d121939 100644 --- a/tests/checkasm/hevc_add_res.c +++ b/tests/checkasm/hevc_add_res.c @@ -36,14 +36,14 @@ } \ } while (0) -#define randomize_buffers2(buf, size) \ +#define randomize_buffers2(buf, size, mask) \ do { \ int j; \ for (j = 0; j < size; j++) \ - AV_WN16A(buf + j * 2, rnd() & 0x3FF); \ + AV_WN16A(buf + j * 2, rnd() & mask); \ } while (0) -static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) +static void compare_add_res(int size, ptrdiff_t stride, int overflow_test, int mask) { LOCAL_ALIGNED_32(int16_t, res0, [32 * 32]); LOCAL_ALIGNED_32(int16_t, res1, [32 * 32]); @@ -53,7 +53,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *res, ptrdiff_t stride); randomize_buffers(res0, size); - randomize_buffers2(dst0, size); + randomize_buffers2(dst0, size, mask); if (overflow_test) res0[0] = 0x8000; memcpy(res1, res0, sizeof(*res0) * size); @@ -69,6 +69,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) static void check_add_res(HEVCDSPContext h, int bit_depth) { int i; + int mask = bit_depth == 8 ? 0xFFFF : bit_depth == 10 ? 
0x03FF : 0x07FF; for (i = 2; i <= 5; i++) { int block_size = 1 << i; @@ -76,9 +77,9 @@ static void check_add_res(HEVCDSPContext h, int bit_depth) ptrdiff_t stride = block_size << (bit_depth > 8); if (check_func(h.add_residual[i - 2], "hevc_add_res_%dx%d_%d", block_size, block_size, bit_depth)) { - compare_add_res(size, stride, 0); + compare_add_res(size, stride, 0, mask); // overflow test for res = -32768 - compare_add_res(size, stride, 1); + compare_add_res(size, stride, 1, mask); } } } @@ -87,7 +88,7 @@ void checkasm_check_hevc_add_res(void) { int bit_depth; - for (bit_depth = 8; bit_depth <= 10; bit_depth++) { + for (bit_depth = 8; bit_depth <= 12; bit_depth++) { HEVCDSPContext h; ff_hevc_dsp_init(&h, bit_depth); -- 2.32.0 (Apple Git-132) _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs 2022-06-23 18:04 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test J. Dekker @ 2022-06-23 18:04 ` J. Dekker 2022-08-09 11:04 ` Martin Storsjö 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants J. Dekker 2022-08-09 11:02 ` [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test Martin Storsjö 2 siblings, 1 reply; 12+ messages in thread From: J. Dekker @ 2022-06-23 18:04 UTC (permalink / raw) To: ffmpeg-devel Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/aarch64/hevcdsp_idct_neon.S | 216 ++++++++++++------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 0869431294..484eea8437 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -27,21 +27,21 @@ #include "libavutil/aarch64/asm.S" const trans, align=4 - .short 64, 83, 64, 36 - .short 89, 75, 50, 18 - .short 90, 87, 80, 70 - .short 57, 43, 25, 9 - .short 90, 90, 88, 85 - .short 82, 78, 73, 67 - .short 61, 54, 46, 38 - .short 31, 22, 13, 4 + .short 64, 83, 64, 36 + .short 89, 75, 50, 18 + .short 90, 87, 80, 70 + .short 57, 43, 25, 9 + .short 90, 90, 88, 85 + .short 82, 78, 73, 67 + .short 61, 54, 46, 38 + .short 31, 22, 13, 4 endconst .macro clip10 in1, in2, c1, c2 - smax \in1, \in1, \c1 - smax \in2, \in2, \c1 - smin \in1, \in1, \c2 - smin \in2, \in2, \c2 + smax \in1, \in1, \c1 + smax \in2, \in2, \c1 + smin \in1, \in1, \c2 + smin \in2, \in2, \c2 .endm function ff_hevc_add_residual_4x4_8_neon, export=1 @@ -50,13 +50,13 @@ function ff_hevc_add_residual_4x4_8_neon, export=1 ld1 {v2.s}[1], [x0], x2 ld1 {v2.s}[2], [x0], x2 ld1 {v2.s}[3], [x0], x2 - sub x0, x0, x2, lsl #2 - uxtl v6.8h, v2.8b - uxtl2 v7.8h, v2.16b - sqadd v0.8h, v0.8h, v6.8h - sqadd v1.8h, v1.8h, v7.8h - sqxtun v0.8b, v0.8h - sqxtun2 v0.16b, v1.8h + sub x0, x0, x2, 
lsl #2 + uxtl v6.8h, v2.8b + uxtl2 v7.8h, v2.16b + sqadd v0.8h, v0.8h, v6.8h + sqadd v1.8h, v1.8h, v7.8h + sqxtun v0.8b, v0.8h + sqxtun2 v0.16b, v1.8h st1 {v0.s}[0], [x0], x2 st1 {v0.s}[1], [x0], x2 st1 {v0.s}[2], [x0], x2 @@ -70,63 +70,63 @@ function ff_hevc_add_residual_4x4_10_neon, export=1 ld1 {v2.d}[0], [x12], x2 ld1 {v2.d}[1], [x12], x2 ld1 {v3.d}[0], [x12], x2 - sqadd v0.8h, v0.8h, v2.8h + sqadd v0.8h, v0.8h, v2.8h ld1 {v3.d}[1], [x12], x2 - movi v4.8h, #0 - sqadd v1.8h, v1.8h, v3.8h - mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF - clip10 v0.8h, v1.8h, v4.8h, v5.8h - st1 {v0.d}[0], [x0], x2 - st1 {v0.d}[1], [x0], x2 - st1 {v1.d}[0], [x0], x2 - st1 {v1.d}[1], [x0], x2 + movi v4.8h, #0 + sqadd v1.8h, v1.8h, v3.8h + mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF + clip10 v0.8h, v1.8h, v4.8h, v5.8h + st1 {v0.d}[0], [x0], x2 + st1 {v0.d}[1], [x0], x2 + st1 {v1.d}[0], [x0], x2 + st1 {v1.d}[1], [x0], x2 ret endfunc function ff_hevc_add_residual_8x8_8_neon, export=1 - add x12, x0, x2 - add x2, x2, x2 - mov x3, #8 -1: subs x3, x3, #2 - ld1 {v2.d}[0], [x0] - ld1 {v2.d}[1], [x12] - uxtl v3.8h, v2.8b + add x12, x0, x2 + add x2, x2, x2 + mov x3, #8 +1: subs x3, x3, #2 + ld1 {v2.d}[0], [x0] + ld1 {v2.d}[1], [x12] + uxtl v3.8h, v2.8b ld1 {v0.8h-v1.8h}, [x1], #32 - uxtl2 v2.8h, v2.16b - sqadd v0.8h, v0.8h, v3.8h - sqadd v1.8h, v1.8h, v2.8h - sqxtun v0.8b, v0.8h - sqxtun2 v0.16b, v1.8h - st1 {v0.d}[0], [x0], x2 - st1 {v0.d}[1], [x12], x2 - bne 1b + uxtl2 v2.8h, v2.16b + sqadd v0.8h, v0.8h, v3.8h + sqadd v1.8h, v1.8h, v2.8h + sqxtun v0.8b, v0.8h + sqxtun2 v0.16b, v1.8h + st1 {v0.d}[0], [x0], x2 + st1 {v0.d}[1], [x12], x2 + bne 1b ret endfunc function ff_hevc_add_residual_8x8_10_neon, export=1 - add x12, x0, x2 - add x2, x2, x2 - mov x3, #8 - movi v4.8h, #0 - mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF -1: subs x3, x3, #2 + add x12, x0, x2 + add x2, x2, x2 + mov x3, #8 + movi v4.8h, #0 + mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF +1: subs x3, x3, #2 ld1 {v0.8h-v1.8h}, [x1], #32 - ld1 
{v2.8h}, [x0] - sqadd v0.8h, v0.8h, v2.8h - ld1 {v3.8h}, [x12] - sqadd v1.8h, v1.8h, v3.8h - clip10 v0.8h, v1.8h, v4.8h, v5.8h - st1 {v0.8h}, [x0], x2 - st1 {v1.8h}, [x12], x2 - bne 1b + ld1 {v2.8h}, [x0] + sqadd v0.8h, v0.8h, v2.8h + ld1 {v3.8h}, [x12] + sqadd v1.8h, v1.8h, v3.8h + clip10 v0.8h, v1.8h, v4.8h, v5.8h + st1 {v0.8h}, [x0], x2 + st1 {v1.8h}, [x12], x2 + bne 1b ret endfunc function ff_hevc_add_residual_16x16_8_neon, export=1 - mov x3, #16 + mov x3, #16 add x12, x0, x2 - add x2, x2, x2 -1: subs x3, x3, #2 + add x2, x2, x2 +1: subs x3, x3, #2 ld1 {v16.16b}, [x0] ld1 {v0.8h-v3.8h}, [x1], #64 ld1 {v19.16b}, [x12] @@ -134,47 +134,47 @@ function ff_hevc_add_residual_16x16_8_neon, export=1 uxtl2 v18.8h, v16.16b uxtl v20.8h, v19.8b uxtl2 v21.8h, v19.16b - sqadd v0.8h, v0.8h, v17.8h - sqadd v1.8h, v1.8h, v18.8h - sqadd v2.8h, v2.8h, v20.8h - sqadd v3.8h, v3.8h, v21.8h - sqxtun v0.8b, v0.8h + sqadd v0.8h, v0.8h, v17.8h + sqadd v1.8h, v1.8h, v18.8h + sqadd v2.8h, v2.8h, v20.8h + sqadd v3.8h, v3.8h, v21.8h + sqxtun v0.8b, v0.8h sqxtun2 v0.16b, v1.8h - sqxtun v1.8b, v2.8h + sqxtun v1.8b, v2.8h sqxtun2 v1.16b, v3.8h st1 {v0.16b}, [x0], x2 st1 {v1.16b}, [x12], x2 - bne 1b + bne 1b ret endfunc function ff_hevc_add_residual_16x16_10_neon, export=1 - mov x3, #16 + mov x3, #16 movi v20.8h, #0 mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF add x12, x0, x2 - add x2, x2, x2 -1: subs x3, x3, #2 + add x2, x2, x2 +1: subs x3, x3, #2 ld1 {v16.8h-v17.8h}, [x0] - ld1 {v0.8h-v3.8h}, [x1], #64 - sqadd v0.8h, v0.8h, v16.8h + ld1 {v0.8h-v3.8h}, [x1], #64 + sqadd v0.8h, v0.8h, v16.8h ld1 {v18.8h-v19.8h}, [x12] - sqadd v1.8h, v1.8h, v17.8h - sqadd v2.8h, v2.8h, v18.8h - sqadd v3.8h, v3.8h, v19.8h - clip10 v0.8h, v1.8h, v20.8h, v21.8h - clip10 v2.8h, v3.8h, v20.8h, v21.8h - st1 {v0.8h-v1.8h}, [x0], x2 - st1 {v2.8h-v3.8h}, [x12], x2 - bne 1b + sqadd v1.8h, v1.8h, v17.8h + sqadd v2.8h, v2.8h, v18.8h + sqadd v3.8h, v3.8h, v19.8h + clip10 v0.8h, v1.8h, v20.8h, v21.8h + clip10 v2.8h, v3.8h, 
v20.8h, v21.8h + st1 {v0.8h-v1.8h}, [x0], x2 + st1 {v2.8h-v3.8h}, [x12], x2 + bne 1b ret endfunc function ff_hevc_add_residual_32x32_8_neon, export=1 add x12, x0, x2 - add x2, x2, x2 - mov x3, #32 -1: subs x3, x3, #2 + add x2, x2, x2 + mov x3, #32 +1: subs x3, x3, #2 ld1 {v20.16b, v21.16b}, [x0] uxtl v16.8h, v20.8b uxtl2 v17.8h, v20.16b @@ -187,43 +187,43 @@ function ff_hevc_add_residual_32x32_8_neon, export=1 uxtl2 v21.8h, v22.16b uxtl v22.8h, v23.8b uxtl2 v23.8h, v23.16b - sqadd v0.8h, v0.8h, v16.8h - sqadd v1.8h, v1.8h, v17.8h - sqadd v2.8h, v2.8h, v18.8h - sqadd v3.8h, v3.8h, v19.8h - sqadd v4.8h, v4.8h, v20.8h - sqadd v5.8h, v5.8h, v21.8h - sqadd v6.8h, v6.8h, v22.8h - sqadd v7.8h, v7.8h, v23.8h - sqxtun v0.8b, v0.8h + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v17.8h + sqadd v2.8h, v2.8h, v18.8h + sqadd v3.8h, v3.8h, v19.8h + sqadd v4.8h, v4.8h, v20.8h + sqadd v5.8h, v5.8h, v21.8h + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + sqxtun v0.8b, v0.8h sqxtun2 v0.16b, v1.8h - sqxtun v1.8b, v2.8h + sqxtun v1.8b, v2.8h sqxtun2 v1.16b, v3.8h - sqxtun v2.8b, v4.8h + sqxtun v2.8b, v4.8h sqxtun2 v2.16b, v5.8h - st1 {v0.16b, v1.16b}, [x0], x2 - sqxtun v3.8b, v6.8h + st1 {v0.16b, v1.16b}, [x0], x2 + sqxtun v3.8b, v6.8h sqxtun2 v3.16b, v7.8h st1 {v2.16b, v3.16b}, [x12], x2 - bne 1b + bne 1b ret endfunc function ff_hevc_add_residual_32x32_10_neon, export=1 - mov x3, #32 + mov x3, #32 movi v20.8h, #0 mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF -1: subs x3, x3, #1 - ld1 {v0.8h-v3.8h}, [x1], #64 +1: subs x3, x3, #1 + ld1 {v0.8h -v3.8h}, [x1], #64 ld1 {v16.8h-v19.8h}, [x0] - sqadd v0.8h, v0.8h, v16.8h - sqadd v1.8h, v1.8h, v17.8h - sqadd v2.8h, v2.8h, v18.8h - sqadd v3.8h, v3.8h, v19.8h - clip10 v0.8h, v1.8h, v20.8h, v21.8h - clip10 v2.8h, v3.8h, v20.8h, v21.8h - st1 {v0.8h-v3.8h}, [x0], x2 - bne 1b + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v17.8h + sqadd v2.8h, v2.8h, v18.8h + sqadd v3.8h, v3.8h, v19.8h + clip10 v0.8h, v1.8h, v20.8h, v21.8h + clip10 
v2.8h, v3.8h, v20.8h, v21.8h + st1 {v0.8h-v3.8h}, [x0], x2 + bne 1b ret endfunc -- 2.32.0 (Apple Git-132) _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker @ 2022-08-09 11:04 ` Martin Storsjö 0 siblings, 0 replies; 12+ messages in thread From: Martin Storsjö @ 2022-08-09 11:04 UTC (permalink / raw) To: FFmpeg development discussions and patches On Thu, 23 Jun 2022, J. Dekker wrote: > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/aarch64/hevcdsp_idct_neon.S | 216 ++++++++++++------------- > 1 file changed, 108 insertions(+), 108 deletions(-) LGTM, thanks! // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants 2022-06-23 18:04 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test J. Dekker 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker @ 2022-06-23 18:04 ` J. Dekker 2022-08-09 11:13 ` Martin Storsjö 2022-08-09 11:02 ` [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test Martin Storsjö 2 siblings, 1 reply; 12+ messages in thread From: J. Dekker @ 2022-06-23 18:04 UTC (permalink / raw) To: ffmpeg-devel hevc_add_res_4x4_12_c: 46.0 hevc_add_res_4x4_12_neon: 18.7 hevc_add_res_8x8_12_c: 194.7 hevc_add_res_8x8_12_neon: 25.2 hevc_add_res_16x16_12_c: 716.0 hevc_add_res_16x16_12_neon: 69.7 hevc_add_res_32x32_12_c: 3820.7 hevc_add_res_32x32_12_neon: 261.0 Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/aarch64/hevcdsp_idct_neon.S | 148 ++++++++++++---------- libavcodec/aarch64/hevcdsp_init_aarch64.c | 34 ++--- 2 files changed, 97 insertions(+), 85 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 484eea8437..413e225218 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -37,11 +37,11 @@ const trans, align=4 .short 31, 22, 13, 4 endconst -.macro clip10 in1, in2, c1, c2 - smax \in1, \in1, \c1 - smax \in2, \in2, \c1 - smin \in1, \in1, \c2 - smin \in2, \in2, \c2 +.macro clip2 in1, in2, min, max + smax \in1, \in1, \min + smax \in2, \in2, \min + smin \in1, \in1, \max + smin \in2, \in2, \max .endm function ff_hevc_add_residual_4x4_8_neon, export=1 @@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_4x4_10_neon, export=1 - mov x12, x0 - ld1 {v0.8h-v1.8h}, [x1] - ld1 {v2.d}[0], [x12], x2 - ld1 {v2.d}[1], [x12], x2 - ld1 {v3.d}[0], [x12], x2 - sqadd v0.8h, v0.8h, v2.8h - ld1 {v3.d}[1], [x12], x2 - movi v4.8h, #0 - sqadd v1.8h, v1.8h, v3.8h - mvni v5.8h, #0xFC, lsl #8 // 
movi #0x3FF - clip10 v0.8h, v1.8h, v4.8h, v5.8h - st1 {v0.d}[0], [x0], x2 - st1 {v0.d}[1], [x0], x2 - st1 {v1.d}[0], [x0], x2 - st1 {v1.d}[1], [x0], x2 - ret -endfunc - function ff_hevc_add_residual_8x8_8_neon, export=1 add x12, x0, x2 add x2, x2, x2 @@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_8x8_10_neon, export=1 - add x12, x0, x2 - add x2, x2, x2 - mov x3, #8 - movi v4.8h, #0 - mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF -1: subs x3, x3, #2 - ld1 {v0.8h-v1.8h}, [x1], #32 - ld1 {v2.8h}, [x0] - sqadd v0.8h, v0.8h, v2.8h - ld1 {v3.8h}, [x12] - sqadd v1.8h, v1.8h, v3.8h - clip10 v0.8h, v1.8h, v4.8h, v5.8h - st1 {v0.8h}, [x0], x2 - st1 {v1.8h}, [x12], x2 - bne 1b - ret -endfunc - function ff_hevc_add_residual_16x16_8_neon, export=1 mov x3, #16 add x12, x0, x2 @@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_16x16_10_neon, export=1 - mov x3, #16 - movi v20.8h, #0 - mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF - add x12, x0, x2 - add x2, x2, x2 -1: subs x3, x3, #2 - ld1 {v16.8h-v17.8h}, [x0] - ld1 {v0.8h-v3.8h}, [x1], #64 - sqadd v0.8h, v0.8h, v16.8h - ld1 {v18.8h-v19.8h}, [x12] - sqadd v1.8h, v1.8h, v17.8h - sqadd v2.8h, v2.8h, v18.8h - sqadd v3.8h, v3.8h, v19.8h - clip10 v0.8h, v1.8h, v20.8h, v21.8h - clip10 v2.8h, v3.8h, v20.8h, v21.8h - st1 {v0.8h-v1.8h}, [x0], x2 - st1 {v2.8h-v3.8h}, [x12], x2 - bne 1b - ret -endfunc - function ff_hevc_add_residual_32x32_8_neon, export=1 add x12, x0, x2 add x2, x2, x2 @@ -209,10 +149,76 @@ function ff_hevc_add_residual_32x32_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_32x32_10_neon, export=1 +.macro add_res bitdepth +.if \bitdepth == 10 +.set mask, 0xFC +.else +.set mask, 0xF0 +.endif +function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1 + mov x12, x0 + ld1 {v0.8h-v1.8h}, [x1] + ld1 {v2.d}[0], [x12], x2 + ld1 {v2.d}[1], [x12], x2 + ld1 {v3.d}[0], [x12], x2 + sqadd v0.8h, 
v0.8h, v2.8h + ld1 {v3.d}[1], [x12], x2 + movi v4.8h, #0 + sqadd v1.8h, v1.8h, v3.8h + mvni v5.8h, mask, lsl #8 + clip2 v0.8h, v1.8h, v4.8h, v5.8h + st1 {v0.d}[0], [x0], x2 + st1 {v0.d}[1], [x0], x2 + st1 {v1.d}[0], [x0], x2 + st1 {v1.d}[1], [x0], x2 + ret +endfunc + +function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1 + add x12, x0, x2 + add x2, x2, x2 + mov x3, #8 + movi v4.8h, #0 + mvni v5.8h, mask, lsl #8 +1: subs x3, x3, #2 + ld1 {v0.8h-v1.8h}, [x1], #32 + ld1 {v2.8h}, [x0] + sqadd v0.8h, v0.8h, v2.8h + ld1 {v3.8h}, [x12] + sqadd v1.8h, v1.8h, v3.8h + clip2 v0.8h, v1.8h, v4.8h, v5.8h + st1 {v0.8h}, [x0], x2 + st1 {v1.8h}, [x12], x2 + bne 1b + ret +endfunc + +function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1 + mov x3, #16 + movi v20.8h, #0 + mvni v21.8h, mask, lsl #8 + add x12, x0, x2 + add x2, x2, x2 +1: subs x3, x3, #2 + ld1 {v16.8h-v17.8h}, [x0] + ld1 {v0.8h-v3.8h}, [x1], #64 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v18.8h-v19.8h}, [x12] + sqadd v1.8h, v1.8h, v17.8h + sqadd v2.8h, v2.8h, v18.8h + sqadd v3.8h, v3.8h, v19.8h + clip2 v0.8h, v1.8h, v20.8h, v21.8h + clip2 v2.8h, v3.8h, v20.8h, v21.8h + st1 {v0.8h-v1.8h}, [x0], x2 + st1 {v2.8h-v3.8h}, [x12], x2 + bne 1b + ret +endfunc + +function ff_hevc_add_residual_32x32_\bitdepth\()_neon, export=1 mov x3, #32 movi v20.8h, #0 - mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF + mvni v21.8h, mask, lsl #8 1: subs x3, x3, #1 ld1 {v0.8h -v3.8h}, [x1], #64 ld1 {v16.8h-v19.8h}, [x0] @@ -220,12 +226,16 @@ function ff_hevc_add_residual_32x32_10_neon, export=1 sqadd v1.8h, v1.8h, v17.8h sqadd v2.8h, v2.8h, v18.8h sqadd v3.8h, v3.8h, v19.8h - clip10 v0.8h, v1.8h, v20.8h, v21.8h - clip10 v2.8h, v3.8h, v20.8h, v21.8h + clip2 v0.8h, v1.8h, v20.8h, v21.8h + clip2 v2.8h, v3.8h, v20.8h, v21.8h st1 {v0.8h-v3.8h}, [x0], x2 bne 1b ret endfunc +.endm + +add_res 10 +add_res 12 .macro sum_sub out, in, c, op, p .ifc \op, + diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c 
index 2002530266..f37e47121e 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -25,22 +25,18 @@ #include "libavutil/aarch64/cpu.h" #include "libavcodec/hevcdsp.h" -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void 
ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit); @@ -100,4 +96,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; } + if (bit_depth == 12) { + c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon; + c->add_residual[1] = ff_hevc_add_residual_8x8_12_neon; + c->add_residual[2] = ff_hevc_add_residual_16x16_12_neon; + c->add_residual[3] = ff_hevc_add_residual_32x32_12_neon; + } } -- 2.32.0 (Apple Git-132) _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants J. Dekker @ 2022-08-09 11:13 ` Martin Storsjö 2022-08-09 11:21 ` Martin Storsjö 0 siblings, 1 reply; 12+ messages in thread From: Martin Storsjö @ 2022-08-09 11:13 UTC (permalink / raw) To: FFmpeg development discussions and patches On Thu, 23 Jun 2022, J. Dekker wrote: > hevc_add_res_4x4_12_c: 46.0 > hevc_add_res_4x4_12_neon: 18.7 > hevc_add_res_8x8_12_c: 194.7 > hevc_add_res_8x8_12_neon: 25.2 > hevc_add_res_16x16_12_c: 716.0 > hevc_add_res_16x16_12_neon: 69.7 > hevc_add_res_32x32_12_c: 3820.7 > hevc_add_res_32x32_12_neon: 261.0 > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/aarch64/hevcdsp_idct_neon.S | 148 ++++++++++++---------- > libavcodec/aarch64/hevcdsp_init_aarch64.c | 34 ++--- > 2 files changed, 97 insertions(+), 85 deletions(-) LGTM. The patch is a bit hard to inspect thoroughly (to see exactly how little has changed) due to the functions being moved around at the same time as they're modified, but I checked and the changes do look fine. By splitting things up in individual macros for each function, (e.g. add_res_4x4, add_res_8x8 etc, then add_res setting the mask and calling the others) you could keep the code in place and make the diff even easier to read, but it's not strictly necessary. // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants 2022-08-09 11:13 ` Martin Storsjö @ 2022-08-09 11:21 ` Martin Storsjö 2022-08-16 5:01 ` [FFmpeg-devel] [PATCH v2] " J. Dekker 0 siblings, 1 reply; 12+ messages in thread From: Martin Storsjö @ 2022-08-09 11:21 UTC (permalink / raw) To: FFmpeg development discussions and patches On Tue, 9 Aug 2022, Martin Storsjö wrote: > On Thu, 23 Jun 2022, J. Dekker wrote: > >> hevc_add_res_4x4_12_c: 46.0 >> hevc_add_res_4x4_12_neon: 18.7 >> hevc_add_res_8x8_12_c: 194.7 >> hevc_add_res_8x8_12_neon: 25.2 >> hevc_add_res_16x16_12_c: 716.0 >> hevc_add_res_16x16_12_neon: 69.7 >> hevc_add_res_32x32_12_c: 3820.7 >> hevc_add_res_32x32_12_neon: 261.0 >> >> Signed-off-by: J. Dekker <jdek@itanimul.li> >> --- >> libavcodec/aarch64/hevcdsp_idct_neon.S | 148 ++++++++++++---------- >> libavcodec/aarch64/hevcdsp_init_aarch64.c | 34 ++--- >> 2 files changed, 97 insertions(+), 85 deletions(-) > > LGTM. The patch is a bit hard to inspect thoroughly (to see exactly how > little has changed) due to the functions being moved around at the same time > as they're modified, but I checked and the changes do look fine. > > By splitting things up in individual macros for each function, (e.g. > add_res_4x4, add_res_8x8 etc, then add_res setting the mask and calling the > others) you could keep the code in place and make the diff even easier to > read, but it's not strictly necessary. Actually, I do want you to make a change here. The only thing that differs between the 10 and 12 bit versions is what the mask register is initialized to. It's totally a waste of space to produce two near-identical versions of everything. Instead I'd suggest making just two frontend functions, which set the mask register and then call the (non-exported) 16 bit generic function. Also, have a look at e.g. 
vp9mc_16bpp_neon.S, where we have something similar: .macro do_8tap_v_func type, filter, offset, size, bpp function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1 uxtw x4, w4 mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 movrel x5, X(ff_vp9_subpel_filters), 256*\offset add x6, x5, w6, uxtw #4 mov x5, #\size .if \size >= 8 b \type\()_8tap_8v ... For your case, you don't need anything other than the mvni instruction and then a branch to the actual implementation. // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH v2] lavc/aarch64: hevc_add_res add 12bit variants 2022-08-09 11:21 ` Martin Storsjö @ 2022-08-16 5:01 ` J. Dekker 2022-08-16 11:38 ` Martin Storsjö 0 siblings, 1 reply; 12+ messages in thread From: J. Dekker @ 2022-08-16 5:01 UTC (permalink / raw) To: ffmpeg-devel hevc_add_res_4x4_12_c: 46.0 hevc_add_res_4x4_12_neon: 18.7 hevc_add_res_8x8_12_c: 194.7 hevc_add_res_8x8_12_neon: 25.2 hevc_add_res_16x16_12_c: 716.0 hevc_add_res_16x16_12_neon: 69.7 hevc_add_res_32x32_12_c: 3820.7 hevc_add_res_32x32_12_neon: 261.0 Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/aarch64/hevcdsp_idct_neon.S | 156 ++++++++++++---------- libavcodec/aarch64/hevcdsp_init_aarch64.c | 34 ++--- 2 files changed, 105 insertions(+), 85 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 484eea8437..5fb5990f3d 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -37,11 +37,11 @@ const trans, align=4 .short 31, 22, 13, 4 endconst -.macro clip10 in1, in2, c1, c2 - smax \in1, \in1, \c1 - smax \in2, \in2, \c1 - smin \in1, \in1, \c2 - smin \in2, \in2, \c2 +.macro clip2 in1, in2, min, max + smax \in1, \in1, \min + smax \in2, \in2, \min + smin \in1, \in1, \max + smin \in2, \in2, \max .endm function ff_hevc_add_residual_4x4_8_neon, export=1 @@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_4x4_10_neon, export=1 - mov x12, x0 - ld1 {v0.8h-v1.8h}, [x1] - ld1 {v2.d}[0], [x12], x2 - ld1 {v2.d}[1], [x12], x2 - ld1 {v3.d}[0], [x12], x2 - sqadd v0.8h, v0.8h, v2.8h - ld1 {v3.d}[1], [x12], x2 - movi v4.8h, #0 - sqadd v1.8h, v1.8h, v3.8h - mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF - clip10 v0.8h, v1.8h, v4.8h, v5.8h - st1 {v0.d}[0], [x0], x2 - st1 {v0.d}[1], [x0], x2 - st1 {v1.d}[0], [x0], x2 - st1 {v1.d}[1], [x0], x2 - ret -endfunc - function ff_hevc_add_residual_8x8_8_neon, export=1 add x12, x0, x2 add x2, x2, x2 @@ 
-103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_8x8_10_neon, export=1 - add x12, x0, x2 - add x2, x2, x2 - mov x3, #8 - movi v4.8h, #0 - mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF -1: subs x3, x3, #2 - ld1 {v0.8h-v1.8h}, [x1], #32 - ld1 {v2.8h}, [x0] - sqadd v0.8h, v0.8h, v2.8h - ld1 {v3.8h}, [x12] - sqadd v1.8h, v1.8h, v3.8h - clip10 v0.8h, v1.8h, v4.8h, v5.8h - st1 {v0.8h}, [x0], x2 - st1 {v1.8h}, [x12], x2 - bne 1b - ret -endfunc - function ff_hevc_add_residual_16x16_8_neon, export=1 mov x3, #16 add x12, x0, x2 @@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_16x16_10_neon, export=1 - mov x3, #16 - movi v20.8h, #0 - mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF - add x12, x0, x2 - add x2, x2, x2 -1: subs x3, x3, #2 - ld1 {v16.8h-v17.8h}, [x0] - ld1 {v0.8h-v3.8h}, [x1], #64 - sqadd v0.8h, v0.8h, v16.8h - ld1 {v18.8h-v19.8h}, [x12] - sqadd v1.8h, v1.8h, v17.8h - sqadd v2.8h, v2.8h, v18.8h - sqadd v3.8h, v3.8h, v19.8h - clip10 v0.8h, v1.8h, v20.8h, v21.8h - clip10 v2.8h, v3.8h, v20.8h, v21.8h - st1 {v0.8h-v1.8h}, [x0], x2 - st1 {v2.8h-v3.8h}, [x12], x2 - bne 1b - ret -endfunc - function ff_hevc_add_residual_32x32_8_neon, export=1 add x12, x0, x2 add x2, x2, x2 @@ -209,10 +149,88 @@ function ff_hevc_add_residual_32x32_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_32x32_10_neon, export=1 +.macro add_res bitdepth +function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1 + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 + b X(ff_hevc_add_residual_4x4_16_neon) +endfunc +function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1 + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 + b X(ff_hevc_add_residual_8x8_16_neon) +endfunc +function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1 + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 + b X(ff_hevc_add_residual_16x16_16_neon) +endfunc +function 
ff_hevc_add_residual_32x32_\bitdepth\()_neon, export=1 + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 + b X(ff_hevc_add_residual_32x32_16_neon) +endfunc +.endm + +add_res 10 +add_res 12 + +function ff_hevc_add_residual_4x4_16_neon, export=0 + mov x12, x0 + ld1 {v0.8h-v1.8h}, [x1] + ld1 {v2.d}[0], [x12], x2 + ld1 {v2.d}[1], [x12], x2 + ld1 {v3.d}[0], [x12], x2 + sqadd v0.8h, v0.8h, v2.8h + ld1 {v3.d}[1], [x12], x2 + movi v4.8h, #0 + sqadd v1.8h, v1.8h, v3.8h + clip2 v0.8h, v1.8h, v4.8h, v21.8h + st1 {v0.d}[0], [x0], x2 + st1 {v0.d}[1], [x0], x2 + st1 {v1.d}[0], [x0], x2 + st1 {v1.d}[1], [x0], x2 + ret +endfunc + +function ff_hevc_add_residual_8x8_16_neon, export=0 + add x12, x0, x2 + add x2, x2, x2 + mov x3, #8 + movi v4.8h, #0 +1: subs x3, x3, #2 + ld1 {v0.8h-v1.8h}, [x1], #32 + ld1 {v2.8h}, [x0] + sqadd v0.8h, v0.8h, v2.8h + ld1 {v3.8h}, [x12] + sqadd v1.8h, v1.8h, v3.8h + clip2 v0.8h, v1.8h, v4.8h, v21.8h + st1 {v0.8h}, [x0], x2 + st1 {v1.8h}, [x12], x2 + bne 1b + ret +endfunc + +function ff_hevc_add_residual_16x16_16_neon, export=0 + mov x3, #16 + movi v20.8h, #0 + add x12, x0, x2 + add x2, x2, x2 +1: subs x3, x3, #2 + ld1 {v16.8h-v17.8h}, [x0] + ld1 {v0.8h-v3.8h}, [x1], #64 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v18.8h-v19.8h}, [x12] + sqadd v1.8h, v1.8h, v17.8h + sqadd v2.8h, v2.8h, v18.8h + sqadd v3.8h, v3.8h, v19.8h + clip2 v0.8h, v1.8h, v20.8h, v21.8h + clip2 v2.8h, v3.8h, v20.8h, v21.8h + st1 {v0.8h-v1.8h}, [x0], x2 + st1 {v2.8h-v3.8h}, [x12], x2 + bne 1b + ret +endfunc + +function ff_hevc_add_residual_32x32_16_neon, export=0 mov x3, #32 movi v20.8h, #0 - mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF 1: subs x3, x3, #1 ld1 {v0.8h -v3.8h}, [x1], #64 ld1 {v16.8h-v19.8h}, [x0] @@ -220,8 +238,8 @@ function ff_hevc_add_residual_32x32_10_neon, export=1 sqadd v1.8h, v1.8h, v17.8h sqadd v2.8h, v2.8h, v18.8h sqadd v3.8h, v3.8h, v19.8h - clip10 v0.8h, v1.8h, v20.8h, v21.8h - clip10 v2.8h, v3.8h, v20.8h, v21.8h + clip2 v0.8h, v1.8h, v20.8h, v21.8h + clip2 v2.8h, 
v3.8h, v20.8h, v21.8h st1 {v0.8h-v3.8h}, [x0], x2 bne 1b ret diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 2002530266..f37e47121e 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -25,22 +25,18 @@ #include "libavutil/aarch64/cpu.h" #include "libavcodec/hevcdsp.h" -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t 
*coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit); @@ -100,4 +96,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; } + if (bit_depth == 12) { + c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon; + c->add_residual[1] = ff_hevc_add_residual_8x8_12_neon; + c->add_residual[2] = ff_hevc_add_residual_16x16_12_neon; + c->add_residual[3] = ff_hevc_add_residual_32x32_12_neon; + } } -- 2.32.0 (Apple Git-132) _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2] lavc/aarch64: hevc_add_res add 12bit variants 2022-08-16 5:01 ` [FFmpeg-devel] [PATCH v2] " J. Dekker @ 2022-08-16 11:38 ` Martin Storsjö 2022-08-16 12:12 ` [FFmpeg-devel] [PATCH v3] " J. Dekker 0 siblings, 1 reply; 12+ messages in thread From: Martin Storsjö @ 2022-08-16 11:38 UTC (permalink / raw) To: FFmpeg development discussions and patches On Tue, 16 Aug 2022, J. Dekker wrote: > hevc_add_res_4x4_12_c: 46.0 > hevc_add_res_4x4_12_neon: 18.7 > hevc_add_res_8x8_12_c: 194.7 > hevc_add_res_8x8_12_neon: 25.2 > hevc_add_res_16x16_12_c: 716.0 > hevc_add_res_16x16_12_neon: 69.7 > hevc_add_res_32x32_12_c: 3820.7 > hevc_add_res_32x32_12_neon: 261.0 > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/aarch64/hevcdsp_idct_neon.S | 156 ++++++++++++---------- > libavcodec/aarch64/hevcdsp_init_aarch64.c | 34 ++--- > 2 files changed, 105 insertions(+), 85 deletions(-) > > -function ff_hevc_add_residual_32x32_10_neon, export=1 > +.macro add_res bitdepth > +function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1 > + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 > + b X(ff_hevc_add_residual_4x4_16_neon) When the function isn't exported, you shouldn't use X() to reference its symbol. On Darwin, X() adds the underscore prefix, but that symbol name is only defined for exported functions. Also, you probably should remove the ff_ prefix for symbols that aren't exported, for clarity. This issue causes the patch in its current form to break compilation on macOS.
> -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, > - ptrdiff_t stride); > -void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, > - ptrdiff_t stride); > -void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, > - ptrdiff_t stride); > -void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, > - ptrdiff_t stride); > -void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, > - ptrdiff_t stride); > -void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, > - ptrdiff_t stride); > -void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, > - ptrdiff_t stride); > -void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, > - ptrdiff_t stride); > +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); > +void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); Note that these have been amended to include "const" on the coeffs parameter recently. 
// Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH v3] lavc/aarch64: hevc_add_res add 12bit variants 2022-08-16 11:38 ` Martin Storsjö @ 2022-08-16 12:12 ` J. Dekker 2022-08-16 12:46 ` Martin Storsjö 0 siblings, 1 reply; 12+ messages in thread From: J. Dekker @ 2022-08-16 12:12 UTC (permalink / raw) To: ffmpeg-devel hevc_add_res_4x4_12_c: 46.0 hevc_add_res_4x4_12_neon: 18.7 hevc_add_res_8x8_12_c: 194.7 hevc_add_res_8x8_12_neon: 25.2 hevc_add_res_16x16_12_c: 716.0 hevc_add_res_16x16_12_neon: 69.7 hevc_add_res_32x32_12_c: 3820.7 hevc_add_res_32x32_12_neon: 261.0 Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/aarch64/hevcdsp_idct_neon.S | 156 ++++++++++++---------- libavcodec/aarch64/hevcdsp_init_aarch64.c | 34 ++--- 2 files changed, 105 insertions(+), 85 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 484eea8437..97c51e06e3 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -37,11 +37,11 @@ const trans, align=4 .short 31, 22, 13, 4 endconst -.macro clip10 in1, in2, c1, c2 - smax \in1, \in1, \c1 - smax \in2, \in2, \c1 - smin \in1, \in1, \c2 - smin \in2, \in2, \c2 +.macro clip2 in1, in2, min, max + smax \in1, \in1, \min + smax \in2, \in2, \min + smin \in1, \in1, \max + smin \in2, \in2, \max .endm function ff_hevc_add_residual_4x4_8_neon, export=1 @@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_4x4_10_neon, export=1 - mov x12, x0 - ld1 {v0.8h-v1.8h}, [x1] - ld1 {v2.d}[0], [x12], x2 - ld1 {v2.d}[1], [x12], x2 - ld1 {v3.d}[0], [x12], x2 - sqadd v0.8h, v0.8h, v2.8h - ld1 {v3.d}[1], [x12], x2 - movi v4.8h, #0 - sqadd v1.8h, v1.8h, v3.8h - mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF - clip10 v0.8h, v1.8h, v4.8h, v5.8h - st1 {v0.d}[0], [x0], x2 - st1 {v0.d}[1], [x0], x2 - st1 {v1.d}[0], [x0], x2 - st1 {v1.d}[1], [x0], x2 - ret -endfunc - function ff_hevc_add_residual_8x8_8_neon, export=1 add x12, x0, x2 add x2, x2, x2 
@@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_8x8_10_neon, export=1 - add x12, x0, x2 - add x2, x2, x2 - mov x3, #8 - movi v4.8h, #0 - mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF -1: subs x3, x3, #2 - ld1 {v0.8h-v1.8h}, [x1], #32 - ld1 {v2.8h}, [x0] - sqadd v0.8h, v0.8h, v2.8h - ld1 {v3.8h}, [x12] - sqadd v1.8h, v1.8h, v3.8h - clip10 v0.8h, v1.8h, v4.8h, v5.8h - st1 {v0.8h}, [x0], x2 - st1 {v1.8h}, [x12], x2 - bne 1b - ret -endfunc - function ff_hevc_add_residual_16x16_8_neon, export=1 mov x3, #16 add x12, x0, x2 @@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_16x16_10_neon, export=1 - mov x3, #16 - movi v20.8h, #0 - mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF - add x12, x0, x2 - add x2, x2, x2 -1: subs x3, x3, #2 - ld1 {v16.8h-v17.8h}, [x0] - ld1 {v0.8h-v3.8h}, [x1], #64 - sqadd v0.8h, v0.8h, v16.8h - ld1 {v18.8h-v19.8h}, [x12] - sqadd v1.8h, v1.8h, v17.8h - sqadd v2.8h, v2.8h, v18.8h - sqadd v3.8h, v3.8h, v19.8h - clip10 v0.8h, v1.8h, v20.8h, v21.8h - clip10 v2.8h, v3.8h, v20.8h, v21.8h - st1 {v0.8h-v1.8h}, [x0], x2 - st1 {v2.8h-v3.8h}, [x12], x2 - bne 1b - ret -endfunc - function ff_hevc_add_residual_32x32_8_neon, export=1 add x12, x0, x2 add x2, x2, x2 @@ -209,10 +149,88 @@ function ff_hevc_add_residual_32x32_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_32x32_10_neon, export=1 +.macro add_res bitdepth +function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1 + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 + b hevc_add_residual_4x4_16_neon +endfunc +function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1 + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 + b hevc_add_residual_8x8_16_neon +endfunc +function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1 + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 + b hevc_add_residual_16x16_16_neon +endfunc +function 
ff_hevc_add_residual_32x32_\bitdepth\()_neon, export=1 + mvni v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 + b hevc_add_residual_32x32_16_neon +endfunc +.endm + +add_res 10 +add_res 12 + +function hevc_add_residual_4x4_16_neon, export=0 + mov x12, x0 + ld1 {v0.8h-v1.8h}, [x1] + ld1 {v2.d}[0], [x12], x2 + ld1 {v2.d}[1], [x12], x2 + ld1 {v3.d}[0], [x12], x2 + sqadd v0.8h, v0.8h, v2.8h + ld1 {v3.d}[1], [x12], x2 + movi v4.8h, #0 + sqadd v1.8h, v1.8h, v3.8h + clip2 v0.8h, v1.8h, v4.8h, v21.8h + st1 {v0.d}[0], [x0], x2 + st1 {v0.d}[1], [x0], x2 + st1 {v1.d}[0], [x0], x2 + st1 {v1.d}[1], [x0], x2 + ret +endfunc + +function hevc_add_residual_8x8_16_neon, export=0 + add x12, x0, x2 + add x2, x2, x2 + mov x3, #8 + movi v4.8h, #0 +1: subs x3, x3, #2 + ld1 {v0.8h-v1.8h}, [x1], #32 + ld1 {v2.8h}, [x0] + sqadd v0.8h, v0.8h, v2.8h + ld1 {v3.8h}, [x12] + sqadd v1.8h, v1.8h, v3.8h + clip2 v0.8h, v1.8h, v4.8h, v21.8h + st1 {v0.8h}, [x0], x2 + st1 {v1.8h}, [x12], x2 + bne 1b + ret +endfunc + +function hevc_add_residual_16x16_16_neon, export=0 + mov x3, #16 + movi v20.8h, #0 + add x12, x0, x2 + add x2, x2, x2 +1: subs x3, x3, #2 + ld1 {v16.8h-v17.8h}, [x0] + ld1 {v0.8h-v3.8h}, [x1], #64 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v18.8h-v19.8h}, [x12] + sqadd v1.8h, v1.8h, v17.8h + sqadd v2.8h, v2.8h, v18.8h + sqadd v3.8h, v3.8h, v19.8h + clip2 v0.8h, v1.8h, v20.8h, v21.8h + clip2 v2.8h, v3.8h, v20.8h, v21.8h + st1 {v0.8h-v1.8h}, [x0], x2 + st1 {v2.8h-v3.8h}, [x12], x2 + bne 1b + ret +endfunc + +function hevc_add_residual_32x32_16_neon, export=0 mov x3, #32 movi v20.8h, #0 - mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF 1: subs x3, x3, #1 ld1 {v0.8h -v3.8h}, [x1], #64 ld1 {v16.8h-v19.8h}, [x0] @@ -220,8 +238,8 @@ function ff_hevc_add_residual_32x32_10_neon, export=1 sqadd v1.8h, v1.8h, v17.8h sqadd v2.8h, v2.8h, v18.8h sqadd v3.8h, v3.8h, v19.8h - clip10 v0.8h, v1.8h, v20.8h, v21.8h - clip10 v2.8h, v3.8h, v20.8h, v21.8h + clip2 v0.8h, v1.8h, v20.8h, v21.8h + clip2 v2.8h, v3.8h, v20.8h, 
v21.8h st1 {v0.8h-v3.8h}, [x0], x2 bne 1b ret diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 9cbe983870..b6d5efb77f 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -25,22 +25,18 @@ #include "libavutil/aarch64/cpu.h" #include "libavcodec/hevcdsp.h" -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, const int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, const int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, const int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, const int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, const int16_t *coeffs, - ptrdiff_t stride); -void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs, - ptrdiff_t stride); +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, const int16_t *coeffs, 
ptrdiff_t stride); +void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit); void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit); @@ -100,4 +96,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; } + if (bit_depth == 12) { + c->add_residual[0] = ff_hevc_add_residual_4x4_12_neon; + c->add_residual[1] = ff_hevc_add_residual_8x8_12_neon; + c->add_residual[2] = ff_hevc_add_residual_16x16_12_neon; + c->add_residual[3] = ff_hevc_add_residual_32x32_12_neon; + } } -- 2.32.0 (Apple Git-132) _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3] lavc/aarch64: hevc_add_res add 12bit variants 2022-08-16 12:12 ` [FFmpeg-devel] [PATCH v3] " J. Dekker @ 2022-08-16 12:46 ` Martin Storsjö 2022-08-18 13:07 ` J. Dekker 0 siblings, 1 reply; 12+ messages in thread From: Martin Storsjö @ 2022-08-16 12:46 UTC (permalink / raw) To: FFmpeg development discussions and patches On Tue, 16 Aug 2022, J. Dekker wrote: > hevc_add_res_4x4_12_c: 46.0 > hevc_add_res_4x4_12_neon: 18.7 > hevc_add_res_8x8_12_c: 194.7 > hevc_add_res_8x8_12_neon: 25.2 > hevc_add_res_16x16_12_c: 716.0 > hevc_add_res_16x16_12_neon: 69.7 > hevc_add_res_32x32_12_c: 3820.7 > hevc_add_res_32x32_12_neon: 261.0 > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > > libavcodec/aarch64/hevcdsp_idct_neon.S | 156 ++++++++++++---------- > libavcodec/aarch64/hevcdsp_init_aarch64.c | 34 ++--- > 2 files changed, 105 insertions(+), 85 deletions(-) Thanks, this version seems fine to me. > diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c > index 9cbe983870..b6d5efb77f 100644 > --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c > +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c > @@ -25,22 +25,18 @@ > #include "libavutil/aarch64/cpu.h" > #include "libavcodec/hevcdsp.h" > > -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, > - ptrdiff_t stride); > +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); The joined forms of these lines end up a bit long, while they previously did fit below the 80 column soft-limit, so IMO I'd prefer to keep them wrapped - but it's not a big deal. (I guess it made more sense to join the lines before the 'const' was added.) // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". 
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3] lavc/aarch64: hevc_add_res add 12bit variants 2022-08-16 12:46 ` Martin Storsjö @ 2022-08-18 13:07 ` J. Dekker 0 siblings, 0 replies; 12+ messages in thread From: J. Dekker @ 2022-08-18 13:07 UTC (permalink / raw) To: FFmpeg development discussions and patches On 16 Aug 2022, at 14:46, Martin Storsjö wrote: > On Tue, 16 Aug 2022, J. Dekker wrote: > >> hevc_add_res_4x4_12_c: 46.0 >> hevc_add_res_4x4_12_neon: 18.7 >> hevc_add_res_8x8_12_c: 194.7 >> hevc_add_res_8x8_12_neon: 25.2 >> hevc_add_res_16x16_12_c: 716.0 >> hevc_add_res_16x16_12_neon: 69.7 >> hevc_add_res_32x32_12_c: 3820.7 >> hevc_add_res_32x32_12_neon: 261.0 >> >> Signed-off-by: J. Dekker <jdek@itanimul.li> >> --- >> >> libavcodec/aarch64/hevcdsp_idct_neon.S | 156 ++++++++++++---------- >> libavcodec/aarch64/hevcdsp_init_aarch64.c | 34 ++--- >> 2 files changed, 105 insertions(+), 85 deletions(-) > > Thanks, this version seems fine to me. > >> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c >> index 9cbe983870..b6d5efb77f 100644 >> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c >> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c >> @@ -25,22 +25,18 @@ >> #include "libavutil/aarch64/cpu.h" >> #include "libavcodec/hevcdsp.h" >> >> -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, >> - ptrdiff_t stride); >> +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride); > > The joined forms of these lines end up a bit long, while they previously did fit below the 80 column soft-limit, so IMO I'd prefer to keep them wrapped - but it's not a big deal. (I guess it made more sense to join the lines before the 'const' was added.) > > // Martin Pushed with these changes (entire set now). 
Thanks, -- jd _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test 2022-06-23 18:04 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test J. Dekker 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants J. Dekker @ 2022-08-09 11:02 ` Martin Storsjö 2 siblings, 0 replies; 12+ messages in thread From: Martin Storsjö @ 2022-08-09 11:02 UTC (permalink / raw) To: FFmpeg development discussions and patches On Thu, 23 Jun 2022, J. Dekker wrote: > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > tests/checkasm/hevc_add_res.c | 15 ++++++++------- > 1 file changed, 8 insertions(+), 7 deletions(-) > > diff --git a/tests/checkasm/hevc_add_res.c b/tests/checkasm/hevc_add_res.c > index 0c896adaca..f17d121939 100644 > --- a/tests/checkasm/hevc_add_res.c > +++ b/tests/checkasm/hevc_add_res.c > @@ -36,14 +36,14 @@ > } \ > } while (0) > > -#define randomize_buffers2(buf, size) \ > +#define randomize_buffers2(buf, size, mask) \ > do { \ > int j; \ > for (j = 0; j < size; j++) \ > - AV_WN16A(buf + j * 2, rnd() & 0x3FF); \ > + AV_WN16A(buf + j * 2, rnd() & mask); \ > } while (0) > > -static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) > +static void compare_add_res(int size, ptrdiff_t stride, int overflow_test, int mask) > { > LOCAL_ALIGNED_32(int16_t, res0, [32 * 32]); > LOCAL_ALIGNED_32(int16_t, res1, [32 * 32]); > @@ -53,7 +53,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) > declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *res, ptrdiff_t stride); > > randomize_buffers(res0, size); > - randomize_buffers2(dst0, size); > + randomize_buffers2(dst0, size, mask); > if (overflow_test) > res0[0] = 0x8000; > memcpy(res1, res0, sizeof(*res0) * size); > @@ -69,6 +69,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) > static void 
check_add_res(HEVCDSPContext h, int bit_depth) > { > int i; > + int mask = bit_depth == 8 ? 0xFFFF : bit_depth == 10 ? 0x03FF : 0x07FF; Previously we always used the mask 0x03FF, while we now use 0xFFFF for 8 bit. I presume that means that in 8 bit mode each 16-bit write covers two pixels, so masking with 0xFFFF keeps all bits of both, whereas the old 0x03FF mask accidentally cleared everything but the lowest two bits of every other pixel? The patch LGTM, but it'd be good to acknowledge this existing issue in the commit message. // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2022-08-18 13:07 UTC | newest] Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-06-23 18:04 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test J. Dekker 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker 2022-08-09 11:04 ` Martin Storsjö 2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants J. Dekker 2022-08-09 11:13 ` Martin Storsjö 2022-08-09 11:21 ` Martin Storsjö 2022-08-16 5:01 ` [FFmpeg-devel] [PATCH v2] " J. Dekker 2022-08-16 11:38 ` Martin Storsjö 2022-08-16 12:12 ` [FFmpeg-devel] [PATCH v3] " J. Dekker 2022-08-16 12:46 ` Martin Storsjö 2022-08-18 13:07 ` J. Dekker 2022-08-09 11:02 ` [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git