* [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test
@ 2022-06-23 18:04 J. Dekker
  2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: J. Dekker @ 2022-06-23 18:04 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 tests/checkasm/hevc_add_res.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/tests/checkasm/hevc_add_res.c b/tests/checkasm/hevc_add_res.c
index 0c896adaca..f17d121939 100644
--- a/tests/checkasm/hevc_add_res.c
+++ b/tests/checkasm/hevc_add_res.c
@@ -36,14 +36,14 @@
         }                                       \
     } while (0)
 
-#define randomize_buffers2(buf, size)             \
+#define randomize_buffers2(buf, size, mask)       \
     do {                                          \
         int j;                                    \
         for (j = 0; j < size; j++)                \
-            AV_WN16A(buf + j * 2, rnd() & 0x3FF); \
+            AV_WN16A(buf + j * 2, rnd() & mask); \
     } while (0)
 
-static void compare_add_res(int size, ptrdiff_t stride, int overflow_test)
+static void compare_add_res(int size, ptrdiff_t stride, int overflow_test, int mask)
 {
     LOCAL_ALIGNED_32(int16_t, res0, [32 * 32]);
     LOCAL_ALIGNED_32(int16_t, res1, [32 * 32]);
@@ -53,7 +53,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test)
     declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *res, ptrdiff_t stride);
 
     randomize_buffers(res0, size);
-    randomize_buffers2(dst0, size);
+    randomize_buffers2(dst0, size, mask);
     if (overflow_test)
         res0[0] = 0x8000;
     memcpy(res1, res0, sizeof(*res0) * size);
@@ -69,6 +69,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test)
 static void check_add_res(HEVCDSPContext h, int bit_depth)
 {
     int i;
+    int mask = bit_depth == 8 ? 0xFFFF : bit_depth == 10 ? 0x03FF : 0x07FF;
 
     for (i = 2; i <= 5; i++) {
         int block_size = 1 << i;
@@ -76,9 +77,9 @@ static void check_add_res(HEVCDSPContext h, int bit_depth)
         ptrdiff_t stride = block_size << (bit_depth > 8);
 
         if (check_func(h.add_residual[i - 2], "hevc_add_res_%dx%d_%d", block_size, block_size, bit_depth)) {
-            compare_add_res(size, stride, 0);
+            compare_add_res(size, stride, 0, mask);
             // overflow test for res = -32768
-            compare_add_res(size, stride, 1);
+            compare_add_res(size, stride, 1, mask);
         }
     }
 }
@@ -87,7 +88,7 @@ void checkasm_check_hevc_add_res(void)
 {
     int bit_depth;
 
-    for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+    for (bit_depth = 8; bit_depth <= 12; bit_depth++) {
         HEVCDSPContext h;
 
         ff_hevc_dsp_init(&h, bit_depth);
-- 
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs
  2022-06-23 18:04 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test J. Dekker
@ 2022-06-23 18:04 ` J. Dekker
  2022-08-09 11:04   ` Martin Storsjö
  2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants J. Dekker
  2022-08-09 11:02 ` [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test Martin Storsjö
  2 siblings, 1 reply; 12+ messages in thread
From: J. Dekker @ 2022-06-23 18:04 UTC (permalink / raw)
  To: ffmpeg-devel
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S | 216 ++++++++++++-------------
 1 file changed, 108 insertions(+), 108 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..484eea8437 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -27,21 +27,21 @@
 #include "libavutil/aarch64/asm.S"
 
 const trans, align=4
-        .short 64, 83, 64, 36
-        .short 89, 75, 50, 18
-        .short 90, 87, 80, 70
-        .short 57, 43, 25, 9
-        .short 90, 90, 88, 85
-        .short 82, 78, 73, 67
-        .short 61, 54, 46, 38
-        .short 31, 22, 13, 4
+        .short          64, 83, 64, 36
+        .short          89, 75, 50, 18
+        .short          90, 87, 80, 70
+        .short          57, 43, 25, 9
+        .short          90, 90, 88, 85
+        .short          82, 78, 73, 67
+        .short          61, 54, 46, 38
+        .short          31, 22, 13, 4
 endconst
 
 .macro clip10 in1, in2, c1, c2
-        smax        \in1, \in1, \c1
-        smax        \in2, \in2, \c1
-        smin        \in1, \in1, \c2
-        smin        \in2, \in2, \c2
+        smax            \in1, \in1, \c1
+        smax            \in2, \in2, \c1
+        smin            \in1, \in1, \c2
+        smin            \in2, \in2, \c2
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -50,13 +50,13 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
         ld1             {v2.s}[1], [x0], x2
         ld1             {v2.s}[2], [x0], x2
         ld1             {v2.s}[3], [x0], x2
-        sub              x0,  x0,  x2, lsl #2
-        uxtl             v6.8h,  v2.8b
-        uxtl2            v7.8h,  v2.16b
-        sqadd            v0.8h,  v0.8h, v6.8h
-        sqadd            v1.8h,  v1.8h, v7.8h
-        sqxtun           v0.8b,  v0.8h
-        sqxtun2          v0.16b, v1.8h
+        sub             x0, x0, x2, lsl #2
+        uxtl            v6.8h,  v2.8b
+        uxtl2           v7.8h,  v2.16b
+        sqadd           v0.8h,  v0.8h, v6.8h
+        sqadd           v1.8h,  v1.8h, v7.8h
+        sqxtun          v0.8b,  v0.8h
+        sqxtun2         v0.16b, v1.8h
         st1             {v0.s}[0], [x0], x2
         st1             {v0.s}[1], [x0], x2
         st1             {v0.s}[2], [x0], x2
@@ -70,63 +70,63 @@ function ff_hevc_add_residual_4x4_10_neon, export=1
         ld1             {v2.d}[0], [x12], x2
         ld1             {v2.d}[1], [x12], x2
         ld1             {v3.d}[0], [x12], x2
-        sqadd            v0.8h, v0.8h, v2.8h
+        sqadd           v0.8h, v0.8h, v2.8h
         ld1             {v3.d}[1], [x12], x2
-        movi             v4.8h, #0
-        sqadd            v1.8h, v1.8h, v3.8h
-        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
-        clip10           v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.d}[0],  [x0], x2
-        st1             {v0.d}[1],  [x0], x2
-        st1             {v1.d}[0],  [x0], x2
-        st1             {v1.d}[1],  [x0], x2
+        movi            v4.8h, #0
+        sqadd           v1.8h, v1.8h, v3.8h
+        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h
+        st1             {v0.d}[0], [x0],  x2
+        st1             {v0.d}[1], [x0],  x2
+        st1             {v1.d}[0], [x0],  x2
+        st1             {v1.d}[1], [x0],  x2
         ret
 endfunc
 
 function ff_hevc_add_residual_8x8_8_neon, export=1
-        add             x12,  x0, x2
-        add              x2,  x2, x2
-        mov              x3,  #8
-1:      subs             x3,  x3, #2
-        ld1             {v2.d}[0],     [x0]
-        ld1             {v2.d}[1],    [x12]
-        uxtl             v3.8h,  v2.8b
+        add             x12, x0, x2
+        add             x2, x2, x2
+        mov             x3, #8
+1:      subs            x3, x3, #2
+        ld1             {v2.d}[0], [x0]
+        ld1             {v2.d}[1], [x12]
+        uxtl            v3.8h,  v2.8b
         ld1             {v0.8h-v1.8h}, [x1], #32
-        uxtl2            v2.8h,  v2.16b
-        sqadd            v0.8h,  v0.8h,   v3.8h
-        sqadd            v1.8h,  v1.8h,   v2.8h
-        sqxtun           v0.8b,  v0.8h
-        sqxtun2          v0.16b, v1.8h
-        st1             {v0.d}[0],     [x0], x2
-        st1             {v0.d}[1],    [x12], x2
-        bne              1b
+        uxtl2           v2.8h,  v2.16b
+        sqadd           v0.8h,  v0.8h, v3.8h
+        sqadd           v1.8h,  v1.8h, v2.8h
+        sqxtun          v0.8b,  v0.8h
+        sqxtun2         v0.16b, v1.8h
+        st1             {v0.d}[0], [x0],  x2
+        st1             {v0.d}[1], [x12], x2
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_8x8_10_neon, export=1
-        add             x12,  x0, x2
-        add              x2,  x2, x2
-        mov              x3,  #8
-        movi             v4.8h, #0
-        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs             x3,  x3, #2
+        add             x12, x0, x2
+        add             x2,  x2, x2
+        mov             x3,  #8
+        movi            v4.8h, #0
+        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
+1:      subs            x3,  x3, #2
         ld1             {v0.8h-v1.8h}, [x1], #32
-        ld1             {v2.8h},       [x0]
-        sqadd            v0.8h, v0.8h, v2.8h
-        ld1             {v3.8h},      [x12]
-        sqadd            v1.8h, v1.8h, v3.8h
-        clip10           v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.8h},       [x0], x2
-        st1             {v1.8h},      [x12], x2
-        bne              1b
+        ld1             {v2.8h}, [x0]
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.8h}, [x12]
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h
+        st1             {v0.8h}, [x0],  x2
+        st1             {v1.8h}, [x12], x2
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_16x16_8_neon, export=1
-        mov              x3,  #16
+        mov             x3,  #16
         add             x12, x0, x2
-        add              x2,  x2, x2
-1:      subs             x3,  x3, #2
+        add             x2,  x2, x2
+1:      subs            x3,  x3, #2
         ld1             {v16.16b},     [x0]
         ld1             {v0.8h-v3.8h}, [x1], #64
         ld1             {v19.16b},    [x12]
@@ -134,47 +134,47 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
         uxtl2           v18.8h, v16.16b
         uxtl            v20.8h, v19.8b
         uxtl2           v21.8h, v19.16b
-        sqadd            v0.8h,  v0.8h, v17.8h
-        sqadd            v1.8h,  v1.8h, v18.8h
-        sqadd            v2.8h,  v2.8h, v20.8h
-        sqadd            v3.8h,  v3.8h, v21.8h
-        sqxtun           v0.8b,  v0.8h
+        sqadd           v0.8h,  v0.8h, v17.8h
+        sqadd           v1.8h,  v1.8h, v18.8h
+        sqadd           v2.8h,  v2.8h, v20.8h
+        sqadd           v3.8h,  v3.8h, v21.8h
+        sqxtun          v0.8b,  v0.8h
         sqxtun2         v0.16b,  v1.8h
-        sqxtun           v1.8b,  v2.8h
+        sqxtun          v1.8b,  v2.8h
         sqxtun2         v1.16b,  v3.8h
         st1             {v0.16b},     [x0], x2
         st1             {v1.16b},    [x12], x2
-        bne              1b
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_16x16_10_neon, export=1
-        mov              x3,  #16
+        mov             x3,  #16
         movi            v20.8h, #0
         mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
         add             x12,  x0, x2
-        add              x2,  x2, x2
-1:      subs             x3,  x3, #2
+        add             x2,  x2, x2
+1:      subs            x3,  x3, #2
         ld1             {v16.8h-v17.8h}, [x0]
-        ld1             {v0.8h-v3.8h},  [x1], #64
-        sqadd            v0.8h, v0.8h, v16.8h
+        ld1             {v0.8h-v3.8h},   [x1], #64
+        sqadd           v0.8h, v0.8h, v16.8h
         ld1             {v18.8h-v19.8h}, [x12]
-        sqadd            v1.8h, v1.8h, v17.8h
-        sqadd            v2.8h, v2.8h, v18.8h
-        sqadd            v3.8h, v3.8h, v19.8h
-        clip10           v0.8h, v1.8h, v20.8h, v21.8h
-        clip10           v2.8h, v3.8h, v20.8h, v21.8h
-        st1             {v0.8h-v1.8h},   [x0], x2
-        st1             {v2.8h-v3.8h},  [x12], x2
-        bne              1b
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip10          v0.8h, v1.8h, v20.8h, v21.8h
+        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v1.8h}, [x0],  x2
+        st1             {v2.8h-v3.8h}, [x12], x2
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_32x32_8_neon, export=1
         add             x12,  x0, x2
-        add              x2,  x2, x2
-        mov              x3,  #32
-1:      subs             x3,  x3, #2
+        add             x2,  x2, x2
+        mov             x3,  #32
+1:      subs            x3,  x3, #2
         ld1             {v20.16b, v21.16b}, [x0]
         uxtl            v16.8h,  v20.8b
         uxtl2           v17.8h,  v20.16b
@@ -187,43 +187,43 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
         uxtl2           v21.8h,  v22.16b
         uxtl            v22.8h,  v23.8b
         uxtl2           v23.8h,  v23.16b
-        sqadd            v0.8h,  v0.8h,  v16.8h
-        sqadd            v1.8h,  v1.8h,  v17.8h
-        sqadd            v2.8h,  v2.8h,  v18.8h
-        sqadd            v3.8h,  v3.8h,  v19.8h
-        sqadd            v4.8h,  v4.8h,  v20.8h
-        sqadd            v5.8h,  v5.8h,  v21.8h
-        sqadd            v6.8h,  v6.8h,  v22.8h
-        sqadd            v7.8h,  v7.8h,  v23.8h
-        sqxtun           v0.8b,  v0.8h
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sqadd           v1.8h,   v1.8h,   v17.8h
+        sqadd           v2.8h,   v2.8h,   v18.8h
+        sqadd           v3.8h,   v3.8h,   v19.8h
+        sqadd           v4.8h,   v4.8h,   v20.8h
+        sqadd           v5.8h,   v5.8h,   v21.8h
+        sqadd           v6.8h,   v6.8h,   v22.8h
+        sqadd           v7.8h,   v7.8h,   v23.8h
+        sqxtun          v0.8b,   v0.8h
         sqxtun2         v0.16b,  v1.8h
-        sqxtun           v1.8b,  v2.8h
+        sqxtun          v1.8b,   v2.8h
         sqxtun2         v1.16b,  v3.8h
-        sqxtun           v2.8b,  v4.8h
+        sqxtun          v2.8b,   v4.8h
         sqxtun2         v2.16b,  v5.8h
-        st1             {v0.16b, v1.16b},  [x0], x2
-        sqxtun           v3.8b,  v6.8h
+        st1             {v0.16b, v1.16b}, [x0],  x2
+        sqxtun          v3.8b,   v6.8h
         sqxtun2         v3.16b,  v7.8h
         st1             {v2.16b, v3.16b}, [x12], x2
-        bne              1b
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_32x32_10_neon, export=1
-        mov              x3,  #32
+        mov             x3, #32
         movi            v20.8h, #0
         mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs             x3,  x3, #1
-        ld1             {v0.8h-v3.8h},   [x1], #64
+1:      subs            x3, x3, #1
+        ld1             {v0.8h -v3.8h},  [x1], #64
         ld1             {v16.8h-v19.8h}, [x0]
-        sqadd            v0.8h, v0.8h, v16.8h
-        sqadd            v1.8h, v1.8h, v17.8h
-        sqadd            v2.8h, v2.8h, v18.8h
-        sqadd            v3.8h, v3.8h, v19.8h
-        clip10           v0.8h, v1.8h, v20.8h, v21.8h
-        clip10           v2.8h, v3.8h, v20.8h, v21.8h
-        st1             {v0.8h-v3.8h},   [x0], x2
-        bne              1b
+        sqadd           v0.8h, v0.8h, v16.8h
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip10          v0.8h, v1.8h, v20.8h, v21.8h
+        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v3.8h}, [x0], x2
+        bne             1b
         ret
 endfunc
 
-- 
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants
  2022-06-23 18:04 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test J. Dekker
  2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker
@ 2022-06-23 18:04 ` J. Dekker
  2022-08-09 11:13   ` Martin Storsjö
  2022-08-09 11:02 ` [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test Martin Storsjö
  2 siblings, 1 reply; 12+ messages in thread
From: J. Dekker @ 2022-06-23 18:04 UTC (permalink / raw)
  To: ffmpeg-devel
hevc_add_res_4x4_12_c: 46.0
hevc_add_res_4x4_12_neon: 18.7
hevc_add_res_8x8_12_c: 194.7
hevc_add_res_8x8_12_neon: 25.2
hevc_add_res_16x16_12_c: 716.0
hevc_add_res_16x16_12_neon: 69.7
hevc_add_res_32x32_12_c: 3820.7
hevc_add_res_32x32_12_neon: 261.0
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 148 ++++++++++++----------
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
 2 files changed, 97 insertions(+), 85 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 484eea8437..413e225218 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -37,11 +37,11 @@ const trans, align=4
         .short          31, 22, 13, 4
 endconst
 
-.macro clip10 in1, in2, c1, c2
-        smax            \in1, \in1, \c1
-        smax            \in2, \in2, \c1
-        smin            \in1, \in1, \c2
-        smin            \in2, \in2, \c2
+.macro clip2 in1, in2, min, max
+        smax            \in1, \in1, \min
+        smax            \in2, \in2, \min
+        smin            \in1, \in1, \max
+        smin            \in2, \in2, \max
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_4x4_10_neon, export=1
-        mov             x12,  x0
-        ld1             {v0.8h-v1.8h}, [x1]
-        ld1             {v2.d}[0], [x12], x2
-        ld1             {v2.d}[1], [x12], x2
-        ld1             {v3.d}[0], [x12], x2
-        sqadd           v0.8h, v0.8h, v2.8h
-        ld1             {v3.d}[1], [x12], x2
-        movi            v4.8h, #0
-        sqadd           v1.8h, v1.8h, v3.8h
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.d}[0], [x0],  x2
-        st1             {v0.d}[1], [x0],  x2
-        st1             {v1.d}[0], [x0],  x2
-        st1             {v1.d}[1], [x0],  x2
-        ret
-endfunc
-
 function ff_hevc_add_residual_8x8_8_neon, export=1
         add             x12, x0, x2
         add             x2, x2, x2
@@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_8x8_10_neon, export=1
-        add             x12, x0, x2
-        add             x2,  x2, x2
-        mov             x3,  #8
-        movi            v4.8h, #0
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs            x3,  x3, #2
-        ld1             {v0.8h-v1.8h}, [x1], #32
-        ld1             {v2.8h}, [x0]
-        sqadd           v0.8h, v0.8h, v2.8h
-        ld1             {v3.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v3.8h
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.8h}, [x0],  x2
-        st1             {v1.8h}, [x12], x2
-        bne             1b
-        ret
-endfunc
-
 function ff_hevc_add_residual_16x16_8_neon, export=1
         mov             x3,  #16
         add             x12, x0, x2
@@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_16x16_10_neon, export=1
-        mov             x3,  #16
-        movi            v20.8h, #0
-        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
-        add             x12,  x0, x2
-        add             x2,  x2, x2
-1:      subs            x3,  x3, #2
-        ld1             {v16.8h-v17.8h}, [x0]
-        ld1             {v0.8h-v3.8h},   [x1], #64
-        sqadd           v0.8h, v0.8h, v16.8h
-        ld1             {v18.8h-v19.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v17.8h
-        sqadd           v2.8h, v2.8h, v18.8h
-        sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
-        st1             {v0.8h-v1.8h}, [x0],  x2
-        st1             {v2.8h-v3.8h}, [x12], x2
-        bne             1b
-        ret
-endfunc
-
 function ff_hevc_add_residual_32x32_8_neon, export=1
         add             x12,  x0, x2
         add             x2,  x2, x2
@@ -209,10 +149,76 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_32x32_10_neon, export=1
+.macro add_res bitdepth
+.if \bitdepth == 10
+.set mask, 0xFC
+.else
+.set mask, 0xF0
+.endif
+function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
+        mov             x12,  x0
+        ld1             {v0.8h-v1.8h}, [x1]
+        ld1             {v2.d}[0], [x12], x2
+        ld1             {v2.d}[1], [x12], x2
+        ld1             {v3.d}[0], [x12], x2
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.d}[1], [x12], x2
+        movi            v4.8h, #0
+        sqadd           v1.8h, v1.8h, v3.8h
+        mvni            v5.8h, mask, lsl #8
+        clip2           v0.8h, v1.8h, v4.8h, v5.8h
+        st1             {v0.d}[0], [x0],  x2
+        st1             {v0.d}[1], [x0],  x2
+        st1             {v1.d}[0], [x0],  x2
+        st1             {v1.d}[1], [x0],  x2
+        ret
+endfunc
+
+function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1
+        add             x12, x0, x2
+        add             x2,  x2, x2
+        mov             x3,  #8
+        movi            v4.8h, #0
+        mvni            v5.8h, mask, lsl #8
+1:      subs            x3,  x3, #2
+        ld1             {v0.8h-v1.8h}, [x1], #32
+        ld1             {v2.8h}, [x0]
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.8h}, [x12]
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip2           v0.8h, v1.8h, v4.8h, v5.8h
+        st1             {v0.8h}, [x0],  x2
+        st1             {v1.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1
+        mov             x3,  #16
+        movi            v20.8h, #0
+        mvni            v21.8h, mask, lsl #8
+        add             x12,  x0, x2
+        add             x2,  x2, x2
+1:      subs            x3,  x3, #2
+        ld1             {v16.8h-v17.8h}, [x0]
+        ld1             {v0.8h-v3.8h},   [x1], #64
+        sqadd           v0.8h, v0.8h, v16.8h
+        ld1             {v18.8h-v19.8h}, [x12]
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip2           v0.8h, v1.8h, v20.8h, v21.8h
+        clip2           v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v1.8h}, [x0],  x2
+        st1             {v2.8h-v3.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function ff_hevc_add_residual_32x32_\bitdepth\()_neon, export=1
         mov             x3, #32
         movi            v20.8h, #0
-        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
+        mvni            v21.8h, mask, lsl #8
 1:      subs            x3, x3, #1
         ld1             {v0.8h -v3.8h},  [x1], #64
         ld1             {v16.8h-v19.8h}, [x0]
@@ -220,12 +226,16 @@ function ff_hevc_add_residual_32x32_10_neon, export=1
         sqadd           v1.8h, v1.8h, v17.8h
         sqadd           v2.8h, v2.8h, v18.8h
         sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        clip2           v0.8h, v1.8h, v20.8h, v21.8h
+        clip2           v2.8h, v3.8h, v20.8h, v21.8h
         st1             {v0.8h-v3.8h}, [x0], x2
         bne             1b
         ret
 endfunc
+.endm
+
+add_res 10
+add_res 12
 
 .macro sum_sub out, in, c, op, p
   .ifc \op, +
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 2002530266..f37e47121e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,22 +25,18 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/hevcdsp.h"
 
-void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -100,4 +96,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_10_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
+    if (bit_depth == 12) {
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_12_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_12_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_12_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_12_neon;
+    }
 }
-- 
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test
  2022-06-23 18:04 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test J. Dekker
  2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker
  2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants J. Dekker
@ 2022-08-09 11:02 ` Martin Storsjö
  2 siblings, 0 replies; 12+ messages in thread
From: Martin Storsjö @ 2022-08-09 11:02 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
On Thu, 23 Jun 2022, J. Dekker wrote:
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> tests/checkasm/hevc_add_res.c | 15 ++++++++-------
> 1 file changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/tests/checkasm/hevc_add_res.c b/tests/checkasm/hevc_add_res.c
> index 0c896adaca..f17d121939 100644
> --- a/tests/checkasm/hevc_add_res.c
> +++ b/tests/checkasm/hevc_add_res.c
> @@ -36,14 +36,14 @@
>         }                                       \
>     } while (0)
>
> -#define randomize_buffers2(buf, size)             \
> +#define randomize_buffers2(buf, size, mask)       \
>     do {                                          \
>         int j;                                    \
>         for (j = 0; j < size; j++)                \
> -            AV_WN16A(buf + j * 2, rnd() & 0x3FF); \
> +            AV_WN16A(buf + j * 2, rnd() & mask); \
>     } while (0)
>
> -static void compare_add_res(int size, ptrdiff_t stride, int overflow_test)
> +static void compare_add_res(int size, ptrdiff_t stride, int overflow_test, int mask)
> {
>     LOCAL_ALIGNED_32(int16_t, res0, [32 * 32]);
>     LOCAL_ALIGNED_32(int16_t, res1, [32 * 32]);
> @@ -53,7 +53,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test)
>     declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *res, ptrdiff_t stride);
>
>     randomize_buffers(res0, size);
> -    randomize_buffers2(dst0, size);
> +    randomize_buffers2(dst0, size, mask);
>     if (overflow_test)
>         res0[0] = 0x8000;
>     memcpy(res1, res0, sizeof(*res0) * size);
> @@ -69,6 +69,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test)
> static void check_add_res(HEVCDSPContext h, int bit_depth)
> {
>     int i;
> +    int mask = bit_depth == 8 ? 0xFFFF : bit_depth == 10 ? 0x03FF : 0x07FF;
Previously we always used the mask 0x03FF, whereas we now use 0xFFFF for 
8-bit. I presume that means that for 8-bit we mask two pixels at once with 
a single 0xFFFF (keeping all bits), and that previously we accidentally 
masked out everything but the lowest two bits of every other pixel in 8-bit mode?
The patch LGTM, but it'd be good to acknowledge this existing issue in the 
commit message.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs
  2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker
@ 2022-08-09 11:04   ` Martin Storsjö
  0 siblings, 0 replies; 12+ messages in thread
From: Martin Storsjö @ 2022-08-09 11:04 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
On Thu, 23 Jun 2022, J. Dekker wrote:
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 216 ++++++++++++-------------
> 1 file changed, 108 insertions(+), 108 deletions(-)
LGTM, thanks!
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants
  2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants J. Dekker
@ 2022-08-09 11:13   ` Martin Storsjö
  2022-08-09 11:21     ` Martin Storsjö
  0 siblings, 1 reply; 12+ messages in thread
From: Martin Storsjö @ 2022-08-09 11:13 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
On Thu, 23 Jun 2022, J. Dekker wrote:
> hevc_add_res_4x4_12_c: 46.0
> hevc_add_res_4x4_12_neon: 18.7
> hevc_add_res_8x8_12_c: 194.7
> hevc_add_res_8x8_12_neon: 25.2
> hevc_add_res_16x16_12_c: 716.0
> hevc_add_res_16x16_12_neon: 69.7
> hevc_add_res_32x32_12_c: 3820.7
> hevc_add_res_32x32_12_neon: 261.0
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 148 ++++++++++++----------
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
> 2 files changed, 97 insertions(+), 85 deletions(-)
LGTM. The patch is a bit hard to inspect thoroughly (to see exactly how 
little has changed) due to the functions being moved around at the same 
time as they're modified, but I checked and the changes do look fine.
By splitting things up into individual macros for each function (e.g. 
add_res_4x4, add_res_8x8 etc., then add_res setting the mask and calling 
the others), you could keep the code in place and make the diff even 
easier to read, but it's not strictly necessary.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants
  2022-08-09 11:13   ` Martin Storsjö
@ 2022-08-09 11:21     ` Martin Storsjö
  2022-08-16  5:01       ` [FFmpeg-devel] [PATCH v2] " J. Dekker
  0 siblings, 1 reply; 12+ messages in thread
From: Martin Storsjö @ 2022-08-09 11:21 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
On Tue, 9 Aug 2022, Martin Storsjö wrote:
> On Thu, 23 Jun 2022, J. Dekker wrote:
>
>> hevc_add_res_4x4_12_c: 46.0
>> hevc_add_res_4x4_12_neon: 18.7
>> hevc_add_res_8x8_12_c: 194.7
>> hevc_add_res_8x8_12_neon: 25.2
>> hevc_add_res_16x16_12_c: 716.0
>> hevc_add_res_16x16_12_neon: 69.7
>> hevc_add_res_32x32_12_c: 3820.7
>> hevc_add_res_32x32_12_neon: 261.0
>> 
>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>> ---
>> libavcodec/aarch64/hevcdsp_idct_neon.S    | 148 ++++++++++++----------
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
>> 2 files changed, 97 insertions(+), 85 deletions(-)
>
> LGTM. The patch is a bit hard to inspect thoroughly (to see exactly how 
> little has changed) due to the functions being moved around at the same time 
> as they're modified, but I checked and the changes do look fine.
>
> By splitting things up in individual macros for each function, (e.g. 
> add_res_4x4, add_res_8x8 etc, then add_res setting the mask and calling the 
> others) you could keep the code in place and make the diff even easier to 
> read, but it's not strictly necessary.
Actually, I do want you to make a change here.
The only thing that differs between the 10 and 12 bit versions is what 
the mask register is initialized to. It's a total waste of space to 
produce two near-identical versions of everything.
Instead I'd suggest making just two frontend functions, which set the 
mask register and then call the (non-exported) 16 bit generic function. 
Also, have a look at e.g. vp9mc_16bpp_neon.S, where we have something 
similar:
.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
         uxtw            x4,  w4
         mvni            v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
         movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
         add             x6,  x5,  w6, uxtw #4
         mov             x5,  #\size
.if \size >= 8
         b               \type\()_8tap_8v
...
For your case, you don't need anything other than the mvni instruction 
followed by a branch to the actual implementation.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH v2] lavc/aarch64: hevc_add_res add 12bit variants
  2022-08-09 11:21     ` Martin Storsjö
@ 2022-08-16  5:01       ` J. Dekker
  2022-08-16 11:38         ` Martin Storsjö
  0 siblings, 1 reply; 12+ messages in thread
From: J. Dekker @ 2022-08-16  5:01 UTC (permalink / raw)
  To: ffmpeg-devel
hevc_add_res_4x4_12_c: 46.0
hevc_add_res_4x4_12_neon: 18.7
hevc_add_res_8x8_12_c: 194.7
hevc_add_res_8x8_12_neon: 25.2
hevc_add_res_16x16_12_c: 716.0
hevc_add_res_16x16_12_neon: 69.7
hevc_add_res_32x32_12_c: 3820.7
hevc_add_res_32x32_12_neon: 261.0
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 156 ++++++++++++----------
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
 2 files changed, 105 insertions(+), 85 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 484eea8437..5fb5990f3d 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -37,11 +37,11 @@ const trans, align=4
         .short          31, 22, 13, 4
 endconst
 
-.macro clip10 in1, in2, c1, c2
-        smax            \in1, \in1, \c1
-        smax            \in2, \in2, \c1
-        smin            \in1, \in1, \c2
-        smin            \in2, \in2, \c2
+.macro clip2 in1, in2, min, max
+        smax            \in1, \in1, \min
+        smax            \in2, \in2, \min
+        smin            \in1, \in1, \max
+        smin            \in2, \in2, \max
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_4x4_10_neon, export=1
-        mov             x12,  x0
-        ld1             {v0.8h-v1.8h}, [x1]
-        ld1             {v2.d}[0], [x12], x2
-        ld1             {v2.d}[1], [x12], x2
-        ld1             {v3.d}[0], [x12], x2
-        sqadd           v0.8h, v0.8h, v2.8h
-        ld1             {v3.d}[1], [x12], x2
-        movi            v4.8h, #0
-        sqadd           v1.8h, v1.8h, v3.8h
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.d}[0], [x0],  x2
-        st1             {v0.d}[1], [x0],  x2
-        st1             {v1.d}[0], [x0],  x2
-        st1             {v1.d}[1], [x0],  x2
-        ret
-endfunc
-
 function ff_hevc_add_residual_8x8_8_neon, export=1
         add             x12, x0, x2
         add             x2, x2, x2
@@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_8x8_10_neon, export=1
-        add             x12, x0, x2
-        add             x2,  x2, x2
-        mov             x3,  #8
-        movi            v4.8h, #0
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs            x3,  x3, #2
-        ld1             {v0.8h-v1.8h}, [x1], #32
-        ld1             {v2.8h}, [x0]
-        sqadd           v0.8h, v0.8h, v2.8h
-        ld1             {v3.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v3.8h
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.8h}, [x0],  x2
-        st1             {v1.8h}, [x12], x2
-        bne             1b
-        ret
-endfunc
-
 function ff_hevc_add_residual_16x16_8_neon, export=1
         mov             x3,  #16
         add             x12, x0, x2
@@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_16x16_10_neon, export=1
-        mov             x3,  #16
-        movi            v20.8h, #0
-        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
-        add             x12,  x0, x2
-        add             x2,  x2, x2
-1:      subs            x3,  x3, #2
-        ld1             {v16.8h-v17.8h}, [x0]
-        ld1             {v0.8h-v3.8h},   [x1], #64
-        sqadd           v0.8h, v0.8h, v16.8h
-        ld1             {v18.8h-v19.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v17.8h
-        sqadd           v2.8h, v2.8h, v18.8h
-        sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
-        st1             {v0.8h-v1.8h}, [x0],  x2
-        st1             {v2.8h-v3.8h}, [x12], x2
-        bne             1b
-        ret
-endfunc
-
 function ff_hevc_add_residual_32x32_8_neon, export=1
         add             x12,  x0, x2
         add             x2,  x2, x2
@@ -209,10 +149,88 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_32x32_10_neon, export=1
+.macro add_res bitdepth
+function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+        b               X(ff_hevc_add_residual_4x4_16_neon)
+endfunc
+function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+        b               X(ff_hevc_add_residual_8x8_16_neon)
+endfunc
+function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+        b               X(ff_hevc_add_residual_16x16_16_neon)
+endfunc
+function ff_hevc_add_residual_32x32_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+        b               X(ff_hevc_add_residual_32x32_16_neon)
+endfunc
+.endm
+
+add_res 10
+add_res 12
+
+function ff_hevc_add_residual_4x4_16_neon, export=0
+        mov             x12,  x0
+        ld1             {v0.8h-v1.8h}, [x1]
+        ld1             {v2.d}[0], [x12], x2
+        ld1             {v2.d}[1], [x12], x2
+        ld1             {v3.d}[0], [x12], x2
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.d}[1], [x12], x2
+        movi            v4.8h, #0
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip2           v0.8h, v1.8h, v4.8h, v21.8h
+        st1             {v0.d}[0], [x0],  x2
+        st1             {v0.d}[1], [x0],  x2
+        st1             {v1.d}[0], [x0],  x2
+        st1             {v1.d}[1], [x0],  x2
+        ret
+endfunc
+
+function ff_hevc_add_residual_8x8_16_neon, export=0
+        add             x12, x0, x2
+        add             x2,  x2, x2
+        mov             x3,  #8
+        movi            v4.8h, #0
+1:      subs            x3,  x3, #2
+        ld1             {v0.8h-v1.8h}, [x1], #32
+        ld1             {v2.8h}, [x0]
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.8h}, [x12]
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip2           v0.8h, v1.8h, v4.8h, v21.8h
+        st1             {v0.8h}, [x0],  x2
+        st1             {v1.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function ff_hevc_add_residual_16x16_16_neon, export=0
+        mov             x3,  #16
+        movi            v20.8h, #0
+        add             x12,  x0, x2
+        add             x2,  x2, x2
+1:      subs            x3,  x3, #2
+        ld1             {v16.8h-v17.8h}, [x0]
+        ld1             {v0.8h-v3.8h},   [x1], #64
+        sqadd           v0.8h, v0.8h, v16.8h
+        ld1             {v18.8h-v19.8h}, [x12]
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip2           v0.8h, v1.8h, v20.8h, v21.8h
+        clip2           v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v1.8h}, [x0],  x2
+        st1             {v2.8h-v3.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function ff_hevc_add_residual_32x32_16_neon, export=0
         mov             x3, #32
         movi            v20.8h, #0
-        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
 1:      subs            x3, x3, #1
         ld1             {v0.8h -v3.8h},  [x1], #64
         ld1             {v16.8h-v19.8h}, [x0]
@@ -220,8 +238,8 @@ function ff_hevc_add_residual_32x32_10_neon, export=1
         sqadd           v1.8h, v1.8h, v17.8h
         sqadd           v2.8h, v2.8h, v18.8h
         sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        clip2           v0.8h, v1.8h, v20.8h, v21.8h
+        clip2           v2.8h, v3.8h, v20.8h, v21.8h
         st1             {v0.8h-v3.8h}, [x0], x2
         bne             1b
         ret
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 2002530266..f37e47121e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,22 +25,18 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/hevcdsp.h"
 
-void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -100,4 +96,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_10_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
+    if (bit_depth == 12) {
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_12_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_12_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_12_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_12_neon;
+    }
 }
-- 
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v2] lavc/aarch64: hevc_add_res add 12bit variants
  2022-08-16  5:01       ` [FFmpeg-devel] [PATCH v2] " J. Dekker
@ 2022-08-16 11:38         ` Martin Storsjö
  2022-08-16 12:12           ` [FFmpeg-devel] [PATCH v3] " J. Dekker
  0 siblings, 1 reply; 12+ messages in thread
From: Martin Storsjö @ 2022-08-16 11:38 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
On Tue, 16 Aug 2022, J. Dekker wrote:
> hevc_add_res_4x4_12_c: 46.0
> hevc_add_res_4x4_12_neon: 18.7
> hevc_add_res_8x8_12_c: 194.7
> hevc_add_res_8x8_12_neon: 25.2
> hevc_add_res_16x16_12_c: 716.0
> hevc_add_res_16x16_12_neon: 69.7
> hevc_add_res_32x32_12_c: 3820.7
> hevc_add_res_32x32_12_neon: 261.0
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 156 ++++++++++++----------
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
> 2 files changed, 105 insertions(+), 85 deletions(-)
>
> -function ff_hevc_add_residual_32x32_10_neon, export=1
> +.macro add_res bitdepth
> +function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
> +        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
> +        b               X(ff_hevc_add_residual_4x4_16_neon)
When the function isn't exported, you shouldn't use X() to access its 
symbol. On Darwin, X() adds the underscore prefix, but that symbol name 
is only defined for exported functions. Also, you should probably remove 
the ff_ prefix from symbols that aren't exported, for clarity.
This issue causes the patch in its current form to break compilation on 
macOS.
> -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
> -                                     ptrdiff_t stride);
> -void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
> -                                      ptrdiff_t stride);
> -void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
> -                                     ptrdiff_t stride);
> -void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
> -                                      ptrdiff_t stride);
> -void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
> -                                       ptrdiff_t stride);
> -void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
> -                                        ptrdiff_t stride);
> -void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
> -                                       ptrdiff_t stride);
> -void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
> -                                        ptrdiff_t stride);
> +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
Note that these have been amended to include "const" on the coeffs 
parameter recently.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* [FFmpeg-devel] [PATCH v3] lavc/aarch64: hevc_add_res add 12bit variants
  2022-08-16 11:38         ` Martin Storsjö
@ 2022-08-16 12:12           ` J. Dekker
  2022-08-16 12:46             ` Martin Storsjö
  0 siblings, 1 reply; 12+ messages in thread
From: J. Dekker @ 2022-08-16 12:12 UTC (permalink / raw)
  To: ffmpeg-devel
hevc_add_res_4x4_12_c: 46.0
hevc_add_res_4x4_12_neon: 18.7
hevc_add_res_8x8_12_c: 194.7
hevc_add_res_8x8_12_neon: 25.2
hevc_add_res_16x16_12_c: 716.0
hevc_add_res_16x16_12_neon: 69.7
hevc_add_res_32x32_12_c: 3820.7
hevc_add_res_32x32_12_neon: 261.0
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 156 ++++++++++++----------
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
 2 files changed, 105 insertions(+), 85 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 484eea8437..97c51e06e3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -37,11 +37,11 @@ const trans, align=4
         .short          31, 22, 13, 4
 endconst
 
-.macro clip10 in1, in2, c1, c2
-        smax            \in1, \in1, \c1
-        smax            \in2, \in2, \c1
-        smin            \in1, \in1, \c2
-        smin            \in2, \in2, \c2
+.macro clip2 in1, in2, min, max
+        smax            \in1, \in1, \min
+        smax            \in2, \in2, \min
+        smin            \in1, \in1, \max
+        smin            \in2, \in2, \max
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_4x4_10_neon, export=1
-        mov             x12,  x0
-        ld1             {v0.8h-v1.8h}, [x1]
-        ld1             {v2.d}[0], [x12], x2
-        ld1             {v2.d}[1], [x12], x2
-        ld1             {v3.d}[0], [x12], x2
-        sqadd           v0.8h, v0.8h, v2.8h
-        ld1             {v3.d}[1], [x12], x2
-        movi            v4.8h, #0
-        sqadd           v1.8h, v1.8h, v3.8h
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.d}[0], [x0],  x2
-        st1             {v0.d}[1], [x0],  x2
-        st1             {v1.d}[0], [x0],  x2
-        st1             {v1.d}[1], [x0],  x2
-        ret
-endfunc
-
 function ff_hevc_add_residual_8x8_8_neon, export=1
         add             x12, x0, x2
         add             x2, x2, x2
@@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_8x8_10_neon, export=1
-        add             x12, x0, x2
-        add             x2,  x2, x2
-        mov             x3,  #8
-        movi            v4.8h, #0
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs            x3,  x3, #2
-        ld1             {v0.8h-v1.8h}, [x1], #32
-        ld1             {v2.8h}, [x0]
-        sqadd           v0.8h, v0.8h, v2.8h
-        ld1             {v3.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v3.8h
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.8h}, [x0],  x2
-        st1             {v1.8h}, [x12], x2
-        bne             1b
-        ret
-endfunc
-
 function ff_hevc_add_residual_16x16_8_neon, export=1
         mov             x3,  #16
         add             x12, x0, x2
@@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_16x16_10_neon, export=1
-        mov             x3,  #16
-        movi            v20.8h, #0
-        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
-        add             x12,  x0, x2
-        add             x2,  x2, x2
-1:      subs            x3,  x3, #2
-        ld1             {v16.8h-v17.8h}, [x0]
-        ld1             {v0.8h-v3.8h},   [x1], #64
-        sqadd           v0.8h, v0.8h, v16.8h
-        ld1             {v18.8h-v19.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v17.8h
-        sqadd           v2.8h, v2.8h, v18.8h
-        sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
-        st1             {v0.8h-v1.8h}, [x0],  x2
-        st1             {v2.8h-v3.8h}, [x12], x2
-        bne             1b
-        ret
-endfunc
-
 function ff_hevc_add_residual_32x32_8_neon, export=1
         add             x12,  x0, x2
         add             x2,  x2, x2
@@ -209,10 +149,88 @@ function ff_hevc_add_residual_32x32_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_32x32_10_neon, export=1
+.macro add_res bitdepth
+function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+        b               hevc_add_residual_4x4_16_neon
+endfunc
+function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+        b               hevc_add_residual_8x8_16_neon
+endfunc
+function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+        b               hevc_add_residual_16x16_16_neon
+endfunc
+function ff_hevc_add_residual_32x32_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
+        b               hevc_add_residual_32x32_16_neon
+endfunc
+.endm
+
+add_res 10
+add_res 12
+
+function hevc_add_residual_4x4_16_neon, export=0
+        mov             x12,  x0
+        ld1             {v0.8h-v1.8h}, [x1]
+        ld1             {v2.d}[0], [x12], x2
+        ld1             {v2.d}[1], [x12], x2
+        ld1             {v3.d}[0], [x12], x2
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.d}[1], [x12], x2
+        movi            v4.8h, #0
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip2           v0.8h, v1.8h, v4.8h, v21.8h
+        st1             {v0.d}[0], [x0],  x2
+        st1             {v0.d}[1], [x0],  x2
+        st1             {v1.d}[0], [x0],  x2
+        st1             {v1.d}[1], [x0],  x2
+        ret
+endfunc
+
+function hevc_add_residual_8x8_16_neon, export=0
+        add             x12, x0, x2
+        add             x2,  x2, x2
+        mov             x3,  #8
+        movi            v4.8h, #0
+1:      subs            x3,  x3, #2
+        ld1             {v0.8h-v1.8h}, [x1], #32
+        ld1             {v2.8h}, [x0]
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.8h}, [x12]
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip2           v0.8h, v1.8h, v4.8h, v21.8h
+        st1             {v0.8h}, [x0],  x2
+        st1             {v1.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function hevc_add_residual_16x16_16_neon, export=0
+        mov             x3,  #16
+        movi            v20.8h, #0
+        add             x12,  x0, x2
+        add             x2,  x2, x2
+1:      subs            x3,  x3, #2
+        ld1             {v16.8h-v17.8h}, [x0]
+        ld1             {v0.8h-v3.8h},   [x1], #64
+        sqadd           v0.8h, v0.8h, v16.8h
+        ld1             {v18.8h-v19.8h}, [x12]
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip2           v0.8h, v1.8h, v20.8h, v21.8h
+        clip2           v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v1.8h}, [x0],  x2
+        st1             {v2.8h-v3.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function hevc_add_residual_32x32_16_neon, export=0
         mov             x3, #32
         movi            v20.8h, #0
-        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
 1:      subs            x3, x3, #1
         ld1             {v0.8h -v3.8h},  [x1], #64
         ld1             {v16.8h-v19.8h}, [x0]
@@ -220,8 +238,8 @@ function ff_hevc_add_residual_32x32_10_neon, export=1
         sqadd           v1.8h, v1.8h, v17.8h
         sqadd           v2.8h, v2.8h, v18.8h
         sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        clip2           v0.8h, v1.8h, v20.8h, v21.8h
+        clip2           v2.8h, v3.8h, v20.8h, v21.8h
         st1             {v0.8h-v3.8h}, [x0], x2
         bne             1b
         ret
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 9cbe983870..b6d5efb77f 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,22 +25,18 @@
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/hevcdsp.h"
 
-void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, const int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, const int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, const int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, const int16_t *coeffs,
-                                        ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, const int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs,
-                                        ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -100,4 +96,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_10_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
+    if (bit_depth == 12) {
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_12_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_12_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_12_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_12_neon;
+    }
 }
-- 
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3] lavc/aarch64: hevc_add_res add 12bit variants
  2022-08-16 12:12           ` [FFmpeg-devel] [PATCH v3] " J. Dekker
@ 2022-08-16 12:46             ` Martin Storsjö
  2022-08-18 13:07               ` J. Dekker
  0 siblings, 1 reply; 12+ messages in thread
From: Martin Storsjö @ 2022-08-16 12:46 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
On Tue, 16 Aug 2022, J. Dekker wrote:
> hevc_add_res_4x4_12_c: 46.0
> hevc_add_res_4x4_12_neon: 18.7
> hevc_add_res_8x8_12_c: 194.7
> hevc_add_res_8x8_12_neon: 25.2
> hevc_add_res_16x16_12_c: 716.0
> hevc_add_res_16x16_12_neon: 69.7
> hevc_add_res_32x32_12_c: 3820.7
> hevc_add_res_32x32_12_neon: 261.0
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 156 ++++++++++++----------
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
> 2 files changed, 105 insertions(+), 85 deletions(-)
Thanks, this version seems fine to me.
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 9cbe983870..b6d5efb77f 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -25,22 +25,18 @@
> #include "libavutil/aarch64/cpu.h"
> #include "libavcodec/hevcdsp.h"
>
> -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
> -                                     ptrdiff_t stride);
> +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
The joined forms of these lines end up a bit long, while they previously 
did fit below the 80 column soft-limit, so IMO I'd prefer to keep them 
wrapped - but it's not a big deal. (I guess it made more sense to join the 
lines before the 'const' was added.)
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3] lavc/aarch64: hevc_add_res add 12bit variants
  2022-08-16 12:46             ` Martin Storsjö
@ 2022-08-18 13:07               ` J. Dekker
  0 siblings, 0 replies; 12+ messages in thread
From: J. Dekker @ 2022-08-18 13:07 UTC (permalink / raw)
  To: FFmpeg development discussions and patches
On 16 Aug 2022, at 14:46, Martin Storsjö wrote:
> On Tue, 16 Aug 2022, J. Dekker wrote:
>
>> hevc_add_res_4x4_12_c: 46.0
>> hevc_add_res_4x4_12_neon: 18.7
>> hevc_add_res_8x8_12_c: 194.7
>> hevc_add_res_8x8_12_neon: 25.2
>> hevc_add_res_16x16_12_c: 716.0
>> hevc_add_res_16x16_12_neon: 69.7
>> hevc_add_res_32x32_12_c: 3820.7
>> hevc_add_res_32x32_12_neon: 261.0
>>
>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>> ---
>>
>> libavcodec/aarch64/hevcdsp_idct_neon.S    | 156 ++++++++++++----------
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
>> 2 files changed, 105 insertions(+), 85 deletions(-)
>
> Thanks, this version seems fine to me.
>
>> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> index 9cbe983870..b6d5efb77f 100644
>> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> @@ -25,22 +25,18 @@
>> #include "libavutil/aarch64/cpu.h"
>> #include "libavcodec/hevcdsp.h"
>>
>> -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs,
>> -                                     ptrdiff_t stride);
>> +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
>
> The joined forms of these lines end up a bit long, while they previously did fit below the 80 column soft-limit, so IMO I'd prefer to keep them wrapped - but it's not a big deal. (I guess it made more sense to join the lines before the 'const' was added.)
>
> // Martin
Pushed with these changes (entire set now).
Thanks,
-- 
jd
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply	[flat|nested] 12+ messages in thread
end of thread, other threads:[~2022-08-18 13:07 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-23 18:04 [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test J. Dekker
2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: reformat add_res funcs J. Dekker
2022-08-09 11:04   ` Martin Storsjö
2022-06-23 18:04 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: hevc_add_res add 12bit variants J. Dekker
2022-08-09 11:13   ` Martin Storsjö
2022-08-09 11:21     ` Martin Storsjö
2022-08-16  5:01       ` [FFmpeg-devel] [PATCH v2] " J. Dekker
2022-08-16 11:38         ` Martin Storsjö
2022-08-16 12:12           ` [FFmpeg-devel] [PATCH v3] " J. Dekker
2022-08-16 12:46             ` Martin Storsjö
2022-08-18 13:07               ` J. Dekker
2022-08-09 11:02 ` [FFmpeg-devel] [PATCH 1/3] checkasm/hevc_add_res: add 12bit test Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git