* [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v
@ 2023-10-14 8:45 Logan.Lyu
2023-10-14 17:08 ` Michael Niedermayer
2023-10-22 13:29 ` Logan.Lyu
0 siblings, 2 replies; 6+ messages in thread
From: Logan.Lyu @ 2023-10-14 8:45 UTC (permalink / raw)
To: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 10960 bytes --]
checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9
Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 223 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 228 insertions(+)
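For context, the scalar code these routines replace is a plain 4-tap vertical
filter that writes 16-bit intermediates into a MAX_PB_SIZE-strided buffer. A
minimal C sketch of that behaviour for 8-bit input (simplified from the
reference in libavcodec/hevcdsp_template.c; the function name here is
illustrative):

    static void epel_v_8bit_sketch(int16_t *dst, const uint8_t *src,
                                   ptrdiff_t srcstride, int height,
                                   intptr_t mx, intptr_t my, int width)
    {
        const int8_t *filter = ff_hevc_epel_filters[my - 1]; // 4 chroma taps
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                dst[x] = filter[0] * src[x - srcstride] +
                         filter[1] * src[x] +
                         filter[2] * src[x + srcstride] +
                         filter[3] * src[x + 2 * srcstride];
            src += srcstride;
            dst += MAX_PB_SIZE; // intermediate buffer stride is fixed at 64
        }
    }

The NEON functions below keep the last three source rows in registers and
rotate them (calc_all4 and friends), so each output row costs one new row load
plus the multiply-accumulates done by calc_epelb/calc_epelb2.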
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
ret
endfunc
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr s16, [x1]
+ ldr s17, [x1 ,x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().s}[0], [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2 - 8)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ str q4, [x0]
+ subs w3, w3, #1
+ str d5, [x0, #16]
+ add x0, x0, x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb v5, \src1, \src4, \src7, \src10
+ calc_epelb v6, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().16b, \src7\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ calc_epelb v4, \src0, \src2, \src4, \src6
+ calc_epelb2 v5, \src0, \src2, \src4, \src6
+ calc_epelb v6, \src1, \src3, \src5, \src7
+ calc_epelb2 v7, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h-v7.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #64
+ ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
+ ld1 {v19.16b, v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b, v24.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb2 v5, \src0, \src3, \src6, \src9
+ calc_epelb v6, \src1, \src4, \src7, \src10
+ calc_epelb2 v7, \src1, \src4, \src7, \src10
+ calc_epelb v28, \src2, \src5, \src8, \src11
+ calc_epelb2 v29, \src2, \src5, \src8, \src11
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v28.8h-v29.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\src12\().16b-\src15\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_epelb v4, \src0, \src4, \src8, \src12
+ calc_epelb2 v5, \src0, \src4, \src8, \src12
+ calc_epelb v6, \src1, \src5, \src9, \src13
+ calc_epelb2 v7, \src1, \src5, \src9, \src13
+ calc_epelb v8, \src2, \src6, \src10, \src14
+ calc_epelb2 v9, \src2, \src6, \src10, \src14
+ calc_epelb v10, \src3, \src7, \src11, \src15
+ calc_epelb2 v11, \src3, \src7, \src11, \src15
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], #64
+.endm
+1: calc_all16
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
load_epel_filterb x6, x5
sub x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c377a7940..82e1623a67 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
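For readers less familiar with the dispatch side: put_hevc_epel is indexed as
[size_idx][!!my][!!mx], so the (1, 0) slot added here is the vertical-only
case, and NEON8_FNASSIGN fills every block-width variant at once. Roughly, the
new assignment expands to something like this (illustrative, not the literal
macro body):

    c->put_hevc_epel[1][1][0] = ff_hevc_put_hevc_epel_v4_8_neon;  // width 4
    c->put_hevc_epel[2][1][0] = ff_hevc_put_hevc_epel_v6_8_neon;  // width 6
    /* ... */
    c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_neon; // width 64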
--
2.38.0.windows.1
[-- Attachment #2: 0001-lavc-aarch64-new-optimization-for-8-bit-hevc_epel_v.patch --]
[-- Type: text/plain, Size: 11109 bytes --]
From dfaaddf97b86817bc7adb50fdf0d29634b365bb1 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 9 Sep 2023 16:50:29 +0800
Subject: [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v
checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 223 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 228 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
ret
endfunc
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr s16, [x1]
+ ldr s17, [x1 ,x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().s}[0], [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2 - 8)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ str q4, [x0]
+ subs w3, w3, #1
+ str d5, [x0, #16]
+ add x0, x0, x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb v5, \src1, \src4, \src7, \src10
+ calc_epelb v6, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().16b, \src7\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ calc_epelb v4, \src0, \src2, \src4, \src6
+ calc_epelb2 v5, \src0, \src2, \src4, \src6
+ calc_epelb v6, \src1, \src3, \src5, \src7
+ calc_epelb2 v7, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h-v7.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #64
+ ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
+ ld1 {v19.16b, v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b, v24.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb2 v5, \src0, \src3, \src6, \src9
+ calc_epelb v6, \src1, \src4, \src7, \src10
+ calc_epelb2 v7, \src1, \src4, \src7, \src10
+ calc_epelb v28, \src2, \src5, \src8, \src11
+ calc_epelb2 v29, \src2, \src5, \src8, \src11
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v28.8h-v29.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\src12\().16b-\src15\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_epelb v4, \src0, \src4, \src8, \src12
+ calc_epelb2 v5, \src0, \src4, \src8, \src12
+ calc_epelb v6, \src1, \src5, \src9, \src13
+ calc_epelb2 v7, \src1, \src5, \src9, \src13
+ calc_epelb v8, \src2, \src6, \src10, \src14
+ calc_epelb2 v9, \src2, \src6, \src10, \src14
+ calc_epelb v10, \src3, \src7, \src11, \src15
+ calc_epelb2 v11, \src3, \src7, \src11, \src15
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], #64
+.endm
+1: calc_all16
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
load_epel_filterb x6, x5
sub x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c377a7940..82e1623a67 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
--
2.38.0.windows.1
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v
2023-10-14 8:45 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v Logan.Lyu
@ 2023-10-14 17:08 ` Michael Niedermayer
2023-10-22 13:29 ` Logan.Lyu
1 sibling, 0 replies; 6+ messages in thread
From: Michael Niedermayer @ 2023-10-14 17:08 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 920 bytes --]
On Sat, Oct 14, 2023 at 04:45:39PM +0800, Logan.Lyu wrote:
[...]
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
> b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index b4ca1e4c20..e541db5430 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon,
> export=1
> ret
> endfunc
> +
> +function ff_hevc_put_hevc_epel_v4_8_neon, export=1
> + load_epel_filterb x5, x4
This is not a valid diff; some whitespace and newlines here are not as they should be.
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Rewriting code that is poorly written but fully understood is good.
Rewriting code that one doesnt understand is a sign that one is less smart
than the original author, trying to rewrite it will not make it better.
* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v
2023-10-14 8:45 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v Logan.Lyu
2023-10-14 17:08 ` Michael Niedermayer
@ 2023-10-22 13:29 ` Logan.Lyu
2023-10-22 17:18 ` Martin Storsjö
1 sibling, 1 reply; 6+ messages in thread
From: Logan.Lyu @ 2023-10-22 13:29 UTC (permalink / raw)
To: ffmpeg-devel, Martin Storsjö
Hi, Martin,
Could you please review these patches and let me know if any changes are needed?
Thanks.
Logan Lyu
On 2023/10/14 16:45, Logan.Lyu wrote:
> checkasm bench:
> put_hevc_epel_v4_8_c: 79.9
> put_hevc_epel_v4_8_neon: 25.7
> put_hevc_epel_v6_8_c: 151.4
> put_hevc_epel_v6_8_neon: 46.4
> put_hevc_epel_v8_8_c: 250.9
> put_hevc_epel_v8_8_neon: 41.7
> put_hevc_epel_v12_8_c: 542.7
> put_hevc_epel_v12_8_neon: 108.7
> put_hevc_epel_v16_8_c: 939.4
> put_hevc_epel_v16_8_neon: 169.2
> put_hevc_epel_v24_8_c: 2104.9
> put_hevc_epel_v24_8_neon: 307.9
> put_hevc_epel_v32_8_c: 3713.9
> put_hevc_epel_v32_8_neon: 524.2
> put_hevc_epel_v48_8_c: 8175.2
> put_hevc_epel_v48_8_neon: 1197.2
> put_hevc_epel_v64_8_c: 16049.4
> put_hevc_epel_v64_8_neon: 2094.9
>
> Co-Authored-By: J. Dekker <jdek@itanimul.li>
> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
> ---
> libavcodec/aarch64/hevcdsp_epel_neon.S | 223 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
> 2 files changed, 228 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index b4ca1e4c20..e541db5430 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
> ret
> endfunc
> +
> +function ff_hevc_put_hevc_epel_v4_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub x1, x1, x2
> + mov x10, #(MAX_PB_SIZE * 2)
> + ldr s16, [x1]
> + ldr s17, [x1 ,x2]
> + add x1, x1, x2, lsl #1
> + ld1 {v18.s}[0], [x1], x2
> +.macro calc src0, src1, src2, src3
> + ld1 {\src3\().s}[0], [x1], x2
> + movi v4.8h, #0
> + calc_epelb v4, \src0, \src1, \src2, \src3
> + subs w3, w3, #1
> + st1 {v4.4h}, [x0], x10
> +.endm
> +1: calc_all4
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v6_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub x1, x1, x2
> + mov x10, #(MAX_PB_SIZE * 2 - 8)
> + ldr d16, [x1]
> + ldr d17, [x1, x2]
> + add x1, x1, x2, lsl #1
> + ld1 {v18.8b}, [x1], x2
> +.macro calc src0, src1, src2, src3
> + ld1 {\src3\().8b}, [x1], x2
> + movi v4.8h, #0
> + calc_epelb v4, \src0, \src1, \src2, \src3
> + st1 {v4.d}[0], [x0], #8
> + subs w3, w3, #1
> + st1 {v4.s}[2], [x0], x10
> +.endm
> +1: calc_all4
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v8_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub x1, x1, x2
> + mov x10, #(MAX_PB_SIZE * 2)
> + ldr d16, [x1]
> + ldr d17, [x1, x2]
> + add x1, x1, x2, lsl #1
> + ld1 {v18.8b}, [x1], x2
> +.macro calc src0, src1, src2, src3
> + ld1 {\src3\().8b}, [x1], x2
> + movi v4.8h, #0
> + calc_epelb v4, \src0, \src1, \src2, \src3
> + subs w3, w3, #1
> + st1 {v4.8h}, [x0], x10
> +.endm
> +1: calc_all4
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v12_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub x1, x1, x2
> + mov x10, #(MAX_PB_SIZE * 2)
> + ldr q16, [x1]
> + ldr q17, [x1, x2]
> + add x1, x1, x2, lsl #1
> + ld1 {v18.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3
> + ld1 {\src3\().16b}, [x1], x2
> + movi v4.8h, #0
> + movi v5.8h, #0
> + calc_epelb v4, \src0, \src1, \src2, \src3
> + calc_epelb2 v5, \src0, \src1, \src2, \src3
> + str q4, [x0]
> + subs w3, w3, #1
> + str d5, [x0, #16]
> + add x0, x0, x10
> +.endm
> +1: calc_all4
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v16_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub x1, x1, x2
> + mov x10, #(MAX_PB_SIZE * 2)
> + ldr q16, [x1]
> + ldr q17, [x1, x2]
> + add x1, x1, x2, lsl #1
> + ld1 {v18.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3
> + ld1 {\src3\().16b}, [x1], x2
> + movi v4.8h, #0
> + movi v5.8h, #0
> + calc_epelb v4, \src0, \src1, \src2, \src3
> + calc_epelb2 v5, \src0, \src1, \src2, \src3
> + subs w3, w3, #1
> + st1 {v4.8h, v5.8h}, [x0], x10
> +.endm
> +1: calc_all4
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v24_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub x1, x1, x2
> + mov x10, #(MAX_PB_SIZE * 2)
> + ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
> + ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
> + ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
> +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
> + ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2
> + movi v4.8h, #0
> + movi v5.8h, #0
> + movi v6.8h, #0
> + calc_epelb v4, \src0, \src3, \src6, \src9
> + calc_epelb v5, \src1, \src4, \src7, \src10
> + calc_epelb v6, \src2, \src5, \src8, \src11
> + subs w3, w3, #1
> + st1 {v4.8h-v6.8h}, [x0], x10
> +.endm
> +1: calc_all12
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v32_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub x1, x1, x2
> + mov x10, #(MAX_PB_SIZE * 2)
> + ld1 {v16.16b, v17.16b}, [x1], x2
> + ld1 {v18.16b, v19.16b}, [x1], x2
> + ld1 {v20.16b, v21.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3, src4, src5, src6, src7
> + ld1 {\src6\().16b, \src7\().16b}, [x1], x2
> + movi v4.8h, #0
> + movi v5.8h, #0
> + movi v6.8h, #0
> + movi v7.8h, #0
> + calc_epelb v4, \src0, \src2, \src4, \src6
> + calc_epelb2 v5, \src0, \src2, \src4, \src6
> + calc_epelb v6, \src1, \src3, \src5, \src7
> + calc_epelb2 v7, \src1, \src3, \src5, \src7
> + subs w3, w3, #1
> + st1 {v4.8h-v7.8h}, [x0], x10
> +.endm
> +1: calc_all8
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v48_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub x1, x1, x2
> + mov x10, #64
> + ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
> + ld1 {v19.16b, v20.16b, v21.16b}, [x1], x2
> + ld1 {v22.16b, v23.16b, v24.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
> + ld1 {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2
> + movi v4.8h, #0
> + movi v5.8h, #0
> + movi v6.8h, #0
> + movi v7.8h, #0
> + movi v28.8h, #0
> + movi v29.8h, #0
> + calc_epelb v4, \src0, \src3, \src6, \src9
> + calc_epelb2 v5, \src0, \src3, \src6, \src9
> + calc_epelb v6, \src1, \src4, \src7, \src10
> + calc_epelb2 v7, \src1, \src4, \src7, \src10
> + calc_epelb v28, \src2, \src5, \src8, \src11
> + calc_epelb2 v29, \src2, \src5, \src8, \src11
> + st1 {v4.8h-v7.8h}, [x0], #64
> + subs w3, w3, #1
> + st1 {v28.8h-v29.8h}, [x0], x10
> +.endm
> +1: calc_all12
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v64_8_neon, export=1
> + load_epel_filterb x5, x4
> + sub sp, sp, #32
> + st1 {v8.8b-v11.8b}, [sp]
> + sub x1, x1, x2
> + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
> + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
> + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> + ld1 {\src12\().16b-\src15\().16b}, [x1], x2
> + movi v4.8h, #0
> + movi v5.8h, #0
> + movi v6.8h, #0
> + movi v7.8h, #0
> + movi v8.8h, #0
> + movi v9.8h, #0
> + movi v10.8h, #0
> + movi v11.8h, #0
> + calc_epelb v4, \src0, \src4, \src8, \src12
> + calc_epelb2 v5, \src0, \src4, \src8, \src12
> + calc_epelb v6, \src1, \src5, \src9, \src13
> + calc_epelb2 v7, \src1, \src5, \src9, \src13
> + calc_epelb v8, \src2, \src6, \src10, \src14
> + calc_epelb2 v9, \src2, \src6, \src10, \src14
> + calc_epelb v10, \src3, \src7, \src11, \src15
> + calc_epelb2 v11, \src3, \src7, \src11, \src15
> + st1 {v4.8h-v7.8h}, [x0], #64
> + subs w3, w3, #1
> + st1 {v8.8h-v11.8h}, [x0], #64
> +.endm
> +1: calc_all16
> +.purgem calc
> +2: ld1 {v8.8b-v11.8b}, [sp]
> + add sp, sp, #32
> + ret
> +endfunc
> +
> function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
> load_epel_filterb x6, x5
> sub x2, x2, x3
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 4c377a7940..82e1623a67 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
> const uint8_t *src, ptrdiff_t srcstride,
> int height, intptr_t mx, intptr_t my, int width),);
> +NEON8_FNPROTO(epel_v, (int16_t *dst,
> + const uint8_t *src, ptrdiff_t srcstride,
> + int height, intptr_t mx, intptr_t my, int width),);
> +
> NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
> const uint8_t *_src, ptrdiff_t _srcstride,
> int height, intptr_t mx, intptr_t my, int width),);
> @@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
> NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
> + NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
> NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
> NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
> NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v
2023-10-22 13:29 ` Logan.Lyu
@ 2023-10-22 17:18 ` Martin Storsjö
2023-10-26 8:30 ` Logan.Lyu
0 siblings, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2023-10-22 17:18 UTC (permalink / raw)
To: Logan.Lyu; +Cc: ffmpeg-devel
On Sun, 22 Oct 2023, Logan.Lyu wrote:
> Hi, Martin,
>
> Could you please review these patches and let me know if there are any
> changes needed.
Did you see the message from Michael on Oct 14th? Your patches have
corrupted whitespace and can't be applied. Earlier you've submitted some
patches as attached files, and those have been possible to apply.
Secondly, I just pushed some indentation cleanup for aarch64 assembly
yesterday. In case there are conflicts with your patches, please rebase
your patches before attempting to resubmit them, so they apply cleanly.
// Martin
* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v
2023-10-22 17:18 ` Martin Storsjö
@ 2023-10-26 8:30 ` Logan.Lyu
2023-10-31 12:17 ` Martin Storsjö
0 siblings, 1 reply; 6+ messages in thread
From: Logan.Lyu @ 2023-10-26 8:30 UTC (permalink / raw)
To: Martin Storsjö, michael; +Cc: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 1056 bytes --]
Hi,
I'm sorry that I missed Michael's message from Oct 14th due to my negligence.
I had also left out a commit that comes before these four, which is what
caused the corrupted-whitespace problem. I have now recreated these patches.
In addition, I rebased them so that they apply cleanly on the latest master
branch.
Please check again, thank you.
On 2023/10/23 1:18, Martin Storsjö wrote:
> On Sun, 22 Oct 2023, Logan.Lyu wrote:
>
>> Hi, Martin,
>>
>> Could you please review these patches and let me know if there are
>> any changes needed.
>
> Did you see the message from Michael on Oct 14th? Your patches have
> corrupted whitespace and can't be applied. Earlier you've submitted
> some patches as attached files, and those have been possible to apply.
>
> Secondly; I just pushed some indentation cleanup for aarch64 assembly
> yesterday. In case there are conflicts with your patches, please
> rebase your patches before attempting to resubmit them, so they apply
> cleanly.
>
> // Martin
>
[-- Attachment #2: 0002-lavc-aarch64-new-optimization-for-8-bit-hevc_epel_v.patch --]
[-- Type: text/plain, Size: 11110 bytes --]
From 443447657b8ea8684ab2687789b7f77845c83f3f Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Thu, 26 Oct 2023 09:15:24 +0800
Subject: [PATCH 2/5] lavc/aarch64: new optimization for 8-bit hevc_epel_v
checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 223 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 228 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0705213eed..363750ee7f 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
ret
endfunc
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr s16, [x1]
+ ldr s17, [x1 ,x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().s}[0], [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2 - 8)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8b}, [x1], x2
+ movi v4.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ str q4, [x0]
+ subs w3, w3, #1
+ str d5, [x0, #16]
+ add x0, x0, x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ calc_epelb v4, \src0, \src1, \src2, \src3
+ calc_epelb2 v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ ld1 {v22.8b, v23.8b, v24.8b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb v5, \src1, \src4, \src7, \src10
+ calc_epelb v6, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().16b, \src7\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ calc_epelb v4, \src0, \src2, \src4, \src6
+ calc_epelb2 v5, \src0, \src2, \src4, \src6
+ calc_epelb v6, \src1, \src3, \src5, \src7
+ calc_epelb2 v7, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h-v7.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub x1, x1, x2
+ mov x10, #64
+ ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
+ ld1 {v19.16b, v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b, v24.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v28.8h, #0
+ movi v29.8h, #0
+ calc_epelb v4, \src0, \src3, \src6, \src9
+ calc_epelb2 v5, \src0, \src3, \src6, \src9
+ calc_epelb v6, \src1, \src4, \src7, \src10
+ calc_epelb2 v7, \src1, \src4, \src7, \src10
+ calc_epelb v28, \src2, \src5, \src8, \src11
+ calc_epelb2 v29, \src2, \src5, \src8, \src11
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v28.8h-v29.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+ load_epel_filterb x5, x4
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\src12\().16b-\src15\().16b}, [x1], x2
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_epelb v4, \src0, \src4, \src8, \src12
+ calc_epelb2 v5, \src0, \src4, \src8, \src12
+ calc_epelb v6, \src1, \src5, \src9, \src13
+ calc_epelb2 v7, \src1, \src5, \src9, \src13
+ calc_epelb v8, \src2, \src6, \src10, \src14
+ calc_epelb2 v9, \src2, \src6, \src10, \src14
+ calc_epelb v10, \src3, \src7, \src11, \src15
+ calc_epelb2 v11, \src3, \src7, \src11, \src15
+ st1 {v4.8h-v7.8h}, [x0], #64
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], #64
+.endm
+1: calc_all16
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
load_epel_filterb x6, x5
sub x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c203d65d34..42aa76ddde 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
--
2.38.0.windows.1
[-- Attachment #3: 0003-lavc-aarch64-new-optimization-for-8-bit-hevc_epel_hv.patch --]
[-- Type: text/plain, Size: 13186 bytes --]
From 05ab9d331ec9afb63d799b4694c64fac2a616a7b Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Thu, 26 Oct 2023 09:17:36 +0800
Subject: [PATCH 3/5] lavc/aarch64: new optimization for 8-bit hevc_epel_hv
checkasm bench:
put_hevc_epel_hv4_8_c: 213.7
put_hevc_epel_hv4_8_i8mm: 59.4
put_hevc_epel_hv6_8_c: 350.9
put_hevc_epel_hv6_8_i8mm: 130.2
put_hevc_epel_hv8_8_c: 548.7
put_hevc_epel_hv8_8_i8mm: 136.9
put_hevc_epel_hv12_8_c: 1126.7
put_hevc_epel_hv12_8_i8mm: 302.2
put_hevc_epel_hv16_8_c: 1925.2
put_hevc_epel_hv16_8_i8mm: 459.9
put_hevc_epel_hv24_8_c: 4301.9
put_hevc_epel_hv24_8_i8mm: 1024.9
put_hevc_epel_hv32_8_c: 7509.2
put_hevc_epel_hv32_8_i8mm: 1680.4
put_hevc_epel_hv48_8_c: 16566.9
put_hevc_epel_hv48_8_i8mm: 3945.4
put_hevc_epel_hv64_8_c: 29134.2
put_hevc_epel_hv64_8_i8mm: 6567.7
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 265 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 270 insertions(+)
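These hv functions are a two-pass composition: each one calls the matching
epel_h helper to produce height + 3 rows of 16-bit intermediates in a stack
buffer (the assembly reserves (height + 3) * 128 bytes, i.e. MAX_PB_SIZE
int16_t per row), then runs the 4-tap vertical filter on that buffer with
calc_epelh/calc_epelh2. In C the shape is roughly the following sketch
(8-bit depth, hedged, not the exact reference code):

    static void epel_hv_8bit_sketch(int16_t *dst, const uint8_t *src,
                                    ptrdiff_t srcstride, int height,
                                    intptr_t mx, intptr_t my, int width)
    {
        int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
        int16_t *tmp = tmp_array;
        const int8_t *fh = ff_hevc_epel_filters[mx - 1];
        const int8_t *fv = ff_hevc_epel_filters[my - 1];

        src -= srcstride;                      // one extra row above the block
        for (int y = 0; y < height + 3; y++) { // pass 1: horizontal, 16-bit out
            for (int x = 0; x < width; x++)
                tmp[x] = fh[0] * src[x - 1] + fh[1] * src[x] +
                         fh[2] * src[x + 1] + fh[3] * src[x + 2];
            src += srcstride;
            tmp += MAX_PB_SIZE;
        }

        tmp = tmp_array + MAX_PB_SIZE;         // pass 2: vertical on int16_t
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                dst[x] = (fv[0] * tmp[x - MAX_PB_SIZE] +
                          fv[1] * tmp[x] +
                          fv[2] * tmp[x + MAX_PB_SIZE] +
                          fv[3] * tmp[x + 2 * MAX_PB_SIZE]) >> 6;
            tmp += MAX_PB_SIZE;
            dst += MAX_PB_SIZE;
        }
    }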
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 363750ee7f..708b903b00 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -1018,6 +1018,271 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
ret
endfunc
+
+function ff_hevc_put_hevc_epel_hv4_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr d16, [sp]
+ ldr d17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().4h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.4h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv6_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x5, #120
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [sp]
+ ldr q17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x5
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv8_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ldr q16, [sp]
+ ldr q17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x10
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x5, #112
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ st1 {v4.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v5.4h}, [x0], x5
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv16_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ calc_epelh2 v5, v6, \src1, \src3, \src5, \src7
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv24_8_neon_i8mm, export=1
+ add w10, w3, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_epel_filterh x5, x4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+ ld1 {\src9\().8h-\src11\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src3, \src6, \src9
+ calc_epelh2 v4, v5, \src0, \src3, \src6, \src9
+ calc_epelh v5, \src1, \src4, \src7, \src10
+ calc_epelh2 v5, v6, \src1, \src4, \src7, \src10
+ calc_epelh v6, \src2, \src5, \src8, \src11
+ calc_epelh2 v6, v7, \src2, \src5, \src8, \src11
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x10
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv32_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv48_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #24
+ bl X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #48
+ add x1, x1, #24
+ mov x6, #24
+ bl X(ff_hevc_put_hevc_epel_hv24_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv64_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add x0, x0, #64
+ add x1, x1, #32
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x0, x0, #96
+ add x1, x1, #48
+ mov x6, #16
+ bl X(ff_hevc_put_hevc_epel_hv16_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_hv4_8_neon_i8mm, export=1
add w10, w4, #3
lsl x10, x10, #7
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 42aa76ddde..e54d8d7b1e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -191,6 +191,10 @@ NEON8_FNPROTO(epel_h, (int16_t *dst,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO(epel_hv, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@@ -322,6 +326,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
if (have_i8mm(cpu_flags)) {
NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
--
2.38.0.windows.1
[-- Attachment #4: 0004-lavc-aarch64-new-optimization-for-8-bit-hevc_qpel_v.patch --]
[-- Type: text/plain, Size: 18276 bytes --]
From 1c1ecf4bfa25573ff2ec719a100795c0d1ecdbed Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Thu, 26 Oct 2023 09:24:32 +0800
Subject: [PATCH 4/5] lavc/aarch64: new optimization for 8-bit hevc_qpel_v
checkasm bench:
put_hevc_qpel_v4_8_c: 138.1
put_hevc_qpel_v4_8_neon: 41.1
put_hevc_qpel_v6_8_c: 276.6
put_hevc_qpel_v6_8_neon: 60.9
put_hevc_qpel_v8_8_c: 478.9
put_hevc_qpel_v8_8_neon: 72.9
put_hevc_qpel_v12_8_c: 1072.6
put_hevc_qpel_v12_8_neon: 203.9
put_hevc_qpel_v16_8_c: 1852.1
put_hevc_qpel_v16_8_neon: 264.1
put_hevc_qpel_v24_8_c: 4137.6
put_hevc_qpel_v24_8_neon: 586.9
put_hevc_qpel_v32_8_c: 7579.1
put_hevc_qpel_v32_8_neon: 1036.6
put_hevc_qpel_v48_8_c: 16355.6
put_hevc_qpel_v48_8_neon: 2326.4
put_hevc_qpel_v64_8_c: 33545.1
put_hevc_qpel_v64_8_neon: 4126.4
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 347 +++++++++++++++++++---
2 files changed, 314 insertions(+), 38 deletions(-)
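The qpel vertical path mirrors the epel one from earlier in this series, only
with the 8-tap luma filter, so eight source rows have to stay live at a time;
that is why the wider variants below spill callee-saved NEON registers
(v8-v11) and rotate rows through the new calc_all/calc_all2 macros. A hedged
C sketch of the scalar behaviour for 8-bit input (function name illustrative):

    static void qpel_v_8bit_sketch(int16_t *dst, const uint8_t *src,
                                   ptrdiff_t srcstride, int height,
                                   intptr_t mx, intptr_t my, int width)
    {
        const int8_t *filter = ff_hevc_qpel_filters[my - 1]; // 8 luma taps
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int sum = 0;
                for (int i = 0; i < 8; i++)
                    sum += filter[i] * src[x + (i - 3) * srcstride];
                dst[x] = sum;
            }
            src += srcstride;
            dst += MAX_PB_SIZE;
        }
    }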
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index e54d8d7b1e..3f72861e29 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -204,6 +204,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -315,6 +319,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 95b96349ef..980d4540c5 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -112,6 +112,44 @@ endconst
.endif
.endm
+.macro calc_all
+ calc v23, v16, v17, v18, v19, v20, v21, v22, v23
+ b.eq 2f
+ calc v16, v17, v18, v19, v20, v21, v22, v23, v16
+ b.eq 2f
+ calc v17, v18, v19, v20, v21, v22, v23, v16, v17
+ b.eq 2f
+ calc v18, v19, v20, v21, v22, v23, v16, v17, v18
+ b.eq 2f
+ calc v19, v20, v21, v22, v23, v16, v17, v18, v19
+ b.eq 2f
+ calc v20, v21, v22, v23, v16, v17, v18, v19, v20
+ b.eq 2f
+ calc v21, v22, v23, v16, v17, v18, v19, v20, v21
+ b.eq 2f
+ calc v22, v23, v16, v17, v18, v19, v20, v21, v22
+ b.hi 1b
+.endm
+
+.macro calc_all2
+ calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+ b.eq 2f
+ calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+ b.eq 2f
+ calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+ b.eq 2f
+ calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+ b.eq 2f
+ calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+ b.eq 2f
+ calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+ b.eq 2f
+ calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+ b.eq 2f
+ calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+ b.hi 1b
+.endm
+
.macro put_hevc type
.ifc \type, qpel
// void put_hevc_qpel_h(int16_t *dst,
@@ -558,6 +596,277 @@ put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr s16, [x1]
+ ldr s17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s18, [x1]
+ ldr s19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s20, [x1]
+ ldr s21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr s22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().s}[0], [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], x9
+ subs w3, w3, #1
+ b.eq 2f
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 8)
+ sub x1, x1, x2
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d18, [x1]
+ ldr d19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d20, [x1]
+ ldr d21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.4h}, [x0], #8
+ st1 {v24.s}[2], [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr d16, [x1]
+ ldr d17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d18, [x1]
+ ldr d19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d20, [x1]
+ ldr d21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr d22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8b}, [x1], x2
+ movi v24.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], x9
+ subs w3, w3, #1
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2 - 16)
+ sub x1, x1, x2
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q18, [x1]
+ ldr q19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q20, [x1]
+ ldr q21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ st1 {v24.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v25.4h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ldr q16, [x1]
+ ldr q17, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q18, [x1]
+ ldr q19, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q20, [x1]
+ ldr q21, [x1, x2]
+ add x1, x1, x2, lsl #1
+ ldr q22, [x1]
+ add x1, x1, x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().16b}, [x1], x2
+ movi v24.8h, #0
+ movi v25.8h, #0
+ calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ subs w3, w3, #1
+ st1 {v24.8h, v25.8h}, [x0], x9
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+// todo: reads #32 bytes
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b, v9.8b, v10.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h, v9.8h, v10.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.8b, v9.8b, v10.8b}, [sp]
+ add sp, sp, #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ mov x9, #(MAX_PB_SIZE * 2)
+ sub x1, x1, x2
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ld1 {v18.16b, v19.16b}, [x1], x2
+ ld1 {v20.16b, v21.16b}, [x1], x2
+ ld1 {v22.16b, v23.16b}, [x1], x2
+ ld1 {v24.16b, v25.16b}, [x1], x2
+ ld1 {v26.16b, v27.16b}, [x1], x2
+ ld1 {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs w3, w3, #1
+ st1 {v8.8h-v11.8h}, [x0], x9
+.endm
+1: calc_all2
+.purgem calc
+2: ld1 {v8.8b-v11.8b}, [sp], #32
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+ stp x5, x30, [sp, #-16]
+ stp x0, x1, [sp, #-32]
+ stp x2, x3, [sp, #-48]!
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldp x2, x3, [sp]
+ ldp x0, x1, [sp, #16]
+ ldr x5, [sp, #32]
+ add sp, sp, #32
+ add x0, x0, #48
+ add x1, x1, #24
+ bl X(ff_hevc_put_hevc_qpel_v24_8_neon)
+ ldr x30, [sp, #8]
+ add sp, sp, #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+ sub sp, sp, #32
+ st1 {v8.8b-v11.8b}, [sp]
+ load_qpel_filterb x5, x4
+ sub x1, x1, x2, lsl #1
+ sub x1, x1, x2
+ mov x9, #(MAX_PB_SIZE * 2)
+0: mov x8, x1 // src
+ ld1 {v16.16b, v17.16b}, [x8], x2
+ mov w11, w3 // height
+ ld1 {v18.16b, v19.16b}, [x8], x2
+ mov x10, x0 // dst
+ ld1 {v20.16b, v21.16b}, [x8], x2
+ ld1 {v22.16b, v23.16b}, [x8], x2
+ ld1 {v24.16b, v25.16b}, [x8], x2
+ ld1 {v26.16b, v27.16b}, [x8], x2
+ ld1 {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().16b, \tmp1\().16b}, [x8], x2
+ movi v8.8h, #0
+ movi v9.8h, #0
+ movi v10.8h, #0
+ movi v11.8h, #0
+ calc_qpelb v8, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb2 v9, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+ calc_qpelb v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ calc_qpelb2 v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+ subs x11, x11, #1
+ st1 {v8.8h-v11.8h}, [x10], x9
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #64
+ add x1, x1, #32
+ subs w6, w6, #32
+ b.hi 0b
+ ld1 {v8.8b-v11.8b}, [sp], #32
+ ret
+endfunc
+
+
function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
1:
ldr s0, [x2]
@@ -663,25 +972,6 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
ret
endfunc
-.macro calc_all
- calc v23, v16, v17, v18, v19, v20, v21, v22, v23
- b.eq 2f
- calc v16, v17, v18, v19, v20, v21, v22, v23, v16
- b.eq 2f
- calc v17, v18, v19, v20, v21, v22, v23, v16, v17
- b.eq 2f
- calc v18, v19, v20, v21, v22, v23, v16, v17, v18
- b.eq 2f
- calc v19, v20, v21, v22, v23, v16, v17, v18, v19
- b.eq 2f
- calc v20, v21, v22, v23, v16, v17, v18, v19, v20
- b.eq 2f
- calc v21, v22, v23, v16, v17, v18, v19, v20, v21
- b.eq 2f
- calc v22, v23, v16, v17, v18, v19, v20, v21, v22
- b.hi 1b
-.endm
-
function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
load_qpel_filterb x6, x5
sub x2, x2, x3, lsl #1
@@ -1560,25 +1850,6 @@ endfunc
#if HAVE_I8MM
ENABLE_I8MM
-.macro calc_all2
- calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
- b.eq 2f
- calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
- b.eq 2f
- calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
- b.eq 2f
- calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
- b.eq 2f
- calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
- b.eq 2f
- calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
- b.eq 2f
- calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
- b.eq 2f
- calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
- b.hi 1b
-.endm
-
function ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm, export=1
add w10, w4, #7
lsl x10, x10, #7
--
2.38.0.windows.1
[-- Attachment #5: 0005-lavc-aarch64-new-optimization-for-8-bit-hevc_qpel_hv.patch --]
[-- Type: text/plain, Size: 15470 bytes --]
From 017913308cfd05237882080d93a11241527de345 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Thu, 26 Oct 2023 09:33:35 +0800
Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit hevc_qpel_hv
checkasm bench:
put_hevc_qpel_hv4_8_c: 422.1
put_hevc_qpel_hv4_8_i8mm: 101.6
put_hevc_qpel_hv6_8_c: 756.4
put_hevc_qpel_hv6_8_i8mm: 225.9
put_hevc_qpel_hv8_8_c: 1189.9
put_hevc_qpel_hv8_8_i8mm: 296.6
put_hevc_qpel_hv12_8_c: 2407.4
put_hevc_qpel_hv12_8_i8mm: 552.4
put_hevc_qpel_hv16_8_c: 4021.4
put_hevc_qpel_hv16_8_i8mm: 886.6
put_hevc_qpel_hv24_8_c: 8992.1
put_hevc_qpel_hv24_8_i8mm: 1968.9
put_hevc_qpel_hv32_8_c: 15197.9
put_hevc_qpel_hv32_8_i8mm: 3209.4
put_hevc_qpel_hv48_8_c: 32811.1
put_hevc_qpel_hv48_8_i8mm: 7442.1
put_hevc_qpel_hv64_8_c: 58106.1
put_hevc_qpel_hv64_8_i8mm: 12423.9
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 295 ++++++++++++++++++++++
2 files changed, 300 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 3f72861e29..c51488275c 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -208,6 +208,10 @@ NEON8_FNPROTO(qpel_v, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -335,6 +339,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 980d4540c5..8e257208e4 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -3062,6 +3062,301 @@ function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
ret
endfunc
+
+function ff_hevc_put_hevc_qpel_hv4_8_neon_i8mm, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr d16, [sp]
+ ldr d17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d18, [sp]
+ ldr d19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d20, [sp]
+ ldr d21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr d22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().4h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ subs w3, w3, #1
+ st1 {v1.4h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon_i8mm, export=1
+ add w10, w3, #7
+ mov x7, #128
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2, lsl #1
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x8, #120
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr q16, [sp]
+ ldr q17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q18, [sp]
+ ldr q19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q20, [sp]
+ ldr q21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ st1 {v1.4h}, [x0], #8
+ subs w3, w3, #1
+ st1 {v1.s}[2], [x0], x8
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ldr q16, [sp]
+ ldr q17, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q18, [sp]
+ ldr q19, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q20, [sp]
+ ldr q21, [sp, x7]
+ add sp, sp, x7, lsl #1
+ ldr q22, [sp]
+ add sp, sp, x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\tmp\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h}, [x0], x7
+.endm
+1: calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ add x3, x3, #7
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ mov x8, #112
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ st1 {v1.8h}, [x0], #16
+ subs w3, w3, #1
+ st1 {v2.4h}, [x0], x8
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon_i8mm, export=1
+ add w10, w3, #7
+ lsl x10, x10, #7
+ sub x1, x1, x2, lsl #1
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x3, x3, #7
+ add x0, sp, #32
+ sub x1, x1, x2
+ bl X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+ ld1 {v16.8h, v17.8h}, [sp], x7
+ ld1 {v18.8h, v19.8h}, [sp], x7
+ ld1 {v20.8h, v21.8h}, [sp], x7
+ ld1 {v22.8h, v23.8h}, [sp], x7
+ ld1 {v24.8h, v25.8h}, [sp], x7
+ ld1 {v26.8h, v27.8h}, [sp], x7
+ ld1 {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs w3, w3, #1
+ st1 {v1.8h, v2.8h}, [x0], x7
+.endm
+1: calc_all2
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ bl X(ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x1, x1, #12
+ add x0, x0, #24
+ bl X(ff_hevc_put_hevc_qpel_hv12_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm, export=1
+ add w10, w3, #7
+ sub x1, x1, x2, lsl #1
+ lsl x10, x10, #7
+ sub x1, x1, x2
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x3, x3, #7
+ add x0, sp, #32
+ bl X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
+ ldp x5, x30, [sp]
+ mov x7, #128
+ ldp x0, x3, [sp, #16]
+ add sp, sp, #32
+ load_qpel_filterh x5, x4
+0: mov x8, sp // src
+ ld1 {v16.8h, v17.8h}, [x8], x7
+ mov w9, w3 // height
+ ld1 {v18.8h, v19.8h}, [x8], x7
+ mov x5, x0 // dst
+ ld1 {v20.8h, v21.8h}, [x8], x7
+ ld1 {v22.8h, v23.8h}, [x8], x7
+ ld1 {v24.8h, v25.8h}, [x8], x7
+ ld1 {v26.8h, v27.8h}, [x8], x7
+ ld1 {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+ ld1 {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+ calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+ calc_qpelh2 v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+ calc_qpelh v2, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+ calc_qpelh2 v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+ subs x9, x9, #1
+ st1 {v1.8h, v2.8h}, [x5], x7
+.endm
+1: calc_all2
+.purgem calc
+2: add x0, x0, #32
+ add sp, sp, #32
+ subs w6, w6, #16
+ b.hi 0b
+ add w10, w3, #6
+ add sp, sp, #64 // discard rest of first line
+ lsl x10, x10, #7
+ add sp, sp, x10 // tmp_array without first line
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv48_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x1, x1, #24
+ add x0, x0, #48
+ bl X(ff_hevc_put_hevc_qpel_hv24_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon_i8mm, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+ ldp x4, x5, [sp]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ add x1, x1, #32
+ add x0, x0, #64
+ mov x6, #32
+ bl X(ff_hevc_put_hevc_qpel_hv32_8_neon_i8mm)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
.macro QPEL_UNI_W_HV_HEADER width
ldp x14, x15, [sp] // mx, my
ldr w13, [sp, #16] // width
--
2.38.0.windows.1
[-- Attachment #6: 0001-lavc-aarch64-new-optimization-for-8-bit-hevc_epel_pixels-and-and-hevc_qpel_pixels.patch --]
[-- Type: text/plain, Size: 6811 bytes --]
From bf41a4b3bda1cbc4c331a40dd4d42b8728ec039a Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Thu, 26 Oct 2023 09:11:09 +0800
Subject: [PATCH 1/5] lavc/aarch64: new optimization for 8-bit hevc_epel_pixels
and hevc_qpel_pixels
checkasm bench:
put_hevc_pel_pixels4_8_c: 33.7
put_hevc_pel_pixels4_8_neon: 20.2
put_hevc_pel_pixels6_8_c: 61.4
put_hevc_pel_pixels6_8_neon: 25.4
put_hevc_pel_pixels8_8_c: 121.4
put_hevc_pel_pixels8_8_neon: 16.9
put_hevc_pel_pixels12_8_c: 199.9
put_hevc_pel_pixels12_8_neon: 40.2
put_hevc_pel_pixels16_8_c: 355.9
put_hevc_pel_pixels16_8_neon: 43.4
put_hevc_pel_pixels24_8_c: 774.7
put_hevc_pel_pixels24_8_neon: 78.9
put_hevc_pel_pixels32_8_c: 1345.2
put_hevc_pel_pixels32_8_neon: 152.2
put_hevc_pel_pixels48_8_c: 2963.7
put_hevc_pel_pixels48_8_neon: 309.4
put_hevc_pel_pixels64_8_c: 5236.2
put_hevc_pel_pixels64_8_neon: 514.2
Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 112 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 ++
2 files changed, 118 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index e398e6ac9d..0705213eed 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -131,6 +131,118 @@ endconst
b.ne 1b
.endm
+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.s}[0], [x1], x2
+ ushll v4.8h, v0.8b, #6
+ subs w3, w3, #1
+ st1 {v4.d}[0], [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2 - 8)
+1: ld1 {v0.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ st1 {v4.d}[0], [x0], #8
+ subs w3, w3, #1
+ st1 {v4.s}[2], [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ subs w3, w3, #1
+ st1 {v4.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2 - 16)
+1: ld1 {v0.8b, v1.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ st1 {v4.8h}, [x0], #16
+ ushll v5.8h, v1.8b, #6
+ subs w3, w3, #1
+ st1 {v5.d}[0], [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b, v1.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll v5.8h, v1.8b, #6
+ subs w3, w3, #1
+ st1 {v4.8h, v5.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b-v2.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll v5.8h, v1.8b, #6
+ ushll v6.8h, v2.8b, #6
+ subs w3, w3, #1
+ st1 {v4.8h-v6.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE * 2)
+1: ld1 {v0.8b-v3.8b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll v5.8h, v1.8b, #6
+ ushll v6.8h, v2.8b, #6
+ ushll v7.8h, v3.8b, #6
+ subs w3, w3, #1
+ st1 {v4.8h-v7.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
+ mov x7, #(MAX_PB_SIZE)
+1: ld1 {v0.16b-v2.16b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll2 v5.8h, v0.16b, #6
+ ushll v6.8h, v1.8b, #6
+ ushll2 v7.8h, v1.16b, #6
+ st1 {v4.8h-v7.8h}, [x0], #64
+ ushll v16.8h, v2.8b, #6
+ ushll2 v17.8h, v2.16b, #6
+ subs w3, w3, #1
+ st1 {v16.8h-v17.8h}, [x0], x7
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
+1: ld1 {v0.16b-v3.16b}, [x1], x2
+ ushll v4.8h, v0.8b, #6
+ ushll2 v5.8h, v0.16b, #6
+ ushll v6.8h, v1.8b, #6
+ ushll2 v7.8h, v1.16b, #6
+ st1 {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE)
+ ushll v16.8h, v2.8b, #6
+ ushll2 v17.8h, v2.16b, #6
+ ushll v18.8h, v3.8b, #6
+ ushll2 v19.8h, v3.16b, #6
+ subs w3, w3, #1
+ st1 {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE)
+ b.ne 1b
+ ret
+endfunc
+
function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
load_epel_filterb x6, x5
sub x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index d5b973d2e0..c203d65d34 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -152,6 +152,10 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
void ff_hevc_put_hevc_##fn##64_8_neon##ext args
+NEON8_FNPROTO(pel_pixels, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width),);
+
NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, intptr_t mx, intptr_t my, int width),);
@@ -300,6 +304,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[8][0][1] =
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+ NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
--
2.38.0.windows.1
[-- Attachment #7: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
* Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v
2023-10-26 8:30 ` Logan.Lyu
@ 2023-10-31 12:17 ` Martin Storsjö
0 siblings, 0 replies; 6+ messages in thread
From: Martin Storsjö @ 2023-10-31 12:17 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, 26 Oct 2023, Logan.Lyu wrote:
> And I missed submitting a commit that was earlier than these four commits,
> which caused the corrupted whitespace problem. Now I have recreated these
> patches.
>
> In addition, I rebased it to ensure that these patches can be successfully
> applied on the latest master branch.
>
> Please check again, thank you.
Thanks, now these were possible to apply, and they looked mostly ok, so I
touched up the last details I noticed and pushed them.
Things I noticed and fixed before pushing:
A bunch of minor cosmetics: you had minor misindentation in a few places
(copy-pasted around into lots of other places), which I fixed like this:
ld1 {v18.16b}, [x1], x2
.macro calc src0, src1, src2, src3
- ld1 {\src3\().16b}, [x1], x2
+ ld1 {\src3\().16b}, [x1], x2
movi v4.8h, #0
movi v5.8h, #0
calc_epelb v4, \src0, \src1, \src2, \src3
@@ -461,7 +461,7 @@ function ff_hevc_put_hevc_epel_v64_8_neon, export=1
.endm
1: calc_all16
.purgem calc
-2: ld1 {v8.8b-v11.8b}, [sp]
+2: ld1 {v8.8b-v11.8b}, [sp]
add sp, sp, #32
ret
The first patch, with mostly small trivial functions, can probably be
scheduled better for in-order cores. I'll send a patch if I can make them
measurably faster.
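(As a purely illustrative sketch of the kind of reordering meant here, not the
actual follow-up patch: on in-order cores it usually helps to interleave
independent scalar work between the vector loads instead of issuing them back
to back. For the pixels8 loop, keeping the register roles of the submitted
function (x0 dst, x1 src, x2 srcstride, w3 height, x7 = MAX_PB_SIZE * 2), a
two-rows-per-iteration variant could look roughly like:

1:      ld1             {v0.8b}, [x1], x2       // row 0
        subs            w3, w3, #2              // scalar work hides the load latency
        ld1             {v1.8b}, [x1], x2       // row 1
        ushll           v4.8h, v0.8b, #6
        ushll           v5.8h, v1.8b, #6
        st1             {v4.8h}, [x0], x7
        st1             {v5.8h}, [x0], x7
        b.ne            1b

This assumes an even height; it is only meant to illustrate the scheduling
idea, not the exact code that would be pushed.)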
In almost every patch, you have loads/stores to the stack; you use the
fused stack decrement nicely everywhere possible, but for the loading,
you're almost always lacking the fused stack increment. I've fixed it now
for this patchset, but please do keep this in mind and fix it up before
submitting any further patches. The fix looks like this:
bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
- ldp x5, x30, [sp]
ldp x0, x3, [sp, #16]
- add sp, sp, #32
+ ldp x5, x30, [sp], #32
load_epel_filterh x5, x4
(In many places.)
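(For reference, a minimal sketch of the intended pairing, using the same frame
layout these functions already set up; the exact registers are illustrative:

        stp             x5, x30, [sp, #-32]!    // prologue: store with fused pre-decrement
        stp             x0, x3, [sp, #16]
        ...
        ldp             x0, x3, [sp, #16]
        ldp             x5, x30, [sp], #32      // epilogue: load with fused post-increment

so the separate "add sp, sp, #32" after the loads is no longer needed.)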
In one place, you wrote below the stack pointer before decrementing it.
That's ok on OSes with a defined red zone, but we shouldn't need to assume
that; I've fixed that like this:
function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
- stp x5, x30, [sp, #-16]
- stp x0, x1, [sp, #-32]
stp x2, x3, [sp, #-48]!
+ stp x0, x1, [sp, #16]
+ stp x5, x30, [sp, #32]
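(To make the hazard concrete, schematically rather than quoting the pushed
commit: memory below sp is not reserved on targets without a red zone, so an
asynchronous signal handler may reuse it at any time:

        stp             x5, x30, [sp, #-16]     // data written below sp ...
        // a signal delivered here is free to clobber [sp - 48, sp)
        stp             x2, x3, [sp, #-48]!     // ... the memory is only owned from this point

Decrementing sp first and then filling the frame upwards, as in the fix above,
avoids this.)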
I'll push the patchset with these changes soon.
// Martin
Thread overview: 6 messages
2023-10-14 8:45 [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v Logan.Lyu
2023-10-14 17:08 ` Michael Niedermayer
2023-10-22 13:29 ` Logan.Lyu
2023-10-22 17:18 ` Martin Storsjö
2023-10-26 8:30 ` Logan.Lyu
2023-10-31 12:17 ` Martin Storsjö