* [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_hv
@ 2023-08-26 8:49 Logan.Lyu
0 siblings, 0 replies; only message in thread
From: Logan.Lyu @ 2023-08-26 8:49 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: jb, jdek
checkasm bench:
put_hevc_epel_uni_hv4_8_c: 204.7
put_hevc_epel_uni_hv4_8_i8mm: 70.2
put_hevc_epel_uni_hv6_8_c: 378.2
put_hevc_epel_uni_hv6_8_i8mm: 131.9
put_hevc_epel_uni_hv8_8_c: 637.7
put_hevc_epel_uni_hv8_8_i8mm: 137.9
put_hevc_epel_uni_hv12_8_c: 1301.9
put_hevc_epel_uni_hv12_8_i8mm: 314.2
put_hevc_epel_uni_hv16_8_c: 2203.4
put_hevc_epel_uni_hv16_8_i8mm: 454.7
put_hevc_epel_uni_hv24_8_c: 4848.2
put_hevc_epel_uni_hv24_8_i8mm: 1065.2
put_hevc_epel_uni_hv32_8_c: 8517.4
put_hevc_epel_uni_hv32_8_i8mm: 1898.4
put_hevc_epel_uni_hv48_8_c: 19591.7
put_hevc_epel_uni_hv48_8_i8mm: 4107.2
put_hevc_epel_uni_hv64_8_c: 33880.2
put_hevc_epel_uni_hv64_8_i8mm: 6568.7
Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logon Lyu <Logan.Lyu@myais.com.cn>
---
libavcodec/aarch64/hevcdsp_epel_neon.S | 344 ++++++++++++++++++++--
libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 +
2 files changed, 328 insertions(+), 21 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 7ce7eec829..4ad1b67081 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -54,6 +54,29 @@ endconst
umlsl2 \dst\().8h, \src3\().16b, v3.16b
.endm
+.macro load_epel_filterh freg, xreg
+ movrel \xreg, epel_filters
+ add \xreg, \xreg, \freg, lsl #2
+ ld1 {v0.8b}, [\xreg]
+ sxtl v0.8h, v0.8b
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+ smull \dst\().4s, \src0\().4h, v0.h[0]
+ smlal \dst\().4s, \src1\().4h, v0.h[1]
+ smlal \dst\().4s, \src2\().4h, v0.h[2]
+ smlal \dst\().4s, \src3\().4h, v0.h[3]
+ sqshrn \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+ smull2 \tmp\().4s, \src0\().8h, v0.h[0]
+ smlal2 \tmp\().4s, \src1\().8h, v0.h[1]
+ smlal2 \tmp\().4s, \src2\().8h, v0.h[2]
+ smlal2 \tmp\().4s, \src3\().8h, v0.h[3]
+ sqshrn2 \dst\().8h, \tmp\().4s, #6
+.endm
+
.macro calc_all4
calc v16, v17, v18, v19
b.eq 2f
@@ -696,6 +719,306 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm,
export=1
ret
endfunc
+function ff_hevc_put_hevc_epel_uni_hv4_8_neon_i8mm, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]
+ stp x4, x6, [sp, #-32]
+ stp xzr, x30, [sp, #-48]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+ ldp xzr, x30, [sp]
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.4h}, [sp], x10
+ ld1 {v17.4h}, [sp], x10
+ ld1 {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().4h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ subs w4, w4, #1
+ st1 {v4.s}[0], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv6_8_neon_i8mm, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]
+ stp x4, x6, [sp, #-32]
+ stp xzr, x30, [sp, #-48]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+ ldp xzr, x30, [sp]
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ load_epel_filterh x6, x5
+ sub x1, x1, #4
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ st1 {v4.s}[0], [x0], #4
+ subs w4, w4, #1
+ st1 {v4.h}[2], [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv8_8_neon_i8mm, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]
+ stp x4, x6, [sp, #-32]
+ stp xzr, x30, [sp, #-48]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+ ldp xzr, x30, [sp]
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h}, [sp], x10
+ ld1 {v17.8h}, [sp], x10
+ ld1 {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+ ld1 {\src3\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src1, \src2, \src3
+ calc_epelh2 v4, v5, \src0, \src1, \src2, \src3
+ sqrshrun v4.8b, v4.8h, #6
+ subs w4, w4, #1
+ st1 {v4.8b}, [x0], x1
+.endm
+1: calc_all4
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv12_8_neon_i8mm, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]
+ stp x4, x6, [sp, #-32]
+ stp xzr, x30, [sp, #-48]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+ ldp xzr, x30, [sp]
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ load_epel_filterh x6, x5
+ sub x1, x1, #8
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ st1 {v4.8b}, [x0], #8
+ st1 {v4.s}[2], [x0], x1
+ subs w4, w4, #1
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]
+ stp x4, x6, [sp, #-32]
+ stp xzr, x30, [sp, #-48]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+ ldp xzr, x30, [sp]
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h}, [sp], x10
+ ld1 {v18.8h, v19.8h}, [sp], x10
+ ld1 {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+ ld1 {\src6\().8h, \src7\().8h}, [sp], x10
+ calc_epelh v4, \src0, \src2, \src4, \src6
+ calc_epelh2 v4, v5, \src0, \src2, \src4, \src6
+ calc_epelh v5, \src1, \src3, \src5, \src7
+ calc_epelh2 v5, v6, \src1, \src3, \src5, \src7
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun2 v4.16b, v5.8h, #6
+ subs w4, w4, #1
+ st1 {v4.16b}, [x0], x1
+.endm
+1: calc_all8
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv24_8_neon_i8mm, export=1
+ add w10, w4, #3
+ lsl x10, x10, #7
+ sub sp, sp, x10 // tmp_array
+ stp x0, x1, [sp, #-16]
+ stp x4, x6, [sp, #-32]
+ stp xzr, x30, [sp, #-48]!
+ add x0, sp, #48
+ sub x1, x2, x3
+ mov x2, x3
+ add w3, w4, #3
+ mov x4, x5
+ bl X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+ ldp xzr, x30, [sp]
+ ldp x4, x6, [sp, #16]
+ ldp x0, x1, [sp, #32]
+ add sp, sp, #48
+ load_epel_filterh x6, x5
+ mov x10, #(MAX_PB_SIZE * 2)
+ ld1 {v16.8h, v17.8h, v18.8h}, [sp], x10
+ ld1 {v19.8h, v20.8h, v21.8h}, [sp], x10
+ ld1 {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9,
src10, src11
+ ld1 {\src9\().8h, \src10\().8h, \src11\().8h},
[sp], x10
+ calc_epelh v4, \src0, \src3, \src6, \src9
+ calc_epelh2 v4, v5, \src0, \src3, \src6, \src9
+ calc_epelh v5, \src1, \src4, \src7, \src10
+ calc_epelh2 v5, v6, \src1, \src4, \src7, \src10
+ calc_epelh v6, \src2, \src5, \src8, \src11
+ calc_epelh2 v6, v7, \src2, \src5, \src8, \src11
+ sqrshrun v4.8b, v4.8h, #6
+ sqrshrun v5.8b, v5.8h, #6
+ sqrshrun v6.8b, v6.8h, #6
+ subs w4, w4, #1
+ st1 {v4.8b, v5.8b, v6.8b}, [x0], x1
+.endm
+1: calc_all12
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv32_8_neon_i8mm, export=1
+ stp x0, x30, [sp, #-16]
+ stp x1, x2, [sp, #-32]
+ stp x3, x4, [sp, #-48]
+ stp x5, x6, [sp, #-64]!
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #16
+ add x2, x2, #16
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+ ldr x30, [sp, #56]
+ add sp, sp, #64
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv48_8_neon_i8mm, export=1
+ stp x0, x30, [sp, #-16]
+ stp x1, x2, [sp, #-32]
+ stp x3, x4, [sp, #-48]
+ stp x5, x6, [sp, #-64]!
+ mov x7, #24
+ bl X(ff_hevc_put_hevc_epel_uni_hv24_8_neon_i8mm)
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #24
+ add x2, x2, #24
+ mov x7, #24
+ bl X(ff_hevc_put_hevc_epel_uni_hv24_8_neon_i8mm)
+ ldr x30, [sp, #56]
+ add sp, sp, #64
+ ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv64_8_neon_i8mm, export=1
+ stp x0, x30, [sp, #-16]
+ stp x1, x2, [sp, #-32]
+ stp x3, x4, [sp, #-48]
+ stp x5, x6, [sp, #-64]!
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #16
+ add x2, x2, #16
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #32
+ add x2, x2, #32
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+ ldp x5, x6, [sp]
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp, #32]
+ ldr x0, [sp, #48]
+ add x0, x0, #48
+ add x2, x2, #48
+ mov x7, #16
+ bl X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+ ldr x30, [sp, #56]
+ add sp, sp, #64
+ ret
+endfunc
+
.macro EPEL_UNI_W_H_HEADER
ldr x12, [sp]
sub x2, x2, #1
@@ -1138,28 +1461,7 @@ endfunc
sqxtn2 v6.8h, v31.4s
.endm
-.macro calc_epelh dst, src0, src1, src2, src3
- smull \dst\().4s, \src0\().4h, v0.h[0]
- smlal \dst\().4s, \src1\().4h, v0.h[1]
- smlal \dst\().4s, \src2\().4h, v0.h[2]
- smlal \dst\().4s, \src3\().4h, v0.h[3]
- sqshrn \dst\().4h, \dst\().4s, #6
-.endm
-
-.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
- smull2 \tmp\().4s, \src0\().8h, v0.h[0]
- smlal2 \tmp\().4s, \src1\().8h, v0.h[1]
- smlal2 \tmp\().4s, \src2\().8h, v0.h[2]
- smlal2 \tmp\().4s, \src3\().8h, v0.h[3]
- sqshrn2 \dst\().8h, \tmp\().4s, #6
-.endm
-.macro load_epel_filterh freg, xreg
- movrel \xreg, epel_filters
- add \xreg, \xreg, \freg, lsl #2
- ld1 {v0.8b}, [\xreg]
- sxtl v0.8h, v0.8b
-.endm
function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
epel_uni_w_hv_start
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index f1e167c50b..bf4e466af8 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -165,6 +165,10 @@ NEON8_FNPROTO(epel_uni_v, (uint8_t *dst, ptrdiff_t
dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(epel_uni_hv, (uint8_t *dst, ptrdiff_t _dststride,
+ const uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@@ -298,6 +302,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext
*c, const int bit_depth)
if (have_i8mm(cpu_flags)) {
NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h
,_i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h,
_i8mm);
--
2.38.0.windows.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-08-26 8:50 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-26 8:49 [FFmpeg-devel] [PATCH 2/4] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_hv Logan.Lyu
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git