From: Logan.Lyu@myais.com.cn
To: ffmpeg-devel@ffmpeg.org
Cc: Logan Lyu <Logan.Lyu@myais.com.cn>
Subject: [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: new optimization for 8-bit hevc_epel_h
Date: Sun, 4 Jun 2023 12:17:55 +0800
Message-ID: <20230604041756.5196-4-Logan.Lyu@myais.com.cn> (raw)
In-Reply-To: <20230604041756.5196-1-Logan.Lyu@myais.com.cn>

From: Logan Lyu <Logan.Lyu@myais.com.cn>

Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 343 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +-
 2 files changed, 349 insertions(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 4841f49dab..32f052a7b1 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -33,6 +33,349 @@ const epel_filters, align=4
 endconst
 
 #if HAVE_I8MM
+
+.macro EPEL_H_HEADER                                // shared setup; args per the C prototype: x0=dst x1=_src x2=_srcstride w3=height x4=mx
+        movrel          x5, epel_filters
+        add             x5, x5, x4, lsl #2          // x5 = epel_filters + 4 * mx
+        ld1r            {v30.4s}, [x5]              // replicate that 32-bit entry (4 signed byte operands for usdot) into every lane
+        sub             x1, x1, #1                  // every row is read starting one byte before _src
+        mov             x10, #(MAX_PB_SIZE * 2)     // byte step applied to dst after each row
+.endm
+
+function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1     // 4 int16 outputs per row
+        EPEL_H_HEADER
+1:      ld1             {v4.8b}, [x1], x2           // 8 source bytes, then src += stride
+        ext             v5.8b, v4.8b, v4.8b, #1     // v5..v7: the row shifted by 1..3 bytes
+        ext             v6.8b, v4.8b, v4.8b, #2
+        ext             v7.8b, v4.8b, v4.8b, #3
+        trn1            v4.2s, v4.2s, v5.2s         // windows at byte 0 and 1
+        trn1            v6.2s, v6.2s, v7.2s         // windows at byte 2 and 3
+        trn1            v4.2d, v4.2d, v6.2d         // v4 = four 4-byte windows, one per output column
+        movi            v16.2d, #0                  // usdot accumulates, so start from zero
+        usdot           v16.4s, v4.16b, v30.16b     // per 32-bit lane: sum of 4 (unsigned byte * signed byte) products
+        xtn             v16.4h, v16.4s              // keep the low 16 bits of each sum
+        st1             {v16.4h}, [x0], x10
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1     // 6 int16 outputs per row
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.8b, v4.8b, v4.8b, #2
+        ext             v7.8b, v4.8b, v4.8b, #3
+        trn1            v16.2s, v4.2s, v5.2s        // windows for columns 0, 1
+        trn2            v17.2s, v4.2s, v5.2s        // windows for columns 4, 5
+        trn1            v6.2s, v6.2s, v7.2s         // windows for columns 2, 3
+        trn1            v16.2d, v16.2d, v6.2d       // v16 = windows for columns 0..3
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v18.4s, v16.16b, v30.16b
+        usdot           v19.2s, v17.8b, v30.8b      // only two lanes are needed for columns 4, 5
+        xtn             v18.4h, v18.4s
+        xtn             v19.4h, v19.4s
+        str             d18, [x0]                   // columns 0..3
+        str             s19, [x0, #8]               // columns 4..5
+        add             x0, x0, x10
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1     // 8 int16 outputs per row
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v4.16b, v4.16b, #3
+        zip1            v20.4s, v4.4s, v6.4s        // windows for even columns 0, 2, 4, 6
+        zip1            v21.4s, v5.4s, v7.4s        // windows for odd columns 1, 3, 5, 7
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b
+        usdot           v17.4s, v21.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn             v17.4h, v17.4s
+        st2             {v16.4h, v17.4h}, [x0], x10 // interleave even/odd back into column order
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1    // 12 int16 outputs per row
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v4.16b, v4.16b, #3
+        trn1            v20.2d, v4.2d, v6.2d        // 32-bit lanes: windows at 0, 4, 2, 6
+        trn2            v22.2d, v4.2d, v6.2d        // windows at 8, 12, 10, 14
+        trn1            v21.2d, v5.2d, v7.2d        // windows at 1, 5, 3, 7
+        trn2            v23.2d, v5.2d, v7.2d        // windows at 9, 13, 11, 15
+        trn1            v4.4s, v20.4s, v21.4s       // windows for columns 0..3
+        trn2            v5.4s, v20.4s, v21.4s       // windows for columns 4..7
+        trn1            v6.4s, v22.4s, v23.4s       // windows for columns 8..11
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v4.16b, v30.16b
+        usdot           v17.4s, v5.16b, v30.16b
+        usdot           v18.4s, v6.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v17.4s
+        xtn             v18.4h, v18.4s
+        str             q16, [x0]                   // columns 0..7
+        str             d18, [x0, #16]              // columns 8..11
+        add             x0, x0, x10
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1    // 16 int16 outputs per row
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        zip1            v20.4s, v0.4s, v6.4s        // windows for columns 0, 2, 4, 6
+        zip2            v22.4s, v0.4s, v6.4s        // windows for columns 8, 10, 12, 14
+        zip1            v21.4s, v5.4s, v7.4s        // windows for columns 1, 3, 5, 7
+        zip2            v23.4s, v5.4s, v7.4s        // windows for columns 9, 11, 13, 15
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b
+        usdot           v17.4s, v21.16b, v30.16b
+        usdot           v18.4s, v22.16b, v30.16b
+        usdot           v19.4s, v23.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v18.4s              // v16 = even columns 0..14
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v19.4s              // v17 = odd columns 1..15
+        st2             {v16.8h, v17.8h}, [x0], x10
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1    // 24 int16 outputs per row
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        ext             v26.16b, v1.16b, v1.16b, #1 // wraps within v1; only 32-bit lanes 0-1 of these results are stored
+        ext             v27.16b, v1.16b, v1.16b, #2
+        ext             v28.16b, v1.16b, v1.16b, #3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b     // columns 0, 4, 8, 12
+        usdot           v17.4s, v5.16b, v30.16b     // columns 1, 5, 9, 13
+        usdot           v18.4s, v6.16b, v30.16b     // columns 2, 6, 10, 14
+        usdot           v19.4s, v7.16b, v30.16b     // columns 3, 7, 11, 15
+        usdot           v20.4s, v1.16b, v30.16b     // columns 16, 20, ...
+        usdot           v21.4s, v26.16b, v30.16b    // columns 17, 21, ...
+        usdot           v22.4s, v27.16b, v30.16b    // columns 18, 22, ...
+        usdot           v23.4s, v28.16b, v30.16b    // columns 19, 23, ...
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v20.4s
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v21.4s
+        xtn             v18.4h, v18.4s
+        xtn2            v18.8h, v22.4s
+        xtn             v19.4h, v19.4s
+        xtn2            v19.8h, v23.4s
+        zip1            v20.8h, v16.8h, v18.8h      // even columns 0..14
+        zip1            v21.8h, v17.8h, v19.8h      // odd columns 1..15
+        zip2            v22.8h, v16.8h, v18.8h
+        zip2            v23.8h, v17.8h, v19.8h
+        zip1            v22.8h, v22.8h, v23.8h      // columns 16..23 in order
+        add             x7, x0, #32                 // address of column 16, taken before x0 is advanced
+        st2             {v20.8h, v21.8h}, [x0], x10 // columns 0..15
+        st1             {v22.8h}, [x7]              // columns 16..23
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1    // 32 int16 outputs per row
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b}, [x1], x2
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        ext             v26.16b, v1.16b, v2.16b, #1
+        ext             v27.16b, v1.16b, v2.16b, #2
+        ext             v28.16b, v1.16b, v2.16b, #3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b     // columns 0, 4, 8, 12
+        usdot           v17.4s, v5.16b, v30.16b     // columns 1, 5, 9, 13
+        usdot           v18.4s, v6.16b, v30.16b     // columns 2, 6, 10, 14
+        usdot           v19.4s, v7.16b, v30.16b     // columns 3, 7, 11, 15
+        usdot           v20.4s, v1.16b, v30.16b     // columns 16, 20, 24, 28
+        usdot           v21.4s, v26.16b, v30.16b    // columns 17, 21, 25, 29
+        usdot           v22.4s, v27.16b, v30.16b    // columns 18, 22, 26, 30
+        usdot           v23.4s, v28.16b, v30.16b    // columns 19, 23, 27, 31
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v20.4s
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v21.4s
+        xtn             v18.4h, v18.4s
+        xtn2            v18.8h, v22.4s
+        xtn             v19.4h, v19.4s
+        xtn2            v19.8h, v23.4s
+        st4             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10 // 4-way interleave restores column order 0..31
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1    // 48 int16 outputs per row
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
+        ext             v4.16b, v0.16b, v1.16b, #1
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b
+        usdot           v21.4s, v4.16b, v30.16b
+        usdot           v22.4s, v5.16b, v30.16b
+        usdot           v23.4s, v6.16b, v30.16b
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b
+        usdot           v25.4s, v16.16b, v30.16b
+        usdot           v26.4s, v17.16b, v30.16b
+        usdot           v27.4s, v18.16b, v30.16b
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        add             x7, x0, #64                 // address of column 32 in THIS row; must be taken before x0 is advanced below
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10 // columns 0..31, then x0 -> next row
+        ext             v4.16b, v2.16b, v3.16b, #1
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b
+        usdot           v21.4s, v4.16b, v30.16b
+        usdot           v22.4s, v5.16b, v30.16b
+        usdot           v23.4s, v6.16b, v30.16b
+        xtn             v20.4h, v20.4s              // columns 32, 36, 40, 44
+        xtn             v21.4h, v21.4s              // columns 33, 37, 41, 45
+        xtn             v22.4h, v22.4s              // columns 34, 38, 42, 46
+        xtn             v23.4h, v23.4s              // columns 35, 39, 43, 47
+        st4             {v20.4h, v21.4h, v22.4h, v23.4h}, [x7] // 4-way interleave restores column order 32..47
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1    // 64 int16 outputs per row
+        EPEL_H_HEADER
+        sub             x2, x2, #64                 // the first load below already advances src by 64
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
+        ext             v4.16b, v0.16b, v1.16b, #1
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b
+        usdot           v21.4s, v4.16b, v30.16b
+        usdot           v22.4s, v5.16b, v30.16b
+        usdot           v23.4s, v6.16b, v30.16b
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b
+        usdot           v25.4s, v16.16b, v30.16b
+        usdot           v26.4s, v17.16b, v30.16b
+        usdot           v27.4s, v18.16b, v30.16b
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 // columns 0..31
+        ld1             {v7.8b}, [x1], x2           // 8 bytes past the first 64; src ends up one stride further in total
+        ext             v4.16b, v2.16b, v3.16b, #1
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        ext             v16.16b, v3.16b, v7.16b, #1
+        ext             v17.16b, v3.16b, v7.16b, #2
+        ext             v18.16b, v3.16b, v7.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b
+        usdot           v21.4s, v4.16b, v30.16b
+        usdot           v22.4s, v5.16b, v30.16b
+        usdot           v23.4s, v6.16b, v30.16b
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v3.16b, v30.16b
+        usdot           v25.4s, v16.16b, v30.16b
+        usdot           v26.4s, v17.16b, v30.16b
+        usdot           v27.4s, v18.16b, v30.16b
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 // columns 32..63; dst has now advanced 128 bytes in total
+        subs            w3, w3, #1                  // height
+        b.ne            1b
+        ret
+endfunc
+
 .macro EPEL_UNI_W_H_HEADER
         ldr             x12, [sp]
         sub             x2, x2, #1
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4a260e1d9a..348497bbbe 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -171,6 +171,10 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -283,13 +287,14 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
+            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
         }
-
     }
+
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon;
         c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_neon;
-- 
2.38.0.windows.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2023-06-04 4:18 UTC|newest] Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top 2023-06-04 4:17 [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels Logan.Lyu 2023-06-04 4:17 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_h Logan.Lyu 2023-06-12 7:59 ` Martin Storsjö 2023-06-18 8:21 ` Logan.Lyu 2023-06-04 4:17 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_v Logan.Lyu 2023-06-12 8:09 ` Martin Storsjö 2023-06-12 9:08 ` Martin Storsjö 2023-06-18 8:22 ` Logan.Lyu 2023-07-01 21:21 ` Martin Storsjö 2023-06-04 4:17 ` Logan.Lyu [this message] 2023-06-12 8:12 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: new optimization for 8-bit hevc_epel_h Martin Storsjö 2023-06-18 8:23 ` Logan.Lyu 2023-06-18 8:26 ` Logan.Lyu 2023-06-04 4:17 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_hv Logan.Lyu 2023-06-12 8:19 ` Martin Storsjö 2023-06-18 8:25 ` Logan.Lyu 2023-07-01 21:28 ` Martin Storsjö 2023-07-13 14:54 ` Logan.Lyu 2023-07-14 9:28 ` Martin Storsjö 2023-06-12 7:47 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels Martin Storsjö 2023-06-18 8:29 ` Logan.Lyu 2023-07-01 21:16 ` Martin Storsjö
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20230604041756.5196-4-Logan.Lyu@myais.com.cn \ --to=logan.lyu@myais.com.cn \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git