From: xufuji456 <839789740@qq.com> To: ffmpeg-devel@ffmpeg.org Cc: xufuji456 <839789740@qq.com> Subject: [FFmpeg-devel] [PATCH] avcodec/aarch64/hevc:add transform_luma_4x4_neon Date: Thu, 30 Mar 2023 21:21:55 +0800 Message-ID: <tencent_BB08F84E76419ECCF28DD04A365A0C2C2306@qq.com> (raw) got 56% speed up (run_count=1000, CPU=Cortex A53) transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103 --- libavcodec/aarch64/hevcdsp_idct_neon.S | 51 ++++++++++++++++++++++- libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 + 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 74a96957bf..f302ed9773 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -6,6 +6,7 @@ * Ported from arm/hevcdsp_idct_neon.S by * Copyright (c) 2020 Reimar Döffinger * Copyright (c) 2023 J. Dekker <jdek@itanimul.li> + * Copyright (c) 2023 xu fulong <839789740@qq.com> * * This file is part of FFmpeg. * @@ -656,4 +657,52 @@ idct_dc 16, 8 idct_dc 16, 10 idct_dc 32, 8 -idct_dc 32, 10 \ No newline at end of file +idct_dc 32, 10 + +.macro tr4_luma_shift r0, r1, r2, r3, shift + saddl v0.4s, \r0, \r2 // c0 = src0 + src2 + saddl v1.4s, \r2, \r3 // c1 = src2 + src3 + ssubl v2.4s, \r0, \r3 // c2 = src0 - src3 + smull v3.4s, \r1, v21.4h // c3 = 74 * src1 + + saddl v7.4s, \r0, \r3 // src0 + src3 + ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3 + mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3) + + mul v5.4s, v0.4s, v19.4s // 29 * c0 + mul v6.4s, v1.4s, v20.4s // 55 * c1 + add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1 + add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3 + + mul v1.4s, v1.4s, v19.4s // 29 * c1 + mul v6.4s, v2.4s, v20.4s // 55 * c2 + sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1 + add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3 + + mul v0.4s, v0.4s, v20.4s // 55 * c0 + mul v2.4s, v2.4s, v19.4s // 29 * c2 + add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2 + sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3 + + sqrshrn \r0, v5.4s, \shift + sqrshrn \r1, v6.4s, \shift + sqrshrn \r2, v7.4s, \shift + sqrshrn \r3, v0.4s, \shift +.endm + +function ff_hevc_transform_luma_4x4_neon_8, export=1 + ld1 {v28.4h-v31.4h}, [x0] + movi v18.4s, #74 + movi v19.4s, #29 + movi v20.4s, #55 + movi v21.4h, #74 + + tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7 + transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25 + + tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12 + transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25 + + st1 {v28.4h-v31.4h}, [x0] + ret +endfunc \ No newline at end of file diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 1deefca0a2..10e7f2318e 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -63,6 +63,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs); void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs); void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs); void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs); +void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, @@ -128,6 +129,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon; c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; + c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; c->sao_band_filter[0] = c->sao_band_filter[1] = c->sao_band_filter[2] = -- 2.32.0 (Apple Git-132) _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
reply other threads:[~2023-03-30 13:22 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=tencent_BB08F84E76419ECCF28DD04A365A0C2C2306@qq.com \ --to=839789740@qq.com \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git