* [FFmpeg-devel] [PATCH] libavcodec/hevc: add hevc idct4x4 neon of aarch64 after fixed
@ 2023-02-24 7:43 xufuji456
2023-02-28 11:15 ` Martin Storsjö
0 siblings, 1 reply; 2+ messages in thread
From: xufuji456 @ 2023-02-24 7:43 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: xufuji456
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 40 +++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +++
2 files changed, 44 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 124c50998a..f5135160b6 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -245,6 +245,43 @@ function hevc_add_residual_32x32_16_neon, export=0
ret
endfunc
+.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift // one 4-point transform pass over four 4x16-bit vectors; reads coefficients from v4.h[0..3], clobbers v20-v24
+ sshll v20.4s, \in0, #6 // v20 = in0 widened to 32 bit and shifted left by 6 (in0 * 64)
+ sshll v21.4s, \in0, #6 // v21 = same value, second copy for the e1 term
+ smull v22.4s, \in1, v4.h[1] // v22 = in1 * v4.h[1] (widening multiply)
+ smull v23.4s, \in1, v4.h[3] // v23 = in1 * v4.h[3] (widening multiply)
+ smlal v20.4s, \in2, v4.h[0] // e0 = in0 * 64 + in2 * v4.h[0]
+ smlsl v21.4s, \in2, v4.h[0] // e1 = in0 * 64 - in2 * v4.h[0]
+ smlal v22.4s, \in3, v4.h[3] // o0 = in1 * v4.h[1] + in3 * v4.h[3]
+ smlsl v23.4s, \in3, v4.h[1] // o1 = in1 * v4.h[3] - in3 * v4.h[1]
+
+ add v24.4s, v20.4s, v22.4s // v24 = e0 + o0
+ sub v20.4s, v20.4s, v22.4s // v20 = e0 - o0 (v20 reused, e0 no longer needed)
+ add v22.4s, v21.4s, v23.4s // v22 = e1 + o1 (v22 reused, o0 no longer needed)
+ sub v21.4s, v21.4s, v23.4s // v21 = e1 - o1
+ sqrshrn \out0, v24.4s, #\shift // out0 = (e0 + o0), rounding shift right by shift, saturated to 16 bit
+ sqrshrn \out3, v20.4s, #\shift // out3 = (e0 - o0), same rounding and narrowing
+ sqrshrn \out1, v22.4s, #\shift // out1 = (e1 + o1), same rounding and narrowing
+ sqrshrn \out2, v21.4s, #\shift // out2 = (e1 - o1), same rounding and narrowing
+.endm
+
+.macro idct_4x4 bitdepth // emits ff_hevc_idct_4x4_<bitdepth>_neon: two tr_4x4 passes over 16 halfwords at [x0], result stored back in place
+function ff_hevc_idct_4x4_\bitdepth\()_neon, export=1
+ ld1 {v0.4h-v3.4h}, [x0] // load 4 vectors of 4 halfwords (16 coefficients) from the buffer at x0
+
+ movrel x1, trans // x1 = address of the trans table (defined outside this patch); the incoming x1 value is overwritten and never read
+ ld1 {v4.4h}, [x1] // v4.h[0..3] = first four 16-bit entries of trans, used as tr_4x4 coefficients
+
+ tr_4x4 v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h, 7 // first pass: v0-v3 -> v16-v19, rounding shift of 7
+ transpose_4x8H v16, v17, v18, v19, v26, v27, v28, v29 // presumably transposes v16-v19 with v26-v29 as scratch; macro is defined outside this patch, TODO confirm
+
+ tr_4x4 v16.4h, v17.4h, v18.4h, v19.4h, v0.4h, v1.4h, v2.4h, v3.4h, 20 - \bitdepth // second pass: v16-v19 -> v0-v3, rounding shift of 20 - bitdepth
+ transpose_4x8H v0, v1, v2, v3, v26, v27, v28, v29 // same transpose helper applied to the second-pass result
+ st1 {v0.4h-v3.4h}, [x0] // store the 16 results back over the input buffer
+ ret
+endfunc
+.endm
+
.macro sum_sub out, in, c, op, p
.ifc \op, +
smlal\p \out, \in, \c
@@ -578,6 +615,9 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
endfunc
.endm
+idct_4x4 8
+idct_4x4 10
+
idct_8x8 8
idct_8x8 10
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 88a797f393..1deefca0a2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -49,6 +49,8 @@ void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, const int16_t *coeffs,
ptrdiff_t stride);
+void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -119,6 +121,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon;
c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon;
+ c->idct[0] = ff_hevc_idct_4x4_8_neon;
c->idct[1] = ff_hevc_idct_8x8_8_neon;
c->idct[2] = ff_hevc_idct_16x16_8_neon;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon;
@@ -168,6 +171,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon;
c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon;
c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon;
+ c->idct[0] = ff_hevc_idct_4x4_10_neon;
c->idct[1] = ff_hevc_idct_8x8_10_neon;
c->idct[2] = ff_hevc_idct_16x16_10_neon;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_neon;
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [FFmpeg-devel] [PATCH] libavcodec/hevc: add hevc idct4x4 neon of aarch64 after fixed
2023-02-24 7:43 [FFmpeg-devel] [PATCH] libavcodec/hevc: add hevc idct4x4 neon of aarch64 after fixed xufuji456
@ 2023-02-28 11:15 ` Martin Storsjö
0 siblings, 0 replies; 2+ messages in thread
From: Martin Storsjö @ 2023-02-28 11:15 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: xufuji456
On Fri, 24 Feb 2023, xufuji456 wrote:
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 40 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +++
> 2 files changed, 44 insertions(+)
Thanks, this looks good to me now, will push in a moment.
The subject of the commit doesn't need to include "after fixed" or similar
remarks - such remarks are irrelevant once the commit has been
pushed.
It's usually good practice to include checkasm benchmark numbers (and
mention what system the numbers are from) when posting assembly
optimizations.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2023-02-28 11:15 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-24 7:43 [FFmpeg-devel] [PATCH] libavcodec/hevc: add hevc idct4x4 neon of aarch64 after fixed xufuji456
2023-02-28 11:15 ` Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git