Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] libavcodec/hevc: add hevc idct_4x4_neon of aarch64
@ 2023-02-14 10:02 xufuji456
  2023-02-23  8:55 ` Martin Storsjö
  0 siblings, 1 reply; 2+ messages in thread
From: xufuji456 @ 2023-02-14 10:02 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: xufuji456

---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 51 +++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 ++
 2 files changed, 55 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 124c50998a..fe8baf1348 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -245,6 +245,54 @@ function hevc_add_residual_32x32_16_neon, export=0
         ret
 endfunc
 
+.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift, tmp0, tmp1, tmp2, tmp3, tmp4
+         sshll           \tmp0, \in0, #6
+         smull          \tmp2, \in1, v4.h[1]
+         mov            \tmp1, \tmp0
+         smull          \tmp3, \in1, v4.h[3]
+         smlal          \tmp0, \in2, v4.h[0] //e0
+         smlsl          \tmp1, \in2, v4.h[0] //e1
+         smlal          \tmp2, \in3, v4.h[3] //o0
+         smlsl          \tmp3, \in3, v4.h[1] //o1
+
+         add            \tmp4, \tmp0, \tmp2
+         sub            \tmp0, \tmp0, \tmp2
+         add            \tmp2, \tmp1, \tmp3
+         sub            \tmp1, \tmp1, \tmp3
+         sqrshrn        \out0, \tmp4, #\shift
+         sqrshrn        \out3, \tmp0, #\shift
+         sqrshrn        \out1, \tmp2, #\shift
+         sqrshrn        \out2, \tmp1, #\shift
+.endm
+
+.macro transpose_4x4 r0, r1, r2, r3
+        trn1            v22.8h, \r0\().8h, \r1\().8h
+        trn2            v23.8h, \r0\().8h, \r1\().8h
+        trn1            v24.8h, \r2\().8h, \r3\().8h
+        trn2            v25.8h, \r2\().8h, \r3\().8h
+        trn1            \r0\().4s, v22.4s, v24.4s
+        trn2            \r2\().4s, v22.4s, v24.4s
+        trn1            \r1\().4s, v23.4s, v25.4s
+        trn2            \r3\().4s, v23.4s, v25.4s
+.endm
+
+.macro idct_4x4 bitdepth
+function ff_hevc_idct_4x4_\bitdepth\()_neon, export=1
+        ld1             {v0.4h-v3.4h}, [x0]
+
+        movrel          x1, trans
+        ld1             {v4.4h}, [x1]
+
+        tr_4x4          v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h, 7, v10.4s, v11.4s, v12.4s, v13.4s, v15.4s
+        transpose_4x4   v16, v17, v18, v19
+
+        tr_4x4          v16.4h, v17.4h, v18.4h, v19.4h, v0.4h, v1.4h, v2.4h, v3.4h, 20 - \bitdepth, v10.4s, v11.4s, v12.4s, v13.4s, v15.4s
+        transpose_4x4   v0, v1, v2, v3
+        st1             {v0.4h-v3.4h}, [x0]
+        ret
+endfunc
+.endm
+
 .macro sum_sub out, in, c, op, p
   .ifc \op, +
         smlal\p         \out, \in, \c
@@ -578,6 +626,9 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
 endfunc
 .endm
 
+idct_4x4 8
+idct_4x4 10
+
 idct_8x8 8
 idct_8x8 10
 
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 88a797f393..1deefca0a2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -49,6 +49,8 @@ void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs,
                                         ptrdiff_t stride);
 void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, const int16_t *coeffs,
                                         ptrdiff_t stride);
+void ff_hevc_idct_4x4_8_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -119,6 +121,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
         c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
+        c->idct[0]                     = ff_hevc_idct_4x4_8_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_8_neon;
         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_8_neon;
@@ -168,6 +171,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[1]             = ff_hevc_add_residual_8x8_10_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_10_neon;
         c->add_residual[3]             = ff_hevc_add_residual_32x32_10_neon;
+        c->idct[0]                     = ff_hevc_idct_4x4_10_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_10_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_10_neon;
         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_10_neon;
-- 
2.32.0 (Apple Git-132)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [FFmpeg-devel] [PATCH] libavcodec/hevc: add hevc idct_4x4_neon of aarch64
  2023-02-14 10:02 [FFmpeg-devel] [PATCH] libavcodec/hevc: add hevc idct_4x4_neon of aarch64 xufuji456
@ 2023-02-23  8:55 ` Martin Storsjö
  0 siblings, 0 replies; 2+ messages in thread
From: Martin Storsjö @ 2023-02-23  8:55 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: xufuji456

On Tue, 14 Feb 2023, xufuji456 wrote:

> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 51 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 ++
> 2 files changed, 55 insertions(+)

This looks mostly fine, but there's a couple issues that breaks it when 
built with toolchains other than what you've been using. (I presume you've 
built it with Clang on macOS, since that's the only toolchain that built 
it successfully and ran without failures.)

>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 124c50998a..fe8baf1348 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -245,6 +245,54 @@ function hevc_add_residual_32x32_16_neon, export=0
>         ret
> endfunc
>
> +.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift, tmp0, tmp1, tmp2, tmp3, tmp4
> +         sshll           \tmp0, \in0, #6

Nitpick: the indentation of \tmp0 is off here

> +         smull          \tmp2, \in1, v4.h[1]
> +         mov            \tmp1, \tmp0

Both binutils and MS armasm64.exe refuse to assemble an instruction like 
"mov v0.4s, v1.4s" - for the mov instruction, they only accept .8b or 
.16b, not .4s - even if it doesn't make any functional difference for what 
the instruction does.

Due to how these macros are used here, it's not very easy to come up with 
a neat solution. One possible solution is to just repeat "sshll \tmp1, 
\in0, #6"; that does the same shift operation twice, but reduces the chain 
depth for setting tmp1, so it's possible that it doesn't make any big 
difference in the end. (On Cortex A53/A72/A73, I didn't measure any 
significant difference here.)

> +         smull          \tmp3, \in1, v4.h[3]
> +         smlal          \tmp0, \in2, v4.h[0] //e0
> +         smlsl          \tmp1, \in2, v4.h[0] //e1
> +         smlal          \tmp2, \in3, v4.h[3] //o0
> +         smlsl          \tmp3, \in3, v4.h[1] //o1
> +
> +         add            \tmp4, \tmp0, \tmp2
> +         sub            \tmp0, \tmp0, \tmp2
> +         add            \tmp2, \tmp1, \tmp3
> +         sub            \tmp1, \tmp1, \tmp3
> +         sqrshrn        \out0, \tmp4, #\shift
> +         sqrshrn        \out3, \tmp0, #\shift
> +         sqrshrn        \out1, \tmp2, #\shift
> +         sqrshrn        \out2, \tmp1, #\shift
> +.endm
> +
> +.macro transpose_4x4 r0, r1, r2, r3
> +        trn1            v22.8h, \r0\().8h, \r1\().8h
> +        trn2            v23.8h, \r0\().8h, \r1\().8h
> +        trn1            v24.8h, \r2\().8h, \r3\().8h
> +        trn2            v25.8h, \r2\().8h, \r3\().8h
> +        trn1            \r0\().4s, v22.4s, v24.4s
> +        trn2            \r2\().4s, v22.4s, v24.4s
> +        trn1            \r1\().4s, v23.4s, v25.4s
> +        trn2            \r3\().4s, v23.4s, v25.4s
> +.endm

neon.S already has got a bunch of similar matching transposes. I see that 
this file already had some similar custom transposes; I sent 
https://patchwork.ffmpeg.org/project/ffmpeg/patch/20230223083826.3149827-1-martin@martin.st/ 
to fix that. Here, you can use transpose_4x8H from neon.S too.

> +
> +.macro idct_4x4 bitdepth
> +function ff_hevc_idct_4x4_\bitdepth\()_neon, export=1
> +        ld1             {v0.4h-v3.4h}, [x0]
> +
> +        movrel          x1, trans
> +        ld1             {v4.4h}, [x1]
> +
> +        tr_4x4          v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h, 7, v10.4s, v11.4s, v12.4s, v13.4s, v15.4s

You're not allowed to clobber v8-v15 without backing them up/restoring 
them. Checkasm on macOS doesn't detect this, but it does on other OSes.

Here it should be trivial to use other temporary registers instead, 
without needing to backup/restore anything.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-02-23  8:55 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-14 10:02 [FFmpeg-devel] [PATCH] libavcodec/hevc: add hevc idct_4x4_neon of aarch64 xufuji456
2023-02-23  8:55 ` Martin Storsjö

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git