Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] avcodec/aarch64/hevc: add transform_luma_4x4_neon note: run_count=1000, CPU=Cortex A53 transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
@ 2023-03-11  3:18 xufuji456
  2023-03-22  8:06 ` Martin Storsjö
  0 siblings, 1 reply; 2+ messages in thread
From: xufuji456 @ 2023-03-11  3:18 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: xufuji456

---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 52 ++++++++++++++++++++++-
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index b11f56862a..00d9690466 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -7,6 +7,8 @@
  * Copyright (c) 2020 Reimar Döffinger
  * Copyright (c) 2020 J. Dekker
  *
+ * Copyright (c) 2023 xu fulong
+ *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
@@ -665,4 +667,52 @@ idct_dc 16, 8
 idct_dc 16, 10
 
 idct_dc 32, 8
-idct_dc 32, 10
\ No newline at end of file
+idct_dc 32, 10
+
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2
+        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3
+        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3
+        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1
+
+        saddl       v7.4s, \r0, \r3         // src0 + src3
+        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
+        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2 + src3)
+
+        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
+        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
+        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
+        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1 + c3
+
+        mul         v1.4s, v1.4s, v19.4s    // 29 * c1
+        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
+        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
+        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1 + c3
+
+        mul         v0.4s, v0.4s, v20.4s    // 55 * c0
+        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
+        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
+        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2 - c3
+
+        sqrshrn     \r0, v5.4s, \shift
+        sqrshrn     \r1, v6.4s, \shift
+        sqrshrn     \r2, v7.4s, \shift
+        sqrshrn     \r3, v0.4s, \shift
+.endm
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+        ld1            {v28.4h-v31.4h}, [x0]
+        movi           v18.4s, #74
+        movi           v19.4s, #29
+        movi           v20.4s, #55
+        movi           v21.4h, #74
+
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+        st1            {v28.4h-v31.4h}, [x0]
+        ret
+endfunc
\ No newline at end of file
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1deefca0a2..10e7f2318e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -63,6 +63,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
 void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                   const int16_t *sao_offset_val, int sao_left_class,
@@ -128,6 +129,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
+        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
         c->sao_band_filter[0]          =
         c->sao_band_filter[1]          =
         c->sao_band_filter[2]          =
-- 
2.32.0 (Apple Git-132)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [FFmpeg-devel] [PATCH] avcodec/aarch64/hevc: add transform_luma_4x4_neon note: run_count=1000, CPU=Cortex A53 transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
  2023-03-11  3:18 [FFmpeg-devel] [PATCH] avcodec/aarch64/hevc: add transform_luma_4x4_neon note: run_count=1000, CPU=Cortex A53 transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103 xufuji456
@ 2023-03-22  8:06 ` Martin Storsjö
  0 siblings, 0 replies; 2+ messages in thread
From: Martin Storsjö @ 2023-03-22  8:06 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: xufuji456

On Sat, 11 Mar 2023, xufuji456 wrote:

> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 52 ++++++++++++++++++++++-
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> 2 files changed, 53 insertions(+), 1 deletion(-)

Overall the code seems fine, but there are a couple more minor issues:

- There's no checkasm test, so we don't have any continuous tracking that 
this function doesn't break ABI details subtly. From reading the code it 
seems fine, but we really want to have checkasm coverage for all new 
assembly. Can you please add one? It should hopefully not be very 
complicated given the existing tests for other idct parts.

- The commit message is a bit garbled - not all that text belongs in the 
subject line.

- It was hard to get the patch applied:

> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index b11f56862a..00d9690466 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -665,4 +667,52 @@ idct_dc 16, 8
> idct_dc 16, 10
> 
> idct_dc 32, 8
> -idct_dc 32, 10
> \ No newline at end of file
> +idct_dc 32, 10

The file upstream actually did have the proper newline at the end of the 
file (some earlier patch of yours, I think, was missing this but I fixed 
it up when pushing it), but the patch expected a file without a trailing 
newline. It would be much less hassle for me to apply the patch if it was 
properly rebased on the actually pushed git master version.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-03-22  8:06 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-11  3:18 [FFmpeg-devel] [PATCH] avcodec/aarch64/hevc: add transform_luma_4x4_neon note: run_count=1000, CPU=Cortex A53 transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103 xufuji456
2023-03-22  8:06 ` Martin Storsjö

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git