* [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
@ 2023-04-13 6:51 xufuji456
2023-04-13 12:14 ` Martin Storsjö
0 siblings, 1 reply; 6+ messages in thread
From: xufuji456 @ 2023-04-13 6:51 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: xufuji456
got 56% speed up (run_count=1000, CPU=Cortex A53)
transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
Signed-off-by: xufuji456 <839789740@qq.com>
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 50 ++++++++++++++++++++++-
libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
2 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 994f0a47b6..504258f7c7 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -889,4 +889,52 @@ idct_dc 16, 8
idct_dc 16, 10
idct_dc 32, 8
-idct_dc 32, 10
\ No newline at end of file
+idct_dc 32, 10
+
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+ saddl v0.4s, \r0, \r2 // c0 = src0 + src2
+ saddl v1.4s, \r2, \r3 // c1 = src2 + src3
+ ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
+ smull v3.4s, \r1, v21.4h // c3 = 74 * src1
+
+ saddl v7.4s, \r0, \r3 // src0 + src3
+ ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
+ mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3)
+
+ mul v5.4s, v0.4s, v19.4s // 29 * c0
+ mul v6.4s, v1.4s, v20.4s // 55 * c1
+ add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
+ add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3
+
+ mul v1.4s, v1.4s, v19.4s // 29 * c1
+ mul v6.4s, v2.4s, v20.4s // 55 * c2
+ sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
+ add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3
+
+ mul v0.4s, v0.4s, v20.4s // 55 * c0
+ mul v2.4s, v2.4s, v19.4s // 29 * c2
+ add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
+ sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3
+
+ sqrshrn \r0, v5.4s, \shift
+ sqrshrn \r1, v6.4s, \shift
+ sqrshrn \r2, v7.4s, \shift
+ sqrshrn \r3, v0.4s, \shift
+.endm
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+ ld1 {v28.4h-v31.4h}, [x0]
+ movi v18.4s, #74
+ movi v19.4s, #29
+ movi v20.4s, #55
+ movi v21.4h, #74
+
+ tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
+ transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+ tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
+ transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+ st1 {v28.4h-v31.4h}, [x0]
+ ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4cc8732ad3..be1049a2ec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -78,6 +78,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
ptrdiff_t stride_dst, ptrdiff_t stride_src,
const int16_t *sao_offset_val, int sao_left_class,
@@ -146,6 +147,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
+ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
c->sao_band_filter[0] =
c->sao_band_filter[1] =
c->sao_band_filter[2] =
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
2023-04-13 6:51 [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon xufuji456
@ 2023-04-13 12:14 ` Martin Storsjö
2023-04-13 13:20 ` 徐福隆
0 siblings, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2023-04-13 12:14 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: xufuji456
On Thu, 13 Apr 2023, xufuji456 wrote:
> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 50 ++++++++++++++++++++++-
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
> 2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 994f0a47b6..504258f7c7 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -889,4 +889,52 @@ idct_dc 16, 8
> idct_dc 16, 10
>
> idct_dc 32, 8
> -idct_dc 32, 10
> \ No newline at end of file
> +idct_dc 32, 10
This patch does still not apply on git master.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
2023-04-13 12:14 ` Martin Storsjö
@ 2023-04-13 13:20 ` 徐福隆
0 siblings, 0 replies; 6+ messages in thread
From: 徐福隆 @ 2023-04-13 13:20 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Cc: Martin Storsjö
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="gb18030", Size: 1857 bytes --]
It seems that the reason is "No newline at end of file".
I will fix it and submit again.
Thank you for your patient review.
------------------ Original ------------------
From: "Martin Storsjö" <martin@martin.st>;
Date: Thu, Apr 13, 2023 08:14 PM
To: "FFmpeg development discussions and patches"<ffmpeg-devel@ffmpeg.org>;
Cc: "徐福隆"<839789740@qq.com>;
Subject: Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
On Thu, 13 Apr 2023, xufuji456 wrote:
> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 50 ++++++++++++++++++++++-
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
> 2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 994f0a47b6..504258f7c7 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -889,4 +889,52 @@ idct_dc 16, 8
> idct_dc 16, 10
>
> idct_dc 32, 8
> -idct_dc 32, 10
> \ No newline at end of file
> +idct_dc 32, 10
This patch does still not apply on git master.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
2023-04-14 12:20 ` Martin Storsjö
@ 2023-04-30 7:28 ` 徐福隆
0 siblings, 0 replies; 6+ messages in thread
From: 徐福隆 @ 2023-04-30 7:28 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Cc: Martin Storsjö
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="gb18030", Size: 2035 bytes --]
Thank you, Martin.
It was my mistake to delete an empty line at the end of the file.
Should I submit a patch with a newline at the end of the file, or do something else?
Thanks for your review and for pointing out the details of the error.
------------------ Original ------------------
From: "Martin Storsjö" <martin@martin.st>;
Date: Fri, Apr 14, 2023 08:20 PM
To: "FFmpeg development discussions and patches"<ffmpeg-devel@ffmpeg.org>;
Cc: "徐福隆"<839789740@qq.com>;
Subject: Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
On Thu, 13 Apr 2023, xufuji456 wrote:
> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 48 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
> 2 files changed, 50 insertions(+)
Thanks, this version can be applied - and still looks good, so I pushed
it.
I see that you fixed the issue by just applying the new code in the middle
of the file instead of at the end of the file though. You really should
try to look into what it is that is causing the previous version of the
file to be lacking the trailing newline, since that's not what is in the
actual upstream git. So it looks like there's something off with your git
workflow, and it would be very good to get that sorted out before going
forward anyway.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
2023-04-13 13:34 xufuji456
@ 2023-04-14 12:20 ` Martin Storsjö
2023-04-30 7:28 ` 徐福隆
0 siblings, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2023-04-14 12:20 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: xufuji456
On Thu, 13 Apr 2023, xufuji456 wrote:
> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S | 48 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
> 2 files changed, 50 insertions(+)
Thanks, this version can be applied - and still looks good, so I pushed
it.
I see that you fixed the issue by just applying the new code in the middle
of the file instead of at the end of the file though. You really should
try to look into what it is that is causing the previous version of the
file to be lacking the trailing newline, since that's not what is in the
actual upstream git. So it looks like there's something off with your git
workflow, and it would be very good to get that sorted out before going
forward anyway.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
* [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
@ 2023-04-13 13:34 xufuji456
2023-04-14 12:20 ` Martin Storsjö
0 siblings, 1 reply; 6+ messages in thread
From: xufuji456 @ 2023-04-13 13:34 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: xufuji456
got 56% speed up (run_count=1000, CPU=Cortex A53)
transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
Signed-off-by: xufuji456 <839789740@qq.com>
---
libavcodec/aarch64/hevcdsp_idct_neon.S | 48 +++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 2 +
2 files changed, 50 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 994f0a47b6..4a25787070 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -842,6 +842,54 @@ tr_32x4 secondpass_10, 20 - 10
idct_32x32 8
idct_32x32 10
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+ saddl v0.4s, \r0, \r2 // c0 = src0 + src2
+ saddl v1.4s, \r2, \r3 // c1 = src2 + src3
+ ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
+ smull v3.4s, \r1, v21.4h // c3 = 74 * src1
+
+ saddl v7.4s, \r0, \r3 // src0 + src3
+ ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
+ mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3)
+
+ mul v5.4s, v0.4s, v19.4s // 29 * c0
+ mul v6.4s, v1.4s, v20.4s // 55 * c1
+ add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
+ add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3
+
+ mul v1.4s, v1.4s, v19.4s // 29 * c1
+ mul v6.4s, v2.4s, v20.4s // 55 * c2
+ sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
+ add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3
+
+ mul v0.4s, v0.4s, v20.4s // 55 * c0
+ mul v2.4s, v2.4s, v19.4s // 29 * c2
+ add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
+ sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3
+
+ sqrshrn \r0, v5.4s, \shift
+ sqrshrn \r1, v6.4s, \shift
+ sqrshrn \r2, v7.4s, \shift
+ sqrshrn \r3, v0.4s, \shift
+.endm
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+ ld1 {v28.4h-v31.4h}, [x0]
+ movi v18.4s, #74
+ movi v19.4s, #29
+ movi v20.4s, #55
+ movi v21.4h, #74
+
+ tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
+ transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+ tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
+ transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+ st1 {v28.4h-v31.4h}, [x0]
+ ret
+endfunc
+
// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4cc8732ad3..be1049a2ec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -78,6 +78,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
ptrdiff_t stride_dst, ptrdiff_t stride_src,
const int16_t *sao_offset_val, int sao_left_class,
@@ -146,6 +147,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
+ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
c->sao_band_filter[0] =
c->sao_band_filter[1] =
c->sao_band_filter[2] =
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2023-04-30 7:28 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-13 6:51 [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon xufuji456
2023-04-13 12:14 ` Martin Storsjö
2023-04-13 13:20 ` 徐福隆
2023-04-13 13:34 xufuji456
2023-04-14 12:20 ` Martin Storsjö
2023-04-30 7:28 ` 徐福隆
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git