Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
@ 2023-04-13  6:51 xufuji456
  2023-04-13 12:14 ` Martin Storsjö
  0 siblings, 1 reply; 6+ messages in thread
From: xufuji456 @ 2023-04-13  6:51 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: xufuji456

got 56% speed up (run_count=1000, CPU=Cortex A53)
transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103

Signed-off-by: xufuji456 <839789740@qq.com>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 50 ++++++++++++++++++++++-
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 994f0a47b6..504258f7c7 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -889,4 +889,52 @@ idct_dc 16, 8
 idct_dc 16, 10
 
 idct_dc 32, 8
-idct_dc 32, 10
\ No newline at end of file
+idct_dc 32, 10
+
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2
+        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3
+        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3
+        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1
+
+        saddl       v7.4s, \r0, \r3         // src0 + src3
+        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
+        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2 + src3)
+
+        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
+        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
+        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
+        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1 + c3
+
+        mul         v1.4s, v1.4s, v19.4s    // 29 * c1
+        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
+        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
+        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1 + c3
+
+        mul         v0.4s, v0.4s, v20.4s    // 55 * c0
+        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
+        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
+        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2 - c3
+
+        sqrshrn     \r0, v5.4s, \shift
+        sqrshrn     \r1, v6.4s, \shift
+        sqrshrn     \r2, v7.4s, \shift
+        sqrshrn     \r3, v0.4s, \shift
+.endm
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+        ld1            {v28.4h-v31.4h}, [x0]
+        movi           v18.4s, #74
+        movi           v19.4s, #29
+        movi           v20.4s, #55
+        movi           v21.4h, #74
+
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+        st1            {v28.4h-v31.4h}, [x0]
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4cc8732ad3..be1049a2ec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -78,6 +78,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
 void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                   const int16_t *sao_offset_val, int sao_left_class,
@@ -146,6 +147,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
+        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
         c->sao_band_filter[0]          =
         c->sao_band_filter[1]          =
         c->sao_band_filter[2]          =
-- 
2.32.0 (Apple Git-132)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
  2023-04-13  6:51 [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon xufuji456
@ 2023-04-13 12:14 ` Martin Storsjö
  2023-04-13 13:20   ` =?gb18030?B?0Oy4o8Kh?=
  0 siblings, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2023-04-13 12:14 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: xufuji456

On Thu, 13 Apr 2023, xufuji456 wrote:

> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 50 ++++++++++++++++++++++-
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> 2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 994f0a47b6..504258f7c7 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -889,4 +889,52 @@ idct_dc 16, 8
> idct_dc 16, 10
>
> idct_dc 32, 8
> -idct_dc 32, 10
> \ No newline at end of file
> +idct_dc 32, 10

This patch does still not apply on git master.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
  2023-04-13 12:14 ` Martin Storsjö
@ 2023-04-13 13:20   ` =?gb18030?B?0Oy4o8Kh?=
  0 siblings, 0 replies; 6+ messages in thread
From: =?gb18030?B?0Oy4o8Kh?= @ 2023-04-13 13:20 UTC (permalink / raw)
  To: =?gb18030?B?RkZtcGVnIGRldmVsb3BtZW50IGRpc2N1c3Npb25zIGFuZCBwYXRjaGVz?=
  Cc: =?gb18030?B?TWFydGluIFN0b3JzaoQxpDc=?=

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="gb18030", Size: 1857 bytes --]

It seems that the reason is the "No newline at end of file" marker.
I will fix it and submit again.
Thank you for your patient review.


------------------ Original ------------------
From: "Martin Storsjö" <martin@martin.st>;
Date: Thu, Apr 13, 2023 08:14 PM
To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>;
Cc: "徐福隆" <839789740@qq.com>;
Subject: Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon



On Thu, 13 Apr 2023, xufuji456 wrote:

> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 50 ++++++++++++++++++++++-
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> 2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 994f0a47b6..504258f7c7 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -889,4 +889,52 @@ idct_dc 16, 8
> idct_dc 16, 10
>
> idct_dc 32, 8
> -idct_dc 32, 10
> \ No newline at end of file
> +idct_dc 32, 10

This patch does still not apply on git master.

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
  2023-04-14 12:20 ` Martin Storsjö
@ 2023-04-30  7:28   ` =?gb18030?B?0Oy4o8Kh?=
  0 siblings, 0 replies; 6+ messages in thread
From: =?gb18030?B?0Oy4o8Kh?= @ 2023-04-30  7:28 UTC (permalink / raw)
  To: =?gb18030?B?RkZtcGVnIGRldmVsb3BtZW50IGRpc2N1c3Npb25zIGFuZCBwYXRjaGVz?=
  Cc: =?gb18030?B?TWFydGluJm5ic3A7U3RvcnNqhDGkNw==?=

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="gb18030", Size: 2035 bytes --]

Thank you, Martin.
It was my mistake: I deleted an empty line at the end of the file.
Should I submit a patch with a newline at the end of the file, or do something else?
Thanks for your review and for pointing out the details of the error.




------------------ Original ------------------
From: "Martin Storsjö" <martin@martin.st>;
Date: Fri, Apr 14, 2023 08:20 PM
To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>;
Cc: "徐福隆" <839789740@qq.com>;
Subject: Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon



On Thu, 13 Apr 2023, xufuji456 wrote:

> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 48 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> 2 files changed, 50 insertions(+)

Thanks, this version can be applied - and still looks good, so I pushed 
it.

I see that you fixed the issue by just applying the new code in the middle 
of the file instead of at the end of the file though. You really should 
try to look into what it is that is causing the previous version of the 
file to be lacking the trailing newline, since that's not what is in the 
actual upstream git. So it looks like there's something off with your git 
workflow, and it would be very good to get that sorted out before going 
forward anyway.

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
  2023-04-13 13:34 xufuji456
@ 2023-04-14 12:20 ` Martin Storsjö
  2023-04-30  7:28   ` =?gb18030?B?0Oy4o8Kh?=
  0 siblings, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2023-04-14 12:20 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: xufuji456

On Thu, 13 Apr 2023, xufuji456 wrote:

> got 56% speed up (run_count=1000, CPU=Cortex A53)
> transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103
>
> Signed-off-by: xufuji456 <839789740@qq.com>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 48 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
> 2 files changed, 50 insertions(+)

Thanks, this version can be applied - and still looks good, so I pushed 
it.

I see that you fixed the issue by just applying the new code in the middle 
of the file instead of at the end of the file though. You really should 
try to look into what it is that is causing the previous version of the 
file to be lacking the trailing newline, since that's not what is in the 
actual upstream git. So it looks like there's something off with your git 
workflow, and it would be very good to get that sorted out before going 
forward anyway.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon
@ 2023-04-13 13:34 xufuji456
  2023-04-14 12:20 ` Martin Storsjö
  0 siblings, 1 reply; 6+ messages in thread
From: xufuji456 @ 2023-04-13 13:34 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: xufuji456

got 56% speed up (run_count=1000, CPU=Cortex A53)
transform_4x4_luma_neon: 45 transform_4x4_luma_c: 103

Signed-off-by: xufuji456 <839789740@qq.com>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 48 +++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  2 +
 2 files changed, 50 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 994f0a47b6..4a25787070 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -842,6 +842,54 @@ tr_32x4 secondpass_10, 20 - 10
 idct_32x32 8
 idct_32x32 10
 
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+        saddl       v0.4s, \r0, \r2         // c0 = src0 + src2
+        saddl       v1.4s, \r2, \r3         // c1 = src2 + src3
+        ssubl       v2.4s, \r0, \r3         // c2 = src0 - src3
+        smull       v3.4s, \r1, v21.4h      // c3 = 74 * src1
+
+        saddl       v7.4s, \r0, \r3         // src0 + src3
+        ssubw       v7.4s, v7.4s, \r2       // src0 - src2 + src3
+        mul         v7.4s, v7.4s, v18.4s    // dst2 = 74 * (src0 - src2 + src3)
+
+        mul         v5.4s, v0.4s, v19.4s    // 29 * c0
+        mul         v6.4s, v1.4s, v20.4s    // 55 * c1
+        add         v5.4s, v5.4s, v6.4s     // 29 * c0 + 55 * c1
+        add         v5.4s, v5.4s, v3.4s     // dst0 = 29 * c0 + 55 * c1 + c3
+
+        mul         v1.4s, v1.4s, v19.4s    // 29 * c1
+        mul         v6.4s, v2.4s, v20.4s    // 55 * c2
+        sub         v6.4s, v6.4s, v1.4s     // 55 * c2 - 29 * c1
+        add         v6.4s, v6.4s, v3.4s     // dst1 = 55 * c2 - 29 * c1 + c3
+
+        mul         v0.4s, v0.4s, v20.4s    // 55 * c0
+        mul         v2.4s, v2.4s, v19.4s    // 29 * c2
+        add         v0.4s, v0.4s, v2.4s     // 55 * c0 + 29 * c2
+        sub         v0.4s, v0.4s, v3.4s     // dst3 = 55 * c0 + 29 * c2 - c3
+
+        sqrshrn     \r0, v5.4s, \shift
+        sqrshrn     \r1, v6.4s, \shift
+        sqrshrn     \r2, v7.4s, \shift
+        sqrshrn     \r3, v0.4s, \shift
+.endm
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+        ld1            {v28.4h-v31.4h}, [x0]
+        movi           v18.4s, #74
+        movi           v19.4s, #29
+        movi           v20.4s, #55
+        movi           v21.4h, #74
+
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+        tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
+        transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
+
+        st1            {v28.4h-v31.4h}, [x0]
+        ret
+endfunc
+
 // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
 .macro idct_dc size, bitdepth
 function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4cc8732ad3..be1049a2ec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -78,6 +78,7 @@ void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
 void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
 void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                   const int16_t *sao_offset_val, int sao_left_class,
@@ -146,6 +147,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
+        c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
         c->sao_band_filter[0]          =
         c->sao_band_filter[1]          =
         c->sao_band_filter[2]          =
-- 
2.32.0 (Apple Git-132)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-04-30  7:28 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-13  6:51 [FFmpeg-devel] [PATCH] codec/aarch64/hevc: add transform_luma_neon xufuji456
2023-04-13 12:14 ` Martin Storsjö
2023-04-13 13:20   ` =?gb18030?B?0Oy4o8Kh?=
2023-04-13 13:34 xufuji456
2023-04-14 12:20 ` Martin Storsjö
2023-04-30  7:28   ` =?gb18030?B?0Oy4o8Kh?=

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git