Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: "Martin Storsjö" <martin@martin.st>
To: ffmpeg-devel@ffmpeg.org
Cc: Logan Lyu <Logan.Lyu@myais.com.cn>, "J . Dekker" <jdek@itanimul.li>
Subject: [FFmpeg-devel] [PATCH 04/21] aarch64: hevc: Specialize put_hevc_\type\()_h*_8_neon for horizontal looping
Date: Mon, 25 Mar 2024 17:02:26 +0200
Message-ID: <20240325150243.59058-5-martin@martin.st> (raw)
In-Reply-To: <20240325150243.59058-1-martin@martin.st>

For widths of 32 pixels and more, loop first horizontally,
then vertically.

Previously, this function would process a 16 pixel wide slice
of the block, looping vertically. After processing the whole
height, it would backtrack and process the next 16 pixel wide
slice.

When doing 8tap filtering horizontally, the function must load
7 more pixels (in practice, 8) following the actual inputs, and
this was done for each slice.

By iterating first horizontally throughout each line, then
vertically, we access data in a more cache friendly order, and
we don't need to reload data unnecessarily.

Keep the original order in put_hevc_\type\()_h12_8_neon; the
only suboptimal case there is for width=24. But specializing
an optimal variant for that would require more code, which
might not be worth it.

For the h16 case, this implementation would give a slowdown,
as it now loads the first 8 pixels separately from the rest, but
for larger widths, it is a gain. Therefore, keep the h16 case
as it was (but remove the outer loop), and create a new specialized
version for horizontal looping with 16 pixels at a time.

Before:                  Cortex A53      A72      A73  Graviton 3
put_hevc_qpel_h16_8_neon:     710.5    667.7    692.5   211.0
put_hevc_qpel_h32_8_neon:    2791.5   2643.5   2732.0   883.5
put_hevc_qpel_h64_8_neon:   10954.0  10657.0  10874.2  3241.5
After:
put_hevc_qpel_h16_8_neon:     697.5    663.5    705.7   212.5
put_hevc_qpel_h32_8_neon:    2767.2   2684.5   2791.2   920.5
put_hevc_qpel_h64_8_neon:   10559.2  10471.5  10932.2  3051.7
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  20 +++--
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 103 +++++++++++++++++-----
 2 files changed, 94 insertions(+), 29 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index d2f2a3681f..1e9f5e32db 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -109,6 +109,8 @@ void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff
                                       intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
                                       intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h32_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+                                      intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                          ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
                                          int width);
@@ -124,6 +126,9 @@ void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, c
 void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                           ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
                                           my, int width);
+void ff_hevc_put_hevc_qpel_uni_h32_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+                                          ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
+                                          my, int width);
 void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                         ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                         mx, intptr_t my, int width);
@@ -139,6 +144,9 @@ void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
 void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h32_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+                                         ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+                                         mx, intptr_t my, int width);
 
 #define NEON8_FNPROTO(fn, args, ext) \
     void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
@@ -335,28 +343,28 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel[3][0][1]      = ff_hevc_put_hevc_qpel_h8_8_neon;
         c->put_hevc_qpel[4][0][1]      =
         c->put_hevc_qpel[6][0][1]      = ff_hevc_put_hevc_qpel_h12_8_neon;
-        c->put_hevc_qpel[5][0][1]      =
+        c->put_hevc_qpel[5][0][1]      = ff_hevc_put_hevc_qpel_h16_8_neon;
         c->put_hevc_qpel[7][0][1]      =
         c->put_hevc_qpel[8][0][1]      =
-        c->put_hevc_qpel[9][0][1]      = ff_hevc_put_hevc_qpel_h16_8_neon;
+        c->put_hevc_qpel[9][0][1]      = ff_hevc_put_hevc_qpel_h32_8_neon;
         c->put_hevc_qpel_uni[1][0][1]  = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
         c->put_hevc_qpel_uni[2][0][1]  = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
         c->put_hevc_qpel_uni[3][0][1]  = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
         c->put_hevc_qpel_uni[4][0][1]  =
         c->put_hevc_qpel_uni[6][0][1]  = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
-        c->put_hevc_qpel_uni[5][0][1]  =
+        c->put_hevc_qpel_uni[5][0][1]  = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
         c->put_hevc_qpel_uni[7][0][1]  =
         c->put_hevc_qpel_uni[8][0][1]  =
-        c->put_hevc_qpel_uni[9][0][1]  = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
+        c->put_hevc_qpel_uni[9][0][1]  = ff_hevc_put_hevc_qpel_uni_h32_8_neon;
         c->put_hevc_qpel_bi[1][0][1]   = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
         c->put_hevc_qpel_bi[2][0][1]   = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
         c->put_hevc_qpel_bi[3][0][1]   = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
         c->put_hevc_qpel_bi[4][0][1]   =
         c->put_hevc_qpel_bi[6][0][1]   = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
-        c->put_hevc_qpel_bi[5][0][1]   =
+        c->put_hevc_qpel_bi[5][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
-        c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+        c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h32_8_neon;
 
         NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 432558bb95..0fcded344b 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -383,11 +383,9 @@ endfunc
 
 .ifc \type, qpel
 function ff_hevc_put_hevc_h16_8_neon, export=0
-        uxtl            v16.8h,  v16.8b
         uxtl            v17.8h,  v17.8b
         uxtl            v18.8h,  v18.8b
 
-        uxtl            v19.8h,  v19.8b
         uxtl            v20.8h,  v20.8b
         uxtl            v21.8h,  v21.8b
 
@@ -408,7 +406,6 @@ function ff_hevc_put_hevc_h16_8_neon, export=0
         mla             v28.8h,  v24.8h, v0.h[\i]
         mla             v29.8h,  v25.8h, v0.h[\i]
 .endr
-        subs            x9, x9, #2
         ret
 endfunc
 .endif
@@ -439,7 +436,10 @@ function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
 1:      ld1             {v16.8b-v18.8b}, [src], x13
         ld1             {v19.8b-v21.8b}, [x12], x13
 
+        uxtl            v16.8h,  v16.8b
+        uxtl            v19.8h,  v19.8b
         bl              ff_hevc_put_hevc_h16_8_neon
+        subs            x9, x9, #2
 
 .ifc \type, qpel
         st1             {v26.8h}, [dst], #16
@@ -504,7 +504,6 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
 .ifc \type, qpel_bi
         ldrh            w8, [sp] // width
         mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
-        lsl             x17, x5, #7 // src2b reset
         add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
 .endif
         sub             src, src, #3
@@ -519,11 +518,14 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
 .endif
         add             x10, dst, dststride // dstb
         add             x12, src, srcstride // srcb
-0:      mov             x9, height
+
 1:      ld1             {v16.8b-v18.8b}, [src], x13
         ld1             {v19.8b-v21.8b}, [x12], x13
 
+        uxtl            v16.8h,  v16.8b
+        uxtl            v19.8h,  v19.8b
         bl              ff_hevc_put_hevc_h16_8_neon
+        subs            height, height, #2
 
 .ifc \type, qpel
         st1             {v26.8h, v27.8h}, [dst], x14
@@ -550,28 +552,83 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
         st1             {v28.8b, v29.8b}, [x10], x14
 .endif
         b.gt            1b // double line
-        subs            width, width, #16
-        // reset src
-        msub            src, srcstride, height, src
-        msub            x12, srcstride, height, x12
-        // reset dst
-        msub            dst, dststride, height, dst
-        msub            x10, dststride, height, x10
+        ret             mx
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
+        load_filter     mx
+        sxtw            height, heightw
+        mov             mx, x30
 .ifc \type, qpel_bi
-        // reset xsrc
-        sub             x4,  x4,  x17
-        sub             x15, x15, x17
-        add             x4,  x4,  #32
-        add             x15, x15, #32
+        ldrh            w8, [sp] // width
+        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
+        lsl             x17, x5, #7 // src2b reset
+        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
+        sub             x16, x16, width, uxtw #1
 .endif
-        add             src, src, #16
-        add             x12, x12, #16
+        sub             src, src, #3
+        mov             mx, x30
+.ifc \type, qpel
+        mov             dststride, #(MAX_PB_SIZE << 1)
+        lsl             x13, srcstride, #1 // srcstridel
+        mov             x14, #(MAX_PB_SIZE << 2)
+        sub             x14, x14, width, uxtw #1
+.else
+        lsl             x14, dststride, #1 // dststridel
+        lsl             x13, srcstride, #1 // srcstridel
+        sub             x14, x14, width, uxtw
+.endif
+        sub             x13, x13, width, uxtw
+        sub             x13, x13, #8
+        add             x10, dst, dststride // dstb
+        add             x12, src, srcstride // srcb
+0:      mov             w9, width
+        ld1             {v16.8b}, [src], #8
+        ld1             {v19.8b}, [x12], #8
+        uxtl            v16.8h, v16.8b
+        uxtl            v19.8h, v19.8b
+1:
+        ld1             {v17.8b-v18.8b}, [src], #16
+        ld1             {v20.8b-v21.8b}, [x12], #16
+
+        bl              ff_hevc_put_hevc_h16_8_neon
+        subs            w9, w9, #16
+
+        mov             v16.16b, v18.16b
+        mov             v19.16b, v21.16b
 .ifc \type, qpel
-        add             dst, dst, #32
-        add             x10, x10, #32
+        st1             {v26.8h, v27.8h}, [dst], #32
+        st1             {v28.8h, v29.8h}, [x10], #32
+.else
+.ifc \type, qpel_bi
+        ld1             {v20.8h, v21.8h}, [ x4], #32
+        ld1             {v22.8h, v23.8h}, [x15], #32
+        sqadd           v26.8h, v26.8h, v20.8h
+        sqadd           v27.8h, v27.8h, v21.8h
+        sqadd           v28.8h, v28.8h, v22.8h
+        sqadd           v29.8h, v29.8h, v23.8h
+        sqrshrun        v26.8b, v26.8h, #7
+        sqrshrun        v27.8b, v27.8h, #7
+        sqrshrun        v28.8b, v28.8h, #7
+        sqrshrun        v29.8b, v29.8h, #7
 .else
-        add             dst, dst, #16
-        add             x10, x10, #16
+        sqrshrun        v26.8b, v26.8h, #6
+        sqrshrun        v27.8b, v27.8h, #6
+        sqrshrun        v28.8b, v28.8h, #6
+        sqrshrun        v29.8b, v29.8h, #6
+.endif
+        st1             {v26.8b, v27.8b}, [dst], #16
+        st1             {v28.8b, v29.8b}, [x10], #16
+.endif
+        b.gt            1b // double line
+        subs            height, height, #2
+        add             src, src, x13
+        add             x12, x12, x13
+        add             dst, dst, x14
+        add             x10, x10, x14
+.ifc \type, qpel_bi
+        add             x4,  x4,  x16
+        add             x15, x15, x16
 .endif
         b.gt            0b
         ret             mx
-- 
2.39.3 (Apple Git-146)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  parent reply	other threads:[~2024-03-25 15:03 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-25 15:02 [FFmpeg-devel] [PATCH 00/21] aarch64: hevc: Add missing hevc_pel NEON functions Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 01/21] aarch64: hevc: Reorder a misplaced function init line Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 02/21] aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 03/21] aarch64: hevc: Merge consecutive stores in put_hevc_\type\()_h16_8_neon Martin Storsjö
2024-03-25 15:02 ` Martin Storsjö [this message]
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 05/21] aarch64: hevc: Use ld1r instead of ldr+dup in hevc_qpel_uni_w_h Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 06/21] aarch64: hevc: Implement a neon version of put_hevc_epel_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 07/21] aarch64: hevc: Implement a neon version of hevc_epel_uni_w_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 08/21] aarch64: hevc: Split the epel_*_hv functions into two parts Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 09/21] aarch64: hevc: Reorder epel_hv functions to prepare for templating Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 10/21] aarch64: hevc: Produce epel_hv functions for both plain neon and i8mm Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 11/21] aarch64: hevc: Produce epel_uni_hv functions for both " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 12/21] aarch64: hevc: Produce epel_uni_w_hv " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 13/21] aarch64: hevc: Produce epel_bi_hv " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 14/21] aarch64: hevc: Implement a neon version of hevc_qpel_uni_w_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 15/21] aarch64: hevc: Split the qpel_*_hv functions into two parts Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 16/21] aarch64: hevc: Deduplicate the hevc_put_hevc_qpel_uni_w_hv*_8_end_neon functions Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 17/21] aarch64: hevc: Reorder qpel_hv functions to prepare for templating Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 18/21] aarch64: hevc: Produce plain neon versions of qpel_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 19/21] aarch64: hevc: Produce plain neon versions of qpel_uni_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 20/21] aarch64: hevc: Produce plain neon versions of qpel_uni_w_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 21/21] aarch64: hevc: Produce plain neon versions of qpel_bi_hv Martin Storsjö
2024-03-25 21:15 ` [FFmpeg-devel] [PATCH 00/21] aarch64: hevc: Add missing hevc_pel NEON functions Martin Storsjö
2024-03-25 21:56   ` J. Dekker
2024-03-26  6:01     ` Jean-Baptiste Kempf
2024-03-26  7:09       ` Martin Storsjö

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240325150243.59058-5-martin@martin.st \
    --to=martin@martin.st \
    --cc=Logan.Lyu@myais.com.cn \
    --cc=ffmpeg-devel@ffmpeg.org \
    --cc=jdek@itanimul.li \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git