Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: "Martin Storsjö" <martin@martin.st>
To: ffmpeg-devel@ffmpeg.org
Cc: Logan Lyu <Logan.Lyu@myais.com.cn>, "J . Dekker" <jdek@itanimul.li>
Subject: [FFmpeg-devel] [PATCH 16/21] aarch64: hevc: Deduplicate the hevc_put_hevc_qpel_uni_w_hv*_8_end_neon functions
Date: Mon, 25 Mar 2024 17:02:38 +0200
Message-ID: <20240325150243.59058-17-martin@martin.st> (raw)
In-Reply-To: <20240325150243.59058-1-martin@martin.st>

The hv32 and hv64 functions were identical - both loop and
process 16 pixels at a time.

The hv16 function was nearly identical, except that it lacked the
outer loop (and iterated with sp directly instead of a separate register).

Given the size of these functions, the extra cost of the outer
loop is negligible, so use the same function for hv16 as well.

This removes over 200 lines of duplicated assembly, and reduces the
binary size by over 4 KB.
---
 libavcodec/aarch64/hevcdsp_qpel_neon.S | 220 +------------------------
 1 file changed, 3 insertions(+), 217 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index c04e8dbea8..06832603d9 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -4381,231 +4381,17 @@ function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm, export=1
         b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
 endfunc
 
-function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
-        ldp             q16, q1, [sp]
-        add             sp, sp, x10
-        ldp             q17, q2, [sp]
-        add             sp, sp, x10
-        ldp             q18, q3, [sp]
-        add             sp, sp, x10
-        ldp             q19, q4, [sp]
-        add             sp, sp, x10
-        ldp             q20, q5, [sp]
-        add             sp, sp, x10
-        ldp             q21, q6, [sp]
-        add             sp, sp, x10
-        ldp             q22, q7, [sp]
-        add             sp, sp, x10
-1:
-        ldp             q23, q31, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
-        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
-        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
-        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q16, q1, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
-        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
-        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
-        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q17, q2, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
-        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
-        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
-        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q18, q3, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
-        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
-        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
-        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q19, q4, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
-        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
-        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
-        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q20, q5, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
-        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
-        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
-        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q21, q6, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
-        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
-        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
-        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q22, q7, [sp]
-        add             sp, sp, x10
-        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
-        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
-        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
-        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.hi            1b
-
-2:
-        QPEL_UNI_W_HV_END
-        ret
-endfunc
-
-
 function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
         QPEL_UNI_W_HV_HEADER 32
-        b               hevc_put_hevc_qpel_uni_w_hv32_8_end_neon
-endfunc
-
-function hevc_put_hevc_qpel_uni_w_hv32_8_end_neon
-        mov             x11, sp
-        mov             w12, w22
-        mov             x13, x20
-        mov             x14, sp
-3:
-        ldp             q16, q1, [x11]
-        add             x11, x11, x10
-        ldp             q17, q2, [x11]
-        add             x11, x11, x10
-        ldp             q18, q3, [x11]
-        add             x11, x11, x10
-        ldp             q19, q4, [x11]
-        add             x11, x11, x10
-        ldp             q20, q5, [x11]
-        add             x11, x11, x10
-        ldp             q21, q6, [x11]
-        add             x11, x11, x10
-        ldp             q22, q7, [x11]
-        add             x11, x11, x10
-1:
-        ldp             q23, q31, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
-        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
-        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
-        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q16, q1, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
-        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
-        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
-        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q17, q2, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
-        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
-        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
-        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q18, q3, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
-        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
-        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
-        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q19, q4, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
-        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
-        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
-        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q20, q5, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
-        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
-        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
-        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q21, q6, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
-        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
-        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
-        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.eq            2f
-
-        ldp             q22, q7, [x11]
-        add             x11, x11, x10
-        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
-        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
-        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
-        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
-        QPEL_UNI_W_HV_16
-        subs            w22, w22, #1
-        b.hi            1b
-2:
-        subs            w27, w27, #16
-        add             x11, x14, #32
-        add             x20, x13, #16
-        mov             w22, w12
-        mov             x14, x11
-        mov             x13, x20
-        b.hi            3b
-        QPEL_UNI_W_HV_END
-        ret
+        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
 endfunc
 
 function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
         QPEL_UNI_W_HV_HEADER 64
-        b               hevc_put_hevc_qpel_uni_w_hv64_8_end_neon
+        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
 endfunc
 
-function hevc_put_hevc_qpel_uni_w_hv64_8_end_neon
+function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
         mov             x11, sp
         mov             w12, w22
         mov             x13, x20
-- 
2.39.3 (Apple Git-146)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  parent reply	other threads:[~2024-03-25 15:05 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-25 15:02 [FFmpeg-devel] [PATCH 00/21] aarch64: hevc: Add missing hevc_pel NEON functions Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 01/21] aarch64: hevc: Reorder a misplaced function init line Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 02/21] aarch64: hevc: Don't iterate with sp in ff_hevc_put_hevc_qpel_uni_w_hv32/64_8_neon_i8mm Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 03/21] aarch64: hevc: Merge consecutive stores in put_hevc_\type\()_h16_8_neon Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 04/21] aarch64: hevc: Specialize put_hevc_\type\()_h*_8_neon for horizontal looping Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 05/21] aarch64: hevc: Use ld1r instead of ldr+dup in hevc_qpel_uni_w_h Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 06/21] aarch64: hevc: Implement a neon version of put_hevc_epel_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 07/21] aarch64: hevc: Implement a neon version of hevc_epel_uni_w_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 08/21] aarch64: hevc: Split the epel_*_hv functions into two parts Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 09/21] aarch64: hevc: Reorder epel_hv functions to prepare for templating Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 10/21] aarch64: hevc: Produce epel_hv functions for both plain neon and i8mm Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 11/21] aarch64: hevc: Produce epel_uni_hv functions for both " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 12/21] aarch64: hevc: Produce epel_uni_w_hv " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 13/21] aarch64: hevc: Produce epel_bi_hv " Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 14/21] aarch64: hevc: Implement a neon version of hevc_qpel_uni_w_h*_8 Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 15/21] aarch64: hevc: Split the qpel_*_hv functions into two parts Martin Storsjö
2024-03-25 15:02 ` Martin Storsjö [this message]
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 17/21] aarch64: hevc: Reorder qpel_hv functions to prepare for templating Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 18/21] aarch64: hevc: Produce plain neon versions of qpel_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 19/21] aarch64: hevc: Produce plain neon versions of qpel_uni_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 20/21] aarch64: hevc: Produce plain neon versions of qpel_uni_w_hv Martin Storsjö
2024-03-25 15:02 ` [FFmpeg-devel] [PATCH 21/21] aarch64: hevc: Produce plain neon versions of qpel_bi_hv Martin Storsjö
2024-03-25 21:15 ` [FFmpeg-devel] [PATCH 00/21] aarch64: hevc: Add missing hevc_pel NEON functions Martin Storsjö
2024-03-25 21:56   ` J. Dekker
2024-03-26  6:01     ` Jean-Baptiste Kempf
2024-03-26  7:09       ` Martin Storsjö

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240325150243.59058-17-martin@martin.st \
    --to=martin@martin.st \
    --cc=Logan.Lyu@myais.com.cn \
    --cc=ffmpeg-devel@ffmpeg.org \
    --cc=jdek@itanimul.li \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git