Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: "Swinney, Jonathan" <jswinney@amazon.com>
To: Hubert Mazur <hum@semihalf.com>,
	"ffmpeg-devel@ffmpeg.org" <ffmpeg-devel@ffmpeg.org>
Cc: "martin@martin.st" <martin@martin.st>,
	"mw@semihalf.com" <mw@semihalf.com>,
	"upstream@semihalf.com" <upstream@semihalf.com>,
	"Pop, Sebastian" <spop@amazon.com>,
	"gjb@semihalf.com" <gjb@semihalf.com>
Subject: Re: [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4
Date: Fri, 22 Jul 2022 21:30:57 +0000
Message-ID: <EC41504D-2C95-4C98-92D3-C7D09FF0B164@amazon.com> (raw)
In-Reply-To: <20220715080228.686736-3-hum@semihalf.com>

As Martin noted, this patch doesn't build. Other than that, it would be nice if each line had a comment noting which of the 4 iterations the instruction belongs to. Since the instructions are manually reordered, that would make the code a little easier to read, in my opinion.

Thanks,
-- 

Jonathan Swinney

On 7/15/22, 3:03 AM, "Hubert Mazur" <hum@semihalf.com> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



    Provide neon implementation for sse4 function.

    Performance comparison tests are shown below.
    - sse_2_c: 74.0
    - sse_2_neon: 24.0

    Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.

    Signed-off-by: Hubert Mazur <hum@semihalf.com>
    ---
     libavcodec/aarch64/me_cmp_init_aarch64.c |  3 ++
     libavcodec/aarch64/me_cmp_neon.S         | 65 ++++++++++++++++++++++++
     2 files changed, 68 insertions(+)

    diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
    index 3ff5767bd0..72a2062e7e 100644
    --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
    +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
    @@ -32,6 +32,8 @@ int ff_pix_abs16_x2_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,

     int sse16_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
    +int sse4_neon(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
    +                  ptrdiff_t stride, int h);

     av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
     {
    @@ -44,5 +46,6 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)

             c->sad[0] = ff_pix_abs16_neon;
             c->sse[0] = sse16_neon;
    +        c->sse[2] = sse4_neon;
         }
     }
    diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
    index 88cd335443..bacf151314 100644
    --- a/libavcodec/aarch64/me_cmp_neon.S
    +++ b/libavcodec/aarch64/me_cmp_neon.S
    @@ -360,3 +360,68 @@ function sse16_neon, export=1
             ret

     endfunc
    +
    +function sse4_neon, export=1
    +        // Sum of squared byte differences over h rows of 4 bytes; result in w0.
    +        // x0 - unused
    +        // x1 - pix1
    +        // x2 - pix2
    +        // x3 - stride
    +        // w4 - h (NOTE(review): h <= 0 is not handled; presumably always > 0)
    +
    +        movi            d18, #0                         // d18 = running total
    +        cmp             w4, #4
    +        b.lt            2f                              // fewer than 4 rows: one at a time
    +
    +// make 4 iterations at once
    +1:
    +        // Each row is 4 bytes, loaded into lane s[0]. The upper lanes of the
    +        // loaded registers hold stale data, which is harmless: only the low
    +        // .4h half of every uabdl result is squared and accumulated.
    +
    +        ld1             {v0.s}[0], [x1], x3             // row 0: load pix1
    +        ld1             {v1.s}[0], [x2], x3             // row 0: load pix2
    +        uabdl           v30.8h, v0.8b, v1.8b            // row 0: |pix1 - pix2|
    +        ld1             {v2.s}[0], [x1], x3             // row 1: load pix1
    +        ld1             {v3.s}[0], [x2], x3             // row 1: load pix2
    +        umull           v16.4s, v30.4h, v30.4h          // row 0: v16  = diff^2
    +        uabdl           v29.8h, v2.8b, v3.8b            // row 1: |pix1 - pix2|
    +        ld1             {v4.s}[0], [x1], x3             // row 2: load pix1
    +        ld1             {v5.s}[0], [x2], x3             // row 2: load pix2
    +        umlal           v16.4s, v29.4h, v29.4h          // row 1: v16 += diff^2
    +        uabdl           v28.8h, v4.8b, v5.8b            // row 2: |pix1 - pix2|
    +        ld1             {v6.s}[0], [x1], x3             // row 3: load pix1
    +        ld1             {v7.s}[0], [x2], x3             // row 3: load pix2
    +        umlal           v16.4s, v28.4h, v28.4h          // row 2: v16 += diff^2
    +        uabdl           v27.8h, v6.8b, v7.8b            // row 3: |pix1 - pix2|
    +        umlal           v16.4s, v27.4h, v27.4h          // row 3: v16 += diff^2
    +
    +        uaddlv          d17, v16.4s                     // rows 0-3: sum the 4 lanes
    +        add             d18, d18, d17                   // fold into running total
    +
    +        sub             w4, w4, #4
    +        cmp             w4, #4
    +        b.ge            1b                              // at least 4 rows left
    +
    +        cbnz            w4, 2f                          // 1-3 rows left over
    +        fmov            w0, s18                         // return low 32 bits of total
    +
    +        ret
    +
    +// iterate by one
    +2:
    +        ld1             {v0.s}[0], [x1], x3             // single row: load pix1
    +        ld1             {v1.s}[0], [x2], x3             // single row: load pix2
    +        uabdl           v30.8h, v0.8b, v1.8b            // single row: |pix1 - pix2|
    +        umull           v16.4s, v30.4h, v30.4h          // single row: diff^2
    +
    +        uaddlv          d17, v16.4s                     // single row: sum the 4 lanes
    +        add             d18, d18, d17                   // fold into running total
    +
    +        subs            w4, w4, #1
    +        b.ne            2b
    +        fmov            w0, s18                         // return low 32 bits of total
    +
    +        ret
    +
    +endfunc
    --
    2.34.1


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  parent reply	other threads:[~2022-07-22 21:31 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-15  8:02 [FFmpeg-devel] [PATCH 0/5] Add neon implementation for me_cmp functions Hubert Mazur
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 1/5] lavc/aarch64: Add neon implementation for sse16 Hubert Mazur
2022-07-25 11:12   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-03 13:22     ` Martin Storsjö
2022-08-04  7:46     ` Martin Storsjö
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
2022-07-21 21:43   ` Martin Storsjö
2022-07-22 21:30   ` Swinney, Jonathan [this message]
2022-07-25 11:15   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-07-28 18:50     ` Swinney, Jonathan
2022-07-28 18:51       ` Swinney, Jonathan
2022-07-29  7:26         ` Hubert Mazur
2022-08-04  8:00     ` Martin Storsjö
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 3/5] lavc/aarch64: Add neon implementation for pix_abs16_y2 Hubert Mazur
2022-07-25 11:17   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04  8:08     ` Martin Storsjö
2022-08-04  8:12     ` Martin Storsjö
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 4/5] lavc/aarch64: Add neon implementation for sse8 Hubert Mazur
2022-07-25 11:18   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04  8:04     ` Martin Storsjö
2022-07-15  8:02 ` [FFmpeg-devel] [PATCH 5/5] lavc/aarch64: Add neon implementation for pix_abs8 Hubert Mazur
2022-07-25 11:21   ` [FFmpeg-devel] [PATCH] " Hubert Mazur
2022-08-04  8:10     ` Martin Storsjö
2022-08-16 12:20 [FFmpeg-devel] [PATCH 0/5] Provide neon implementation for me_cmp functions Hubert Mazur
2022-08-16 12:20 ` [FFmpeg-devel] [PATCH 2/5] lavc/aarch64: Add neon implementation for sse4 Hubert Mazur
2022-08-18  9:10   ` Martin Storsjö

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=EC41504D-2C95-4C98-92D3-C7D09FF0B164@amazon.com \
    --to=jswinney@amazon.com \
    --cc=ffmpeg-devel@ffmpeg.org \
    --cc=gjb@semihalf.com \
    --cc=hum@semihalf.com \
    --cc=martin@martin.st \
    --cc=mw@semihalf.com \
    --cc=spop@amazon.com \
    --cc=upstream@semihalf.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git