Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: "Swinney, Jonathan" <jswinney@amazon.com>
To: "Martin Storsjö" <martin@martin.st>,
	"ffmpeg-devel@ffmpeg.org" <ffmpeg-devel@ffmpeg.org>
Subject: Re: [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration
Date: Fri, 15 Jul 2022 19:32:11 +0000
Message-ID: <8E07A574-9F34-48B1-98BC-95A5B676A458@amazon.com> (raw)
In-Reply-To: <20220713204854.3114817-5-martin@martin.st>

If the max height is just 16, then this should be fine. I assumed that h could have a much higher value (>1024), but if that is not the case, then this is a useful optimization.

Thanks!

-- 

Jonathan Swinney

On 7/13/22, 3:49 PM, "Martin Storsjö" <martin@martin.st> wrote:

    CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.



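    The max height is 16 and the max difference per pixel is 255, so
    each 16-bit lane of a .8h vector accumulates at most 16*255 = 4080,
    far below the 65535 limit of an unsigned 16-bit element. Thus keep
    accumulating in two .8h vectors, and just do the final accumulation
    (uaddlv) once at the end instead of once per loop iteration.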

    This requires a minor register renumbering in ff_pix_abs16_xy2_neon.

    Before:       Cortex A53    A72    A73   Graviton 3
    pix_abs_0_0_neon:   97.7   47.0   37.5   22.7
    pix_abs_0_1_neon:  154.0   59.0   52.0   25.0
    pix_abs_0_3_neon:  179.7   96.7   87.5   41.2
    After:
    pix_abs_0_0_neon:   96.0   39.2   31.2   22.0
    pix_abs_0_1_neon:  150.7   59.7   46.2   23.7
    pix_abs_0_3_neon:  175.7   83.7   81.7   38.2

    ---
    I remember suggesting this before, and having it dismissed for
    some reason I don't remember - maybe that the element size wasn't
    big enough to hold the intermediate results? At least as far as I
    can see, it can hold the results just fine, and this passes all
    tests.
    ---
     libavcodec/aarch64/me_cmp_neon.S | 102 +++++++++++++++----------------
     1 file changed, 49 insertions(+), 53 deletions(-)

    diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
    index 89546869fb..cda7ce0408 100644
    --- a/libavcodec/aarch64/me_cmp_neon.S
    +++ b/libavcodec/aarch64/me_cmp_neon.S
    @@ -27,15 +27,16 @@ function ff_pix_abs16_neon, export=1
             // x3           ptrdiff_t stride
             // w4           int h
             cmp             w4, #4                      // if h < 4, jump to completion section
    -        movi            v18.4S, #0                  // clear result accumulator
    +        movi            v16.8h, #0                  // clear result accumulator
    +        movi            v17.8h, #0                  // clear result accumulator
             b.lt            2f
     1:
             ld1             {v0.16b}, [x1], x3          // load pix1
             ld1             {v4.16b}, [x2], x3          // load pix2
             ld1             {v1.16b}, [x1], x3          // load pix1
             ld1             {v5.16b}, [x2], x3          // load pix2
    -        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    -        uabdl2          v17.8h, v0.16b, v4.16b
    +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    +        uabal2          v17.8h, v0.16b, v4.16b
             ld1             {v2.16b}, [x1], x3          // load pix1
             ld1             {v6.16b}, [x2], x3          // load pix2
             uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
    @@ -48,27 +49,26 @@ function ff_pix_abs16_neon, export=1
             uabal           v16.8h, v3.8b, v7.8b
             uabal2          v17.8h, v3.16b, v7.16b
             cmp             w4, #4                      // if h >= 4, loop
    -        add             v16.8h, v16.8h, v17.8h
    -        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    -        add             d18, d16, d18               // add to the end result register

             b.ge            1b
             cbnz            w4, 2f                      // if iterations remain, jump to completion section

    -        fmov            w0, s18                     // copy result to general purpose register
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    +        fmov            w0, s16                     // copy result to general purpose register
             ret

     2:
             ld1             {v0.16b}, [x1], x3          // load pix1
             ld1             {v4.16b}, [x2], x3          // load pix2
    -        uabdl           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    -        uabal2          v16.8h, v0.16b, v4.16b
             subs            w4, w4, #1                  // h -= 1
    -        addv            h16, v16.8h                 // add up v16
    -        add             d18, d16, d18               // add to result
    +        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
    +        uabal2          v17.8h, v0.16b, v4.16b
             b.ne            2b

    -        fmov            w0, s18                     // copy result to general purpose register
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
    +        fmov            w0, s16                     // copy result to general purpose register
             ret
     endfunc

    @@ -80,7 +80,8 @@ function ff_pix_abs16_xy2_neon, export=1
             // w4           int h

             add             x5, x2, x3                  // use x5 to hold uint8_t *pix3
    -        movi            v0.2d, #0                   // initialize the result register
    +        movi            v21.8h, #0                  // initialize the result register
    +        movi            v22.8h, #0                  // initialize the result register

             // Load initial pix2 values for either the unrolled version or completion version.
             ldur            q4, [x2, #1]                // load pix2+1
    @@ -119,15 +120,15 @@ function ff_pix_abs16_xy2_neon, export=1
             uaddl           v2.8h, v6.8b, v7.8b         // pix3 + pix3+1 0..7
             uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15

    -        ldur            q22, [x5, #1]               // load pix3+1
    +        ldur            q7, [x5, #1]                // load pix3+1

             add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
             add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above

    -        uabdl           v24.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
    -        uabdl2          v23.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0
    +        uabal           v21.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
    +        uabal2          v22.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0

    -        ld1             {v21.16b}, [x5], x3         // load pix3
    +        ld1             {v6.16b}, [x5], x3          // load pix3
             ld1             {v20.16b}, [x1], x3         // load pix1

             rshrn           v26.8b, v26.8h, #2          // shift right 2 0..7 (rounding shift right)
    @@ -140,33 +141,33 @@ function ff_pix_abs16_xy2_neon, export=1
             rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
             rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15

    -        uabal           v24.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
    -        uabal2          v23.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1
    +        uabal           v21.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
    +        uabal2          v22.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1

    -        uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
    -        uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
    +        uaddl           v2.8h, v6.8b,  v7.8b        // pix3 + pix3+1 0..7
    +        uaddl2          v3.8h, v6.16b, v7.16b       // pix3 + pix3+1 8..15
             add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
             add             v31.8h, v5.8h, v3.8h        // add up 8..15, using pix2 + pix2+1 values from pix3 above
             rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
             rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15

    -        uabal           v24.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
    -        uabal2          v23.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2
    -
             sub             w4, w4, #4                  // h -= 4

    -        uabal           v24.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
    -        uabal2          v23.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3
    +        uabal           v21.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
    +        uabal2          v22.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2

             cmp             w4, #4                      // loop if h >= 4

    -        add             v4.8h, v23.8h, v24.8h
    -        uaddlv          s4, v4.8h                   // finish adding up accumulated values
    -        add             d0, d0, d4                  // add the value to the top level accumulator
    +
    +        uabal           v21.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
    +        uabal2          v22.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3

             b.ge            1b
             cbnz            w4, 2f                      // if iterations remain jump to completion section

    +        add             v4.8h, v21.8h, v22.8h
    +        uaddlv          s0, v4.8h                   // finish adding up accumulated values
    +
             fmov            w0, s0                      // copy result to general purpose register
             ret
     2:
    @@ -182,20 +183,18 @@ function ff_pix_abs16_xy2_neon, export=1
             add             v16.8h, v2.8h, v18.8h       // add up 0..7, using pix2 + pix2+1 values from previous iteration
             add             v17.8h, v3.8h, v19.8h       // add up 8..15, using pix2 + pix2+1 values from previous iteration
             // divide by 4 to compute the average of values summed above
    -        urshr           v16.8h, v16.8h, #2          // shift right by 2 0..7 (rounding shift right)
    -        urshr           v17.8h, v17.8h, #2          // shift right by 2 8..15
    -
    -        uxtl2           v7.8h, v1.16b               // 8->16 bits pix1 8..15
    -        uxtl            v1.8h, v1.8b                // 8->16 bits pix1 0..7
    +        rshrn           v16.8b,  v16.8h, #2         // shift right by 2 0..7 (rounding shift right)
    +        rshrn2          v16.16b, v17.8h, #2         // shift right by 2 8..15

    -        uabd            v6.8h, v1.8h, v16.8h        // absolute difference 0..7
    -        uaba            v6.8h, v7.8h, v17.8h        // absolute difference accumulate 8..15
    +        uabal           v21.8h, v1.8b,  v16.8b      // absolute difference 0..7
    +        uabal2          v22.8h, v1.16b, v16.16b     // absolute difference accumulate 8..15
             mov             v2.16b, v18.16b             // pix3 -> pix2
             mov             v3.16b, v19.16b             // pix3+1 -> pix2+1
    -        uaddlv          s6, v6.8h                   // add up accumulator in v6
    -        add             d0, d0, d6                  // add to the final result

             b.ne            2b                          // loop if h > 0
    +
    +        add             v4.8h, v21.8h, v22.8h
    +        uaddlv          s0, v4.8h                   // finish adding up accumulated values
             fmov            w0, s0                      // copy result to general purpose register
             ret
     endfunc
    @@ -209,7 +208,8 @@ function ff_pix_abs16_x2_neon, export=1

             cmp             w4, #4
             // initialize buffers
    -        movi            d20, #0
    +        movi            v16.8h, #0
    +        movi            v17.8h, #0
             add             x5, x2, #1 // pix2 + 1
             b.lt            2f

    @@ -224,9 +224,9 @@ function ff_pix_abs16_x2_neon, export=1
             ld1             {v2.16b}, [x5], x3
             urhadd          v30.16b, v1.16b, v2.16b
             ld1             {v0.16b}, [x1], x3
    -        uabdl           v16.8h, v0.8b, v30.8b
    +        uabal           v16.8h, v0.8b, v30.8b
             ld1             {v4.16b}, [x2], x3
    -        uabdl2          v17.8h, v0.16b, v30.16b
    +        uabal2          v17.8h, v0.16b, v30.16b
             ld1             {v5.16b}, [x5], x3
             urhadd          v29.16b, v4.16b, v5.16b
             ld1             {v3.16b}, [x1], x3
    @@ -238,20 +238,15 @@ function ff_pix_abs16_x2_neon, export=1
             ld1             {v6.16b}, [x1], x3
             uabal           v16.8h, v6.8b, v28.8b
             ld1             {v24.16b}, [x2], x3
    +        sub             w4, w4, #4
             uabal2          v17.8h, v6.16b, v28.16b
             ld1             {v25.16b}, [x5], x3
             urhadd          v27.16b, v24.16b, v25.16b
             ld1             {v23.16b}, [x1], x3
    +        cmp             w4, #4
             uabal           v16.8h, v23.8b, v27.8b
             uabal2          v17.8h, v23.16b, v27.16b

    -        sub             w4, w4, #4
    -
    -        add             v16.8h, v16.8h, v17.8h
    -        uaddlv          s16, v16.8h
    -        cmp             w4, #4
    -        add             d20, d20, d16
    -
             b.ge            1b
             cbz             w4, 3f

    @@ -259,18 +254,19 @@ function ff_pix_abs16_x2_neon, export=1
     2:
             ld1             {v1.16b}, [x2], x3
             ld1             {v2.16b}, [x5], x3
    +        subs            w4, w4, #1
             urhadd          v29.16b, v1.16b, v2.16b
             ld1             {v0.16b}, [x1], x3
    -        uabd            v28.16b, v0.16b, v29.16b
    +        uabal           v16.8h, v0.8b,  v29.8b
    +        uabal2          v17.8h, v0.16b, v29.16b

    -        uaddlv          h28, v28.16b
    -        subs            w4, w4, #1

    -        add             d20, d20, d28
             b.ne            2b

     3:
    -        fmov            w0, s20
    +        add             v16.8h, v16.8h, v17.8h
    +        uaddlv          s16, v16.8h
    +        fmov            w0, s16

             ret
     endfunc
    --
    2.25.1


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  reply	other threads:[~2022-07-15 19:32 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-13 20:48 [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Martin Storsjö
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 2/5] checkasm: motion: Make the benchmarks more stable Martin Storsjö
2022-07-15 19:35   ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 3/5] aarch64: me_cmp: Interleave some of the loads in ff_pix_abs16_xy2_neon Martin Storsjö
2022-07-15 19:34   ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 4/5] aarch64: me_cmp: Switch from uabd to uabal " Martin Storsjö
2022-07-15 19:38   ` Swinney, Jonathan
2022-07-13 20:48 ` [FFmpeg-devel] [PATCH 5/5] aarch64: me_cmp: Don't do uaddlv once per iteration Martin Storsjö
2022-07-15 19:32   ` Swinney, Jonathan [this message]
2022-07-15 19:56     ` Martin Storsjö
2022-07-15 21:19       ` Michael Niedermayer
2022-07-15 21:25         ` Martin Storsjö
2022-07-16 11:23           ` Michael Niedermayer
2022-07-16 12:30             ` Martin Storsjö
2022-07-16 13:20               ` Michael Niedermayer
2022-07-16 14:23                 ` Martin Storsjö
2022-07-16 12:50             ` Ronald S. Bultje
2022-07-16 13:06               ` Michael Niedermayer
2022-07-16  9:18         ` Martin Storsjö
2022-07-15 19:35 ` [FFmpeg-devel] [PATCH 1/5] libavcodec: aarch64: Don't clobber v8 in the h%4 case in ff_pix_abs16_xy2_neon Swinney, Jonathan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8E07A574-9F34-48B1-98BC-95A5B676A458@amazon.com \
    --to=jswinney@amazon.com \
    --cc=ffmpeg-devel@ffmpeg.org \
    --cc=martin@martin.st \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git