From: John Cox <jc@kynesim.co.uk> To: ffmpeg-devel@ffmpeg.org Cc: John Cox <jc@kynesim.co.uk> Subject: [FFmpeg-devel] [PATCH v1 6/6] swscale: Add aarch64 functions for RGB24->YUV420P Date: Sun, 20 Aug 2023 15:10:22 +0000 Message-ID: <20230820151022.2204421-7-jc@kynesim.co.uk> (raw) In-Reply-To: <20230820151022.2204421-1-jc@kynesim.co.uk> Neon RGB24->YUV420P and BGR24->YUV420P functions. Works on 16 pixel blocks and can do any width or height, though for widths less than 32 or so the C is likely faster. Signed-off-by: John Cox <jc@kynesim.co.uk> --- libswscale/aarch64/rgb2rgb.c | 8 + libswscale/aarch64/rgb2rgb_neon.S | 356 ++++++++++++++++++++++++++++++ 2 files changed, 364 insertions(+) diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c index a9bf6ff9e0..b2d68c1df3 100644 --- a/libswscale/aarch64/rgb2rgb.c +++ b/libswscale/aarch64/rgb2rgb.c @@ -30,6 +30,12 @@ void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride); +void ff_bgr24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); +void ff_rgb24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst, + uint8_t *vdst, int width, int height, int lumStride, + int chromStride, int srcStride, int32_t *rgb2yuv); av_cold void rgb2rgb_init_aarch64(void) { @@ -37,5 +43,7 @@ av_cold void rgb2rgb_init_aarch64(void) if (have_neon(cpu_flags)) { interleaveBytes = ff_interleave_bytes_neon; + ff_rgb24toyv12 = ff_rgb24toyv12_neon; + ff_bgr24toyv12 = ff_bgr24toyv12_neon; } } diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S index d81110ec57..b15e69a3bd 100644 --- a/libswscale/aarch64/rgb2rgb_neon.S +++ b/libswscale/aarch64/rgb2rgb_neon.S @@ -77,3 +77,359 @@ function ff_interleave_bytes_neon, export=1 0: ret endfunc + +// Expand rgb2 into r0+r1/g0+g1/b0+b1 
+.macro XRGB3Y r0, g0, b0, r1, g1, b1, r2, g2, b2 + uxtl \r0\().8h, \r2\().8b + uxtl \g0\().8h, \g2\().8b + uxtl \b0\().8h, \b2\().8b + + uxtl2 \r1\().8h, \r2\().16b + uxtl2 \g1\().8h, \g2\().16b + uxtl2 \b1\().8h, \b2\().16b +.endm + +// Expand rgb2 into r0+r1/g0+g1/b0+b1 +// and pick every other el to put back into rgb2 for chroma +.macro XRGB3YC r0, g0, b0, r1, g1, b1, r2, g2, b2 + XRGB3Y \r0, \g0, \b0, \r1, \g1, \b1, \r2, \g2, \b2 + + bic \r2\().8h, #0xff, LSL #8 + bic \g2\().8h, #0xff, LSL #8 + bic \b2\().8h, #0xff, LSL #8 +.endm + +.macro SMLAL3 d0, d1, s0, s1, s2, c0, c1, c2 + smull \d0\().4s, \s0\().4h, \c0 + smlal \d0\().4s, \s1\().4h, \c1 + smlal \d0\().4s, \s2\().4h, \c2 + smull2 \d1\().4s, \s0\().8h, \c0 + smlal2 \d1\().4s, \s1\().8h, \c1 + smlal2 \d1\().4s, \s2\().8h, \c2 +.endm + +// d0 may be s0 +// s0, s2 corrupted +.macro SHRN_Y d0, s0, s1, s2, s3, k128h + shrn \s0\().4h, \s0\().4s, #12 + shrn2 \s0\().8h, \s1\().4s, #12 + add \s0\().8h, \s0\().8h, \k128h\().8h // +128 (>> 3 = 16) + sqrshrun \d0\().8b, \s0\().8h, #3 + shrn \s2\().4h, \s2\().4s, #12 + shrn2 \s2\().8h, \s3\().4s, #12 + add \s2\().8h, \s2\().8h, \k128h\().8h + sqrshrun2 \d0\().16b, v28.8h, #3 +.endm + +.macro SHRN_C d0, s0, s1, k128b + shrn \s0\().4h, \s0\().4s, #14 + shrn2 \s0\().8h, \s1\().4s, #14 + sqrshrn \s0\().8b, \s0\().8h, #1 + add \d0\().8b, \s0\().8b, \k128b\().8b // +128 +.endm + +.macro STB2V s0, n, a + st1 {\s0\().b}[(\n+0)], [\a], #1 + st1 {\s0\().b}[(\n+1)], [\a], #1 +.endm + +.macro STB4V s0, n, a + STB2V \s0, (\n+0), \a + STB2V \s0, (\n+2), \a +.endm + + +// void ff_bgr24toyv12_neon( +// const uint8_t *src, // x0 +// uint8_t *ydst, // x1 +// uint8_t *udst, // x2 +// uint8_t *vdst, // x3 +// int width, // w4 +// int height, // w5 +// int lumStride, // w6 +// int chromStride, // w7 +// int srcStr, // [sp, #0] +// int32_t *rgb2yuv); // [sp, #8] + +function ff_bgr24toyv12_neon, export=1 + ldr x15, [sp, #8] + ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 + ld3 {v3.s, v4.s, 
v5.s}[1], [x15], #12 + ld3 {v3.s, v4.s, v5.s}[2], [x15] + mov v6.16b, v3.16b + mov v3.16b, v5.16b + mov v5.16b, v6.16b + b 99f +endfunc + +// void ff_rgb24toyv12_neon( +// const uint8_t *src, // x0 +// uint8_t *ydst, // x1 +// uint8_t *udst, // x2 +// uint8_t *vdst, // x3 +// int width, // w4 +// int height, // w5 +// int lumStride, // w6 +// int chromStride, // w7 +// int srcStr, // [sp, #0] +// int32_t *rgb2yuv); // [sp, #8] (including Mac) + +// regs +// v0-2 Src bytes - reused as chroma src +// v3-5 Coeffs (packed very inefficiently - could be squashed) +// v6 128b +// v7 128h +// v8-15 Reserved +// v16-18 Lo Src expanded as H +// v19 - +// v20-22 Hi Src expanded as H +// v23 - +// v24 U out +// v25 U tmp +// v26 Y out +// v27-29 Y tmp +// v30 V out +// v31 V tmp + +function ff_rgb24toyv12_neon, export=1 + ldr x15, [sp, #8] + ld3 {v3.s, v4.s, v5.s}[0], [x15], #12 + ld3 {v3.s, v4.s, v5.s}[1], [x15], #12 + ld3 {v3.s, v4.s, v5.s}[2], [x15] + +99: + ldr w14, [sp, #0] + movi v7.8b, #128 + uxtl v6.8h, v7.8b + // Ensure if nothing to do then we do nothing + cmp w4, #0 + b.le 90f + cmp w5, #0 + b.le 90f + // If w % 16 != 0 then -16 so we do main loop 1 fewer times with + // the remainder done in the tail + tst w4, #15 + b.eq 1f + sub w4, w4, #16 +1: + +// -------------------- Even line body - YUV +11: + subs w9, w4, #0 + mov x10, x0 + mov x11, x1 + mov x12, x2 + mov x13, x3 + b.lt 12f + + ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + subs w9, w9, #16 + b.le 13f + +10: + XRGB3YC v16, v17, v18, v20, v21, v22, v0, v1, v2 + + // Testing shows it is faster to stack the smull/smlal ops together + // rather than interleave them between channels and indeed even the + // shift/add sections seem happier not interleaved + + // Y0 + SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0] + // Y1 + SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0] + SHRN_Y v26, v26, v27, v28, v29, v6 + + // U + // Vector subscript *2 as we loaded into S but are only using H + SMLAL3 v24, 
v25, v0, v1, v2, v3.h[2], v4.h[2], v5.h[2] + + // V + SMLAL3 v30, v31, v0, v1, v2, v3.h[4], v4.h[4], v5.h[4] + + ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + + SHRN_C v24, v24, v25, v7 + SHRN_C v30, v30, v31, v7 + + subs w9, w9, #16 + + st1 {v26.16b}, [x11], #16 + st1 {v24.8b}, [x12], #8 + st1 {v30.8b}, [x13], #8 + + b.gt 10b + +// -------------------- Even line tail - YUV +// If width % 16 == 0 then simply runs once with preloaded RGB +// If other then deals with preload & then does remaining tail + +13: + // Body is simple copy of main loop body minus preload + + XRGB3YC v16, v17, v18, v20, v21, v22, v0, v1, v2 + // Y0 + SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0] + // Y1 + SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0] + SHRN_Y v26, v26, v27, v28, v29, v6 + // U + SMLAL3 v24, v25, v0, v1, v2, v3.h[2], v4.h[2], v5.h[2] + // V + SMLAL3 v30, v31, v0, v1, v2, v3.h[4], v4.h[4], v5.h[4] + + cmp w9, #-16 + + SHRN_C v24, v24, v25, v7 + SHRN_C v30, v30, v31, v7 + + // Here: + // w9 == 0 width % 16 == 0, tail done + // w9 > -16 1st tail done (16 pels), remainder still to go + // w9 == -16 shouldn't happen + // w9 > -32 2nd tail done + // w9 <= -32 shouldn't happen + + b.lt 2f + st1 {v26.16b}, [x11], #16 + st1 {v24.8b}, [x12], #8 + st1 {v30.8b}, [x13], #8 + cbz w9, 3f + +12: + sub w9, w9, #16 + + tbz w9, #3, 1f + ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 +1: tbz w9, #2, 1f + ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 +1: tbz w9, #1, 1f + ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 +1: tbz w9, #0, 13b + ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 + b 13b + +2: + tbz w9, #3, 1f + st1 {v26.8b}, [x11], #8 + STB4V v24, 0, x12 + STB4V v30, 0, x13 +1: tbz w9, #2, 1f + STB4V v26, 8, x11 + STB2V v24, 4, x12 + STB2V v30, 4, x13 +1: tbz w9, #1, 1f + STB2V v26, 12, x11 + st1 {v24.b}[6], [x12], #1 + st1 {v30.b}[6], [x13], #1 +1: 
tbz w9, #0, 1f + st1 {v26.b}[14], [x11] + st1 {v24.b}[7], [x12] + st1 {v30.b}[7], [x13] +1: +3: + +// -------------------- Odd line body - Y only + + subs w5, w5, #1 + b.eq 90f + + subs w9, w4, #0 + add x0, x0, w14, sxtx + add x1, x1, w6, sxtx + mov x10, x0 + mov x11, x1 + b.lt 12f + + ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + subs w9, w9, #16 + b.le 13f + +10: + XRGB3Y v16, v17, v18, v20, v21, v22, v0, v1, v2 + // Y0 + SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0] + // Y1 + SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0] + + ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48 + + SHRN_Y v26, v26, v27, v28, v29, v6 + + subs w9, w9, #16 + + st1 {v26.16b}, [x11], #16 + + b.gt 10b + +// -------------------- Odd line tail - Y +// If width % 16 == 0 then simply runs once with preloaded RGB +// If other then deals with preload & then does remaining tail + +13: + // Body is simple copy of main loop body minus preload + + XRGB3Y v16, v17, v18, v20, v21, v22, v0, v1, v2 + // Y0 + SMLAL3 v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0] + // Y1 + SMLAL3 v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0] + + cmp w9, #-16 + + SHRN_Y v26, v26, v27, v28, v29, v6 + + // Here: + // w9 == 0 width % 16 == 0, tail done + // w9 > -16 1st tail done (16 pels), remainder still to go + // w9 == -16 shouldn't happen + // w9 > -32 2nd tail done + // w9 <= -32 shouldn't happen + + b.lt 2f + st1 {v26.16b}, [x11], #16 + cbz w9, 3f + +12: + sub w9, w9, #16 + + tbz w9, #3, 1f + ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24 +1: tbz w9, #2, 1f + ld3 {v0.b, v1.b, v2.b}[8], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[9], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[10], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[11], [x10], #3 +1: tbz w9, #1, 1f + ld3 {v0.b, v1.b, v2.b}[12], [x10], #3 + ld3 {v0.b, v1.b, v2.b}[13], [x10], #3 +1: tbz w9, #0, 13b + ld3 {v0.b, v1.b, v2.b}[14], [x10], #3 + b 13b + +2: + tbz w9, #3, 1f + st1 {v26.8b}, [x11], #8 +1: tbz w9, #2, 1f + STB4V v26, 8, x11 +1: tbz w9, #1, 1f + STB2V v26, 12, x11 +1: 
tbz w9, #0, 1f + st1 {v26.b}[14], [x11] +1: +3: + +// ------------------- Loop to start + + add x0, x0, w14, sxtx + add x1, x1, w6, sxtx + add x2, x2, w7, sxtx + add x3, x3, w7, sxtx + subs w5, w5, #1 + b.gt 11b +90: + ret +endfunc -- 2.39.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
prev parent reply other threads:[~2023-08-20 15:11 UTC|newest] Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top 2023-08-20 15:10 [FFmpeg-devel] [PATCH v1 0/6] swscale: Add dedicated RGB->YUV unscaled functions & aarch64 asm John Cox 2023-08-20 15:10 ` [FFmpeg-devel] [PATCH v1 1/6] fate-filter-fps: Set swscale bitexact for tests that do conversions John Cox 2023-08-20 15:10 ` [FFmpeg-devel] [PATCH v1 2/6] swscale: Rename BGR24->YUV conversion functions as bgr John Cox 2023-08-20 15:10 ` [FFmpeg-devel] [PATCH v1 3/6] swscale: Add explicit rgb24->yv12 conversion John Cox 2023-08-20 17:16 ` Michael Niedermayer 2023-08-20 17:45 ` Michael Niedermayer 2023-08-20 18:28 ` John Cox 2023-08-21 19:15 ` Michael Niedermayer 2023-08-22 14:24 ` John Cox 2023-08-22 18:03 ` Michael Niedermayer 2023-08-20 18:09 ` John Cox 2023-08-20 15:10 ` [FFmpeg-devel] [PATCH v1 4/6] swscale: RGB24->YUV allow odd widths & improve C rounding John Cox 2023-08-20 15:10 ` [FFmpeg-devel] [PATCH v1 5/6] swscale: Add unscaled XRGB->YUV420P functions John Cox 2023-08-20 15:10 ` John Cox [this message]
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20230820151022.2204421-7-jc@kynesim.co.uk \ --to=jc@kynesim.co.uk \ --cc=ffmpeg-devel@ffmpeg.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git