* [FFmpeg-devel] [PR] NEON rgb2rgb conversions: channel swap + alpha drop/insert (PR #21738)
@ 2026-02-12 3:15 David Christle via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: David Christle via ffmpeg-devel @ 2026-02-12 3:15 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: David Christle
PR #21738 opened by David Christle (dchristle)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21738
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21738.patch
Three new NEON-optimized rgb2rgb functions for aarch64:
- rgb24tobgr24: R↔B channel swap via ld3/st3 (14x over scalar C)
- rgb32tobgr24: alpha drop via ldp+tbl+stp (1.8x over clang auto-vec)
- rgb24tobgr32: alpha insert via ldp+tbl+stp (1.6x over clang auto-vec)
checkasm tests cover all three code tiers (16px NEON, 8px NEON, scalar tail); the alpha drop/insert tests also verify that nothing is written past the end of the destination.
From b2c802376b13034f03a89dc7a1ad0b6eac04ac95 Mon Sep 17 00:00:00 2001
From: David Christle <dev@christle.is>
Date: Fri, 6 Feb 2026 16:45:12 -0500
Subject: [PATCH 1/2] swscale/aarch64: add NEON rgb24tobgr24 byte-swap
Add a NEON rgb24tobgr24 using ld3/st3 to swap R and B channels in
packed 24bpp RGB buffers. Handles all input sizes with a 16-pixel
NEON fast path, 8-pixel NEON cleanup, and scalar tail.
checkasm --bench on Apple M3 Max (1920*3 = 5760 bytes):
rgb24tobgr24_c: 872.3 ( 1.00x)
rgb24tobgr24_neon: 62.4 (13.98x)
Signed-off-by: David Christle <dev@christle.is>
---
libswscale/aarch64/rgb2rgb.c | 3 +++
libswscale/aarch64/rgb2rgb_neon.S | 43 +++++++++++++++++++++++++++++++
tests/checkasm/sw_rgb.c | 31 ++++++++++++++++++++++
3 files changed, 77 insertions(+)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index f474228298..5873439db5 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -51,6 +51,8 @@ static void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
}
}
+void ff_rgb24tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
+
void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
@@ -85,6 +87,7 @@ av_cold void rgb2rgb_init_aarch64(void)
if (have_neon(cpu_flags)) {
ff_rgb24toyv12 = rgb24toyv12;
+ rgb24tobgr24 = ff_rgb24tobgr24_neon;
interleaveBytes = ff_interleave_bytes_neon;
deinterleaveBytes = ff_deinterleave_bytes_neon;
shuffle_bytes_0321 = ff_shuffle_bytes_0321_neon;
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index f6d625f11f..25e6c73f42 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -241,6 +241,49 @@ function ff_rgb24toyv12_neon, export=1
ret
endfunc
+// void ff_rgb24tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
+function ff_rgb24tobgr24_neon, export=1
+        // x0 = src, x1 = dst, w2 = src_size (bytes).
+        // Swaps bytes 0 and 2 of every 3-byte pixel; src_size must be a multiple of 3.
+
+        // Fast path: 48 bytes (16 pixels) per iteration
+        subs            w2, w2, #48
+        b.lt            2f
+1:
+        ld3             {v0.16b, v1.16b, v2.16b}, [x0], #48
+        // st3 only needs consecutive registers, so store {v2, v3, v4} = {byte2, byte1, byte0}
+        mov             v3.16b, v1.16b
+        mov             v4.16b, v0.16b
+        st3             {v2.16b, v3.16b, v4.16b}, [x1], #48
+        subs            w2, w2, #48
+        b.ge            1b
+2:
+        add             w2, w2, #48             // undo the bias left by the last subs
+        // Medium path: 24 bytes (8 pixels)
+        cmp             w2, #24
+        b.lt            3f
+        ld3             {v0.8b, v1.8b, v2.8b}, [x0], #24
+        mov             v3.8b, v1.8b
+        mov             v4.8b, v0.8b
+        sub             w2, w2, #24
+        st3             {v2.8b, v3.8b, v4.8b}, [x1], #24
+3:
+        // Scalar tail: 3 bytes (1 pixel) at a time
+        cmp             w2, #3
+        b.lt            4f
+5:
+        ldrb            w3, [x0], #1
+        ldrb            w4, [x0], #1
+        ldrb            w5, [x0], #1
+        subs            w2, w2, #3              // sets the flags tested by b.gt below
+        strb            w5, [x1], #1
+        strb            w4, [x1], #1
+        strb            w3, [x1], #1
+        b.gt            5b
+4:
+        ret
+endfunc
+
// void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
// uint8_t *dest, int width, int height,
// int src1Stride, int src2Stride, int dstStride);
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index 6edfc93b0b..baeb34b465 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -834,6 +834,37 @@ void checkasm_check_sw_rgb(void)
check_shuffle_bytes(shuffle_bytes_2130, "shuffle_bytes_2130");
report("shuffle_bytes_2130");
+    {
+        /* rgb24tobgr24 operates on 3-byte pixels, so test widths must be
+         * multiples of 3 to avoid reading past the source buffer.
+         * Sizes chosen to exercise each codepath tier:
+         *   3      = scalar only (1 pixel)
+         *   12     = scalar only (4 pixels, loop iteration)
+         *   24     = medium only
+         *   36     = medium + scalar
+         *   48     = fast only (exact)
+         *   51     = fast + scalar (skip medium)
+         *   126    = fast + medium + scalar (all tiers)
+         *   1920*3 = fast only (benchmark width)
+         * Both destinations are refilled with 0xAA before every call and
+         * carry 8 spare bytes, so stale output from a previous size cannot
+         * hide a missing store and writes past the end are detected.
+         */
+        static const int rgb24_width[] = {3, 12, 24, 36, 48, 51, 126, 1920 * 3};
+#define RGB24_BENCH_WIDTH (1920 * 3)
+#define RGB24_DST_SIZE    (RGB24_BENCH_WIDTH + 8)
+        LOCAL_ALIGNED_32(uint8_t, src0, [RGB24_BENCH_WIDTH]);
+        LOCAL_ALIGNED_32(uint8_t, src1, [RGB24_BENCH_WIDTH]);
+        LOCAL_ALIGNED_32(uint8_t, dst0, [RGB24_DST_SIZE]);
+        LOCAL_ALIGNED_32(uint8_t, dst1, [RGB24_DST_SIZE]);
+
+        declare_func(void, const uint8_t *src, uint8_t *dst, int src_size);
+
+        randomize_buffers(src0, RGB24_BENCH_WIDTH);
+        memcpy(src1, src0, RGB24_BENCH_WIDTH);
+
+        if (check_func(rgb24tobgr24, "rgb24tobgr24")) {
+            for (int i = 0; i < FF_ARRAY_ELEMS(rgb24_width); i++) {
+                int out_size = rgb24_width[i];
+                int bad;
+                memset(dst0, 0xAA, RGB24_DST_SIZE);
+                memset(dst1, 0xAA, RGB24_DST_SIZE);
+                call_ref(src0, dst0, rgb24_width[i]);
+                call_new(src1, dst1, rgb24_width[i]);
+                bad = memcmp(dst0, dst1, out_size);
+                /* Everything after the expected output must be untouched. */
+                for (int j = out_size; !bad && j < RGB24_DST_SIZE; j++)
+                    bad = dst0[j] != 0xAA || dst1[j] != 0xAA;
+                if (bad)
+                    fail();
+            }
+            bench_new(src0, dst0, RGB24_BENCH_WIDTH);
+        }
+#undef RGB24_DST_SIZE
+#undef RGB24_BENCH_WIDTH
+    }
+    report("rgb24tobgr24");
+
check_uyvy_to_422p();
report("uyvytoyuv422");
--
2.52.0
From eec19b9e860bbffb65a640b1647bd6f2d1c63d0e Mon Sep 17 00:00:00 2001
From: David Christle <dev@christle.is>
Date: Fri, 6 Feb 2026 21:15:52 -0500
Subject: [PATCH 2/2] swscale/aarch64: add NEON rgb32tobgr24 and rgb24tobgr32
conversions
Add NEON alpha drop/insert whose 16-pixel fast paths use ldp+tbl+stp
instead of ld4/st3 and ld3/st4 structure operations; the 8-pixel
cleanup paths still use ld4/st3 and ld3/st4. Both fast paths use a
sliding-window tbl over adjacent registers with post-indexed addressing.
Instruction scheduling targets narrow in-order cores (A55) while
remaining neutral on wide out-of-order cores.
Scalar tails use coalesced loads/stores (ldr+strh+lsr+strb for alpha
drop, ldrh+ldrb+orr+str for alpha insert) to reduce per-pixel
instruction count. Independent instructions placed between loads and
dependent operations to fill load-use latency on in-order cores.
checkasm --bench on Apple M3 Max (decicycles, 1920px):
rgb32tobgr24_c: 114.4 ( 1.00x)
rgb32tobgr24_neon: 64.3 ( 1.78x)
rgb24tobgr32_c: 128.9 ( 1.00x)
rgb24tobgr32_neon: 80.9 ( 1.59x)
C baseline is clang auto-vectorized; speedup is over compiler NEON.
Signed-off-by: David Christle <dev@christle.is>
---
libswscale/aarch64/rgb2rgb.c | 4 +
libswscale/aarch64/rgb2rgb_neon.S | 139 ++++++++++++++++++++++++++++++
tests/checkasm/sw_rgb.c | 91 +++++++++++++++++++
3 files changed, 234 insertions(+)
diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index 5873439db5..b9d8aa4dc2 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -52,6 +52,8 @@ static void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
}
void ff_rgb24tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_rgb32tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_rgb24tobgr32_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
@@ -88,6 +90,8 @@ av_cold void rgb2rgb_init_aarch64(void)
if (have_neon(cpu_flags)) {
ff_rgb24toyv12 = rgb24toyv12;
rgb24tobgr24 = ff_rgb24tobgr24_neon;
+ rgb32tobgr24 = ff_rgb32tobgr24_neon;
+ rgb24tobgr32 = ff_rgb24tobgr32_neon;
interleaveBytes = ff_interleave_bytes_neon;
deinterleaveBytes = ff_deinterleave_bytes_neon;
shuffle_bytes_0321 = ff_shuffle_bytes_0321_neon;
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index 25e6c73f42..379c6bf343 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -97,6 +97,32 @@ const shuf_2130_tbl, align=4
.byte 14, 13, 15, 12
endconst
+// rgb32tobgr24: tbl indices for the 16-pixel fast path of ff_rgb32tobgr24_neon (ldp+tbl+stp).
+// Converts 16 BGRA pixels (64 bytes) to 16 BGR pixels (48 bytes) by dropping alpha.
+// Each row indexes a {Vn, Vn+1} pair: 0-15 select from Vn, 16-31 from Vn+1; every 4th source byte is skipped.
+const rgb32tobgr24_tbl, align=4
+ // out0 from {v0,v1}: pixels 0-4 plus B of pixel 5 -> B0 G0 R0 B1 G1 R1 B2 G2 R2 B3 G3 R3 B4 G4 R4 B5
+ .byte 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20
+ // out1 from {v1,v2}: rest of pixel 5 through G of pixel 10 -> G5 R5 B6 G6 R6 B7 G7 R7 B8 G8 R8 B9 G9 R9 B10 G10
+ .byte 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25
+ // out2 from {v2,v3}: R of pixel 10 through pixel 15 -> R10 B11 G11 R11 B12 G12 R12 B13 G13 R13 B14 G14 R14 B15 G15 R15
+ .byte 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30
+endconst
+
+// rgb24tobgr32: tbl indices for the 16-pixel fast path of ff_rgb24tobgr32_neon (ldp/ldr+tbl+orr+stp).
+// Converts 16 BGR pixels (48 bytes) to 16 BGRA pixels (64 bytes) by inserting alpha=255.
+// Out-of-range index 128 produces 0 from tbl; orr with alpha mask fills in 0xFF. Indices count from the first listed register.
+const rgb24tobgr32_tbl, align=4
+ // out0 from {v0}: pixels 0-3 -> B0 G0 R0 _ B1 G1 R1 _ B2 G2 R2 _ B3 G3 R3 _
+ .byte 0, 1, 2, 128, 3, 4, 5, 128, 6, 7, 8, 128, 9, 10, 11, 128
+ // out1 from {v0,v1}: pixels 4-7 -> B4 G4 R4 _ B5 G5 R5 _ B6 G6 R6 _ B7 G7 R7 _
+ .byte 12, 13, 14, 128, 15, 16, 17, 128, 18, 19, 20, 128, 21, 22, 23, 128
+ // out2 from {v1,v2}: pixels 8-11 -> B8 G8 R8 _ B9 G9 R9 _ B10 G10 R10 _ B11 G11 R11 _
+ .byte 8, 9, 10, 128, 11, 12, 13, 128, 14, 15, 16, 128, 17, 18, 19, 128
+ // out3 from {v2}: pixels 12-15 -> B12 G12 R12 _ B13 G13 R13 _ B14 G14 R14 _ B15 G15 R15 _
+ .byte 4, 5, 6, 128, 7, 8, 9, 128, 10, 11, 12, 128, 13, 14, 15, 128
+endconst
+
// convert rgb to 16-bit y, u, or v
// uses v3 and v4
@@ -284,6 +310,119 @@ function ff_rgb24tobgr24_neon, export=1
ret
endfunc
+// void ff_rgb32tobgr24_neon(const uint8_t *src, uint8_t *dst, int src_size);
+function ff_rgb32tobgr24_neon, export=1
+ // x0 = src (BGRA), x1 = dst (BGR), w2 = src_size (bytes); copies bytes 0-2 of each 4-byte pixel
+
+ // Load the 3 tbl index vectors (v16-v18) used by the fast path
+ movrel x3, rgb32tobgr24_tbl
+ ld1 {v16.16b, v17.16b, v18.16b}, [x3]
+
+ // Fast path: 64 bytes input (16 pixels) -> 48 bytes output
+ // Uses ldp+tbl(2-reg sliding window)+stp to avoid expensive ld4/st3.
+ // Post-indexed addressing eliminates pointer-advance instructions.
+ // subs placed between loads and tbl to fill load-latency gap on
+ // in-order cores (A55).
+ subs w2, w2, #64
+ b.lt 2f
+1:
+ ldp q0, q1, [x0], #64 // bytes 0-31; x0 now points past the whole 64-byte group
+ ldp q2, q3, [x0, #-32] // bytes 32-63 of the same group
+ subs w2, w2, #64 // flags consumed by b.ge below; tbl/stp/str leave them intact
+ tbl v4.16b, {v0.16b, v1.16b}, v16.16b
+ tbl v5.16b, {v1.16b, v2.16b}, v17.16b
+ tbl v6.16b, {v2.16b, v3.16b}, v18.16b
+ stp q4, q5, [x1], #48 // output bytes 0-31; x1 now points past the 48-byte group
+ str q6, [x1, #-16] // output bytes 32-47
+ b.ge 1b
+2:
+ add w2, w2, #64 // undo the bias left by the last subs
+ // Medium path: 32 bytes input (8 pixels) -> 24 bytes output
+ cmp w2, #32
+ b.lt 3f
+ ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32 // de-interleave into 4 byte planes
+ st3 {v0.8b, v1.8b, v2.8b}, [x1], #24 // re-interleave the first 3 planes; v3 (alpha) is dropped
+ sub w2, w2, #32
+3:
+ // Scalar tail: 4 bytes -> 3 bytes at a time
+ // Uses word load + halfword/byte stores to reduce instructions.
+ // On LE: ldr gives A<<24|R<<16|G<<8|B; strh stores [B,G]; lsr+strb stores R.
+ // subs placed between ldr and strh to fill load-use latency on A55.
+ cmp w2, #4
+ b.lt 4f
+5:
+ ldr w3, [x0], #4
+ subs w2, w2, #4 // flags consumed by b.gt below; strh/lsr/strb leave them intact
+ strh w3, [x1], #2
+ lsr w3, w3, #16
+ strb w3, [x1], #1
+ b.gt 5b
+4:
+ ret
+endfunc
+
+// void ff_rgb24tobgr32_neon(const uint8_t *src, uint8_t *dst, int src_size);
+function ff_rgb24tobgr32_neon, export=1
+ // x0 = src (BGR), x1 = dst (BGRA), w2 = src_size (bytes); appends 0xFF after each 3-byte pixel
+
+ // Load the 4 tbl index vectors (v16-v19) and the alpha mask (v20) for the fast path
+ movrel x3, rgb24tobgr32_tbl
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3]
+ movi v20.4s, #255, lsl #24 // Alpha mask: 00 00 00 FF per pixel
+
+ // Fast path: 48 bytes input (16 pixels) -> 64 bytes output
+ // Uses ldp/ldr+tbl+orr+stp to avoid expensive ld3/st4 structure load/stores.
+ // tbl produces 0 for alpha positions; orr fills in 0xFF.
+ // Post-indexed addressing eliminates pointer-advance instructions.
+ // tbl/orr interleaved so each orr starts as soon as its tbl result
+ // is ready, hiding latency on narrow in-order cores (A55).
+ subs w2, w2, #48
+ b.lt 2f
+1:
+ ldp q0, q1, [x0], #48 // bytes 0-31; x0 now points past the whole 48-byte group
+ ldr q2, [x0, #-16] // bytes 32-47 of the same group
+ subs w2, w2, #48 // flags consumed by b.ge below; tbl/orr/stp leave them intact
+ tbl v4.16b, {v0.16b}, v16.16b
+ tbl v5.16b, {v0.16b, v1.16b}, v17.16b
+ orr v4.16b, v4.16b, v20.16b
+ tbl v6.16b, {v1.16b, v2.16b}, v18.16b
+ orr v5.16b, v5.16b, v20.16b
+ tbl v7.16b, {v2.16b}, v19.16b
+ orr v6.16b, v6.16b, v20.16b
+ stp q4, q5, [x1], #64 // output bytes 0-31; x1 now points past the 64-byte group
+ orr v7.16b, v7.16b, v20.16b
+ stp q6, q7, [x1, #-32] // output bytes 32-63
+ b.ge 1b
+2:
+ add w2, w2, #48 // undo the bias left by the last subs
+ // Medium path: 24 bytes input (8 pixels) -> 32 bytes output
+ cmp w2, #24
+ b.lt 3f
+ movi v3.8b, #255 // constant alpha plane
+ ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 // de-interleave into 3 byte planes
+ st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], #32 // interleave with the alpha plane as 4th byte
+ sub w2, w2, #24
+3:
+ // Scalar tail: 3 bytes -> 4 bytes at a time
+ // Uses halfword+byte loads, orr to combine with alpha, word store.
+ // On LE: ldrh gives G<<8|B, ldrb gives R; orr assembles 0xFF<<24|R<<16|G<<8|B;
+ // str stores [B,G,R,0xFF]. subs and add placed between loads and first
+ // orr to fill load-use latency on A55.
+ cmp w2, #3
+ b.lt 4f
+5:
+ ldrh w4, [x0]
+ ldrb w5, [x0, #2]
+ add x0, x0, #3
+ subs w2, w2, #3 // flags consumed by b.gt below; orr (non-flag-setting) and str leave them intact
+ orr w4, w4, w5, lsl #16
+ orr w4, w4, #0xFF000000
+ str w4, [x1], #4
+ b.gt 5b
+4:
+ ret
+endfunc
+
// void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
// uint8_t *dest, int width, int height,
// int src1Stride, int src2Stride, int dstStride);
diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index baeb34b465..a38e1d68f5 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -865,6 +865,97 @@ void checkasm_check_sw_rgb(void)
}
report("rgb24tobgr24");
+    {
+        /* rgb32tobgr24: 4-byte pixels -> 3-byte pixels.
+         * Test widths must be multiples of 4 (one pixel).
+         * Sizes chosen to exercise each codepath tier:
+         *   4      = scalar only (1 pixel)
+         *   16     = scalar only (4 pixels, loop iteration)
+         *   32     = medium only
+         *   48     = medium + scalar
+         *   64     = fast only (exact)
+         *   68     = fast + scalar (skip medium)
+         *   100    = fast + medium + scalar (all tiers)
+         *   128    = fast only (multi-iteration)
+         *   1920*4 = fast only (benchmark width)
+         * Both destinations are refilled with 0xAA before every call and
+         * carry 8 spare bytes; every byte past the expected output must
+         * still be 0xAA afterwards.
+         */
+        static const int rgb32_widths[] = {4, 16, 32, 48, 64, 68, 100, 128, 1920 * 4};
+#define RGB32_BENCH_WIDTH (1920 * 4)
+#define RGB32_DST_SIZE    (RGB32_BENCH_WIDTH * 3 / 4 + 8)
+        LOCAL_ALIGNED_32(uint8_t, src0, [RGB32_BENCH_WIDTH]);
+        LOCAL_ALIGNED_32(uint8_t, src1, [RGB32_BENCH_WIDTH]);
+        LOCAL_ALIGNED_32(uint8_t, dst0, [RGB32_DST_SIZE]);
+        LOCAL_ALIGNED_32(uint8_t, dst1, [RGB32_DST_SIZE]);
+
+        declare_func(void, const uint8_t *src, uint8_t *dst, int src_size);
+
+        randomize_buffers(src0, RGB32_BENCH_WIDTH);
+        memcpy(src1, src0, RGB32_BENCH_WIDTH);
+
+        if (check_func(rgb32tobgr24, "rgb32tobgr24")) {
+            for (int i = 0; i < FF_ARRAY_ELEMS(rgb32_widths); i++) {
+                int out_size = rgb32_widths[i] * 3 / 4;
+                int bad;
+                memset(dst0, 0xAA, RGB32_DST_SIZE);
+                memset(dst1, 0xAA, RGB32_DST_SIZE);
+                call_ref(src0, dst0, rgb32_widths[i]);
+                call_new(src1, dst1, rgb32_widths[i]);
+                bad = memcmp(dst0, dst1, out_size);
+                /* Scan the whole untouched tail, not only its first byte. */
+                for (int j = out_size; !bad && j < RGB32_DST_SIZE; j++)
+                    bad = dst0[j] != 0xAA || dst1[j] != 0xAA;
+                if (bad)
+                    fail();
+            }
+            bench_new(src0, dst0, RGB32_BENCH_WIDTH);
+        }
+#undef RGB32_DST_SIZE
+#undef RGB32_BENCH_WIDTH
+    }
+    report("rgb32tobgr24");
+
+    {
+        /* rgb24tobgr32: 3-byte pixels -> 4-byte pixels.
+         * Test widths must be multiples of 3 (one pixel).
+         * Sizes chosen to exercise each codepath tier:
+         *   3      = scalar only (1 pixel)
+         *   12     = scalar only (4 pixels, loop iteration)
+         *   24     = medium only
+         *   36     = medium + scalar
+         *   48     = fast only (exact)
+         *   51     = fast + scalar (skip medium)
+         *   126    = fast + medium + scalar (all tiers)
+         *   1920*3 = fast only (benchmark width)
+         * Both destinations are refilled with 0xAA before every call and
+         * carry 8 spare bytes; every byte past the expected output must
+         * still be 0xAA afterwards.
+         */
+        static const int rgb24to32_widths[] = {3, 12, 24, 36, 48, 51, 126, 1920 * 3};
+#define RGB24TO32_BENCH_WIDTH (1920 * 3)
+#define RGB24TO32_DST_SIZE    (RGB24TO32_BENCH_WIDTH * 4 / 3 + 8)
+        LOCAL_ALIGNED_32(uint8_t, src0, [RGB24TO32_BENCH_WIDTH]);
+        LOCAL_ALIGNED_32(uint8_t, src1, [RGB24TO32_BENCH_WIDTH]);
+        LOCAL_ALIGNED_32(uint8_t, dst0, [RGB24TO32_DST_SIZE]);
+        LOCAL_ALIGNED_32(uint8_t, dst1, [RGB24TO32_DST_SIZE]);
+
+        declare_func(void, const uint8_t *src, uint8_t *dst, int src_size);
+
+        randomize_buffers(src0, RGB24TO32_BENCH_WIDTH);
+        memcpy(src1, src0, RGB24TO32_BENCH_WIDTH);
+
+        if (check_func(rgb24tobgr32, "rgb24tobgr32")) {
+            for (int i = 0; i < FF_ARRAY_ELEMS(rgb24to32_widths); i++) {
+                int out_size = rgb24to32_widths[i] * 4 / 3;
+                int bad;
+                memset(dst0, 0xAA, RGB24TO32_DST_SIZE);
+                memset(dst1, 0xAA, RGB24TO32_DST_SIZE);
+                call_ref(src0, dst0, rgb24to32_widths[i]);
+                call_new(src1, dst1, rgb24to32_widths[i]);
+                bad = memcmp(dst0, dst1, out_size);
+                /* Scan the whole untouched tail, not only its first byte. */
+                for (int j = out_size; !bad && j < RGB24TO32_DST_SIZE; j++)
+                    bad = dst0[j] != 0xAA || dst1[j] != 0xAA;
+                if (bad)
+                    fail();
+            }
+            bench_new(src0, dst0, RGB24TO32_BENCH_WIDTH);
+        }
+#undef RGB24TO32_DST_SIZE
+#undef RGB24TO32_BENCH_WIDTH
+    }
+    report("rgb24tobgr32");
+
check_uyvy_to_422p();
report("uyvytoyuv422");
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2026-02-12 3:15 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-12 3:15 [FFmpeg-devel] [PR] NEON rgb2rgb conversions: channel swap + alpha drop/insert (PR #21738) David Christle via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git