* [FFmpeg-devel] [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode
@ 2026-01-13 2:03 hezuoqiang--- via ffmpeg-devel
2026-01-13 2:48 ` [FFmpeg-devel] " Zhao Zhili via ffmpeg-devel
2026-01-13 10:26 ` Rémi Denis-Courmont via ffmpeg-devel
0 siblings, 2 replies; 4+ messages in thread
From: hezuoqiang--- via ffmpeg-devel @ 2026-01-13 2:03 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Zuoqiang He
From: Zuoqiang He <hezuoqiang@foxmail.com>
This adds an ARM NEON optimized implementation of the NAL startcode
search function. Performance testing shows approximately 3.7-4x speedup
on ARMv8-A platforms with NEON support.
The optimization uses 64-byte NEON vector blocks to quickly scan for
the 00 00 01 startcode pattern, falling back to the existing C code
for smaller buffers or when NEON is not available.
Performance improvement on ARMv8-A (Cortex-A76): ~3.7-4x faster
Tested with FATE suite and custom H.264 streams.
Signed-off-by: Zuoqiang He <hezuoqiang@foxmail.com>
---
libavformat/aarch64/Makefile | 2 +
libavformat/aarch64/nal.S | 172 +++++++++++++++++++++++++++++++++
libavformat/aarch64/nal_init.c | 42 ++++++++
libavformat/nal.c | 19 +++-
4 files changed, 233 insertions(+), 2 deletions(-)
create mode 100644 libavformat/aarch64/Makefile
create mode 100644 libavformat/aarch64/nal.S
create mode 100644 libavformat/aarch64/nal_init.c
diff --git a/libavformat/aarch64/Makefile b/libavformat/aarch64/Makefile
new file mode 100644
index 0000000000..f1dc99de09
--- /dev/null
+++ b/libavformat/aarch64/Makefile
@@ -0,0 +1,2 @@
+OBJS += aarch64/nal_init.o
+NEON-OBJS += aarch64/nal.o
diff --git a/libavformat/aarch64/nal.S b/libavformat/aarch64/nal.S
new file mode 100644
index 0000000000..6dc1570d39
--- /dev/null
+++ b/libavformat/aarch64/nal.S
@@ -0,0 +1,172 @@
+/*
+ * ARM NEON-optimized NAL startcode search
+ * Copyright (c) 2024
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+ .arch armv8-a
+ .text
+
+function ff_nal_find_startcode_neon, export=1
+ and x2, x0, #-4 // align to 4-byte boundary
+ sub x7, x1, #3 // end -= 3
+ add x2, x2, #4 // align4 = aligned_p + 4
+ mov x3, x0 // p = orig_p
+ cmp x0, x2
+ ccmp x7, x0, #0, cc
+ bls 2f // skip alignment phase
+
+ // Phase 1: align to 4-byte boundary
+1: ldrb w0, [x3]
+ cbnz w0, 3f
+ ldrb w0, [x3, #1]
+ cbnz w0, 3f
+ ldrb w0, [x3, #2]
+ cmp w0, #1
+ beq 22f // found 00 00 01
+3: add x3, x3, #1
+ cmp x2, x3
+ ccmp x7, x3, #0, hi
+ bhi 1b
+
+2: sub x0, x7, x3 // remaining = end - p
+ cmp x0, #63
+ bgt 43f // enter NEON phase if >= 64 bytes
+
+ // Phase 3: byte-by-byte check for remaining data
+4: cmp x7, x3
+ bls 8f
+5: ldrb w0, [x3]
+ cbnz w0, 6f
+ ldrb w0, [x3, #1]
+ cbnz w0, 6f
+ ldrb w0, [x3, #2]
+ cmp w0, #1
+ beq 22f
+6: add x3, x3, #1
+ cmp x7, x3
+ bne 5b
+8: add x0, x1, #3 // return orig_end + 3
+ ret
+
+ // Phase 2: NEON acceleration (64-byte blocks)
+43: sub x8, x1, #66 // end64 = end - 66
+ cmp x8, x3
+ bls 4b
+ mov w6, #65279 // 0xFEFF
+ add x5, x3, #64 // chunk_end = p + 64
+ movk w6, #0xfefe, lsl #16 // 0xFEFEFEFF
+ b 10f
+
+9: add x3, x3, #64 // p += 64
+ add x5, x5, #64 // chunk_end += 64
+ cmp x8, x3
+ bls 4b
+
+10: // Load 64 bytes (4x16-byte vectors)
+ ldp q31, q30, [x3] // load first 32 bytes
+ ldp q29, q28, [x3, #32] // load next 32 bytes
+ prfm PLDL1KEEP, [x3, #192] // prefetch
+
+ // Check for zero bytes (data == 0)
+ cmeq v31.16b, v31.16b, #0 // z0
+ cmeq v30.16b, v30.16b, #0 // z1
+ cmeq v29.16b, v29.16b, #0 // z2
+ cmeq v28.16b, v28.16b, #0 // z3
+
+ // Check for 00 pattern (current byte is 0 AND next byte is 0)
+ ext v24.16b, v31.16b, v31.16b, #1 // zs0
+ ext v27.16b, v30.16b, v30.16b, #1 // zs1
+ ext v26.16b, v29.16b, v29.16b, #1 // zs2
+ ext v25.16b, v28.16b, v28.16b, #1 // zs3
+
+ // pattern00 = zero & zero_shift
+ and v24.16b, v24.16b, v31.16b // p0
+ and v27.16b, v27.16b, v30.16b // p1
+ and v26.16b, v26.16b, v29.16b // p2
+ and v25.16b, v25.16b, v28.16b // p3
+
+ // Check if any 00 pattern exists (fast ORR test)
+ orr v27.16b, v24.16b, v27.16b
+ orr v25.16b, v26.16b, v25.16b
+ orr v25.16b, v25.16b, v27.16b
+ dup d31, v25.d[1]
+ orr v31.8b, v31.8b, v25.8b
+ fmov x0, d31
+ cbz x0, 9b // no 00 pattern, skip to next chunk
+
+ // Detailed check of this 64-byte chunk
+ mov x0, x3
+11: ldr w2, [x0]
+ add w4, w2, w6 // x - 0x01010101
+ bic w2, w4, w2 // (~x) & (x - 0x01010101)
+ tst w2, #-2139062144 // & 0x80808080
+ beq 12f
+
+ ldrb w2, [x0, #1]
+ cbnz w2, 13f
+ ldrb w4, [x0]
+ ldrb w2, [x0, #2]
+ cbnz w4, 14f
+ cmp w2, #1
+ beq 18f // found 00 00 01
+14: ldrb w4, [x0, #3]
+ cbnz w2, 15f
+ cmp w4, #1
+ beq 44f // found 00 00 01 (offset +1)
+ cbnz w4, 12f
+16: ldrb w2, [x0, #4]
+ cmp w2, #1
+ beq 45f // found 00 00 01 (offset +2)
+17: cbnz w2, 12f
+ ldrb w2, [x0, #5]
+ cmp w2, #1
+ beq 46f // found 00 00 01 (offset +3)
+
+12: add x0, x0, #4
+ cmp x0, x5
+ bne 11b
+ b 9b
+
+13: ldrb w2, [x0, #3]
+ cbnz w2, 12b
+ ldrb w2, [x0, #2]
+ cbz w2, 16b
+ ldrb w2, [x0, #4]
+ b 17b
+
+15: cbnz w4, 12b
+ ldrb w2, [x0, #4]
+ b 17b
+
+22: mov x0, x3
+ ret
+
+45: add x0, x0, #2
+ ret
+
+44: add x0, x0, #1
+ ret
+
+46: add x0, x0, #3
+ ret
+
+18: ret
+endfunc
diff --git a/libavformat/aarch64/nal_init.c b/libavformat/aarch64/nal_init.c
new file mode 100644
index 0000000000..90160b882c
--- /dev/null
+++ b/libavformat/aarch64/nal_init.c
@@ -0,0 +1,42 @@
+/*
+ * ARM NEON-optimized NAL functions
+ * Copyright (c) 2024
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavutil/cpu.h"
+
+const uint8_t *ff_nal_find_startcode_neon(const uint8_t *p, const uint8_t *end);
+
+/* External function pointer from nal.c */
+extern const uint8_t *(*ff_nal_find_startcode_internal)(const uint8_t *p, const uint8_t *end);
+
+void ff_nal_init_arm(void);
+
+void ff_nal_init_arm(void)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_nal_find_startcode_internal = ff_nal_find_startcode_neon;
+}
diff --git a/libavformat/nal.c b/libavformat/nal.c
index 26dc5fe688..2e293c0225 100644
--- a/libavformat/nal.c
+++ b/libavformat/nal.c
@@ -21,14 +21,20 @@
#include <stdint.h>
#include <string.h>
+#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/error.h"
#include "libavcodec/defs.h"
#include "avio.h"
#include "avio_internal.h"
+#include "config.h"
#include "nal.h"
-static const uint8_t *nal_find_startcode_internal(const uint8_t *p, const uint8_t *end)
+/* Pointer to the active implementation */
+const uint8_t *(*ff_nal_find_startcode_internal)(const uint8_t *p, const uint8_t *end);
+
+/* C implementation */
+static const uint8_t *ff_nal_find_startcode_c(const uint8_t *p, const uint8_t *end)
{
const uint8_t *a = p + 4 - ((intptr_t)p & 3);
@@ -66,7 +72,16 @@ static const uint8_t *nal_find_startcode_internal(const uint8_t *p, const uint8_
}
const uint8_t *ff_nal_find_startcode(const uint8_t *p, const uint8_t *end){
- const uint8_t *out = nal_find_startcode_internal(p, end);
+ static int initialized = 0;
+ if (!initialized) {
+ ff_nal_find_startcode_internal = ff_nal_find_startcode_c;
+#if ARCH_AARCH64
+ extern void ff_nal_init_arm(void);
+ ff_nal_init_arm();
+#endif
+ initialized = 1;
+ }
+ const uint8_t *out = ff_nal_find_startcode_internal(p, end);
if(p<out && out<end && !out[-1]) out--;
return out;
}
--
2.47.3
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] 4+ messages in thread* [FFmpeg-devel] Re: [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode
2026-01-13 2:03 [FFmpeg-devel] [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode hezuoqiang--- via ffmpeg-devel
@ 2026-01-13 2:48 ` Zhao Zhili via ffmpeg-devel
2026-01-13 10:26 ` Rémi Denis-Courmont via ffmpeg-devel
1 sibling, 0 replies; 4+ messages in thread
From: Zhao Zhili via ffmpeg-devel @ 2026-01-13 2:48 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Zuoqiang He, Zhao Zhili
> On Jan 13, 2026, at 10:03, hezuoqiang--- via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> wrote:
>
> From: Zuoqiang He <hezuoqiang@foxmail.com>
>
> This adds an ARM NEON optimized implementation of the NAL startcode
> search function. Performance testing shows approximately 3.7-4x speedup
> on ARMv8-A platforms with NEON support.
>
> The optimization uses 64-byte NEON vector blocks to quickly scan for
> the 00 00 01 startcode pattern, falling back to the existing C code
> for smaller buffers or when NEON is not available.
>
> Performance improvement on ARMv8-A (Cortex-A76): ~3.7-4x faster
>
> Tested with FATE suite and custom H.264 streams.
Could you send a PR on https://code.ffmpeg.org/FFmpeg/FFmpeg ?
Please add a checkasm test under tests/checkasm/.
Some comments inline:
>
> Signed-off-by: Zuoqiang He <hezuoqiang@foxmail.com>
> ---
> libavformat/aarch64/Makefile | 2 +
> libavformat/aarch64/nal.S | 172 +++++++++++++++++++++++++++++++++
> libavformat/aarch64/nal_init.c | 42 ++++++++
> libavformat/nal.c | 19 +++-
> 4 files changed, 233 insertions(+), 2 deletions(-)
> create mode 100644 libavformat/aarch64/Makefile
> create mode 100644 libavformat/aarch64/nal.S
> create mode 100644 libavformat/aarch64/nal_init.c
>
> diff --git a/libavformat/aarch64/Makefile b/libavformat/aarch64/Makefile
> new file mode 100644
> index 0000000000..f1dc99de09
> --- /dev/null
> +++ b/libavformat/aarch64/Makefile
> @@ -0,0 +1,2 @@
> +OBJS += aarch64/nal_init.o
> +NEON-OBJS += aarch64/nal.o
> diff --git a/libavformat/aarch64/nal.S b/libavformat/aarch64/nal.S
> new file mode 100644
> index 0000000000..6dc1570d39
> --- /dev/null
> +++ b/libavformat/aarch64/nal.S
> @@ -0,0 +1,172 @@
> +/*
> + * ARM NEON-optimized NAL startcode search
> + * Copyright (c) 2024
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> + .arch armv8-a
> + .text
Remove this part. It’s handled by asm.S.
> +
> +function ff_nal_find_startcode_neon, export=1
> + and x2, x0, #-4 // align to 4-byte boundary
> + sub x7, x1, #3 // end -= 3
> + add x2, x2, #4 // align4 = aligned_p + 4
> + mov x3, x0 // p = orig_p
> + cmp x0, x2
> + ccmp x7, x0, #0, cc
> + bls 2f // skip alignment phase
The indent doesn’t match our coding style. There is a script at tools/check_arm_indent.sh
> +
> + // Phase 1: align to 4-byte boundary
> +1: ldrb w0, [x3]
> + cbnz w0, 3f
> + ldrb w0, [x3, #1]
> + cbnz w0, 3f
> + ldrb w0, [x3, #2]
> + cmp w0, #1
> + beq 22f // found 00 00 01
> +3: add x3, x3, #1
> + cmp x2, x3
> + ccmp x7, x3, #0, hi
> + bhi 1b
> +
> +2: sub x0, x7, x3 // remaining = end - p
> + cmp x0, #63
> + bgt 43f // enter NEON phase if >= 64 bytes
> +
> + // Phase 3: byte-by-byte check for remaining data
> +4: cmp x7, x3
> + bls 8f
> +5: ldrb w0, [x3]
> + cbnz w0, 6f
> + ldrb w0, [x3, #1]
> + cbnz w0, 6f
> + ldrb w0, [x3, #2]
> + cmp w0, #1
> + beq 22f
> +6: add x3, x3, #1
> + cmp x7, x3
> + bne 5b
> +8: add x0, x1, #3 // return orig_end + 3
> + ret
> +
> + // Phase 2: NEON acceleration (64-byte blocks)
> +43: sub x8, x1, #66 // end64 = end - 66
> + cmp x8, x3
> + bls 4b
> + mov w6, #65279 // 0xFEFF
> + add x5, x3, #64 // chunk_end = p + 64
> + movk w6, #0xfefe, lsl #16 // 0xFEFEFEFF
> + b 10f
> +
> +9: add x3, x3, #64 // p += 64
> + add x5, x5, #64 // chunk_end += 64
> + cmp x8, x3
> + bls 4b
> +
> +10: // Load 64 bytes (4x16-byte vectors)
> + ldp q31, q30, [x3] // load first 32 bytes
> + ldp q29, q28, [x3, #32] // load next 32 bytes
> + prfm PLDL1KEEP, [x3, #192] // prefetch
> +
> + // Check for zero bytes (data == 0)
> + cmeq v31.16b, v31.16b, #0 // z0
> + cmeq v30.16b, v30.16b, #0 // z1
> + cmeq v29.16b, v29.16b, #0 // z2
> + cmeq v28.16b, v28.16b, #0 // z3
> +
> + // Check for 00 pattern (current byte is 0 AND next byte is 0)
> + ext v24.16b, v31.16b, v31.16b, #1 // zs0
> + ext v27.16b, v30.16b, v30.16b, #1 // zs1
> + ext v26.16b, v29.16b, v29.16b, #1 // zs2
> + ext v25.16b, v28.16b, v28.16b, #1 // zs3
> +
> + // pattern00 = zero & zero_shift
> + and v24.16b, v24.16b, v31.16b // p0
> + and v27.16b, v27.16b, v30.16b // p1
> + and v26.16b, v26.16b, v29.16b // p2
> + and v25.16b, v25.16b, v28.16b // p3
> +
> + // Check if any 00 pattern exists (fast ORR test)
> + orr v27.16b, v24.16b, v27.16b
> + orr v25.16b, v26.16b, v25.16b
> + orr v25.16b, v25.16b, v27.16b
> + dup d31, v25.d[1]
> + orr v31.8b, v31.8b, v25.8b
> + fmov x0, d31
> + cbz x0, 9b // no 00 pattern, skip to next chunk
> +
> + // Detailed check of this 64-byte chunk
> + mov x0, x3
> +11: ldr w2, [x0]
> + add w4, w2, w6 // x - 0x01010101
> + bic w2, w4, w2 // (~x) & (x - 0x01010101)
> + tst w2, #-2139062144 // & 0x80808080
> + beq 12f
> +
> + ldrb w2, [x0, #1]
> + cbnz w2, 13f
> + ldrb w4, [x0]
> + ldrb w2, [x0, #2]
> + cbnz w4, 14f
> + cmp w2, #1
> + beq 18f // found 00 00 01
> +14: ldrb w4, [x0, #3]
> + cbnz w2, 15f
> + cmp w4, #1
> + beq 44f // found 00 00 01 (offset +1)
> + cbnz w4, 12f
> +16: ldrb w2, [x0, #4]
> + cmp w2, #1
> + beq 45f // found 00 00 01 (offset +2)
> +17: cbnz w2, 12f
> + ldrb w2, [x0, #5]
> + cmp w2, #1
> + beq 46f // found 00 00 01 (offset +3)
> +
> +12: add x0, x0, #4
> + cmp x0, x5
> + bne 11b
> + b 9b
> +
> +13: ldrb w2, [x0, #3]
> + cbnz w2, 12b
> + ldrb w2, [x0, #2]
> + cbz w2, 16b
> + ldrb w2, [x0, #4]
> + b 17b
> +
> +15: cbnz w4, 12b
> + ldrb w2, [x0, #4]
> + b 17b
> +
> +22: mov x0, x3
> + ret
> +
> +45: add x0, x0, #2
> + ret
> +
> +44: add x0, x0, #1
> + ret
> +
> +46: add x0, x0, #3
> + ret
> +
> +18: ret
> +endfunc
> diff --git a/libavformat/aarch64/nal_init.c b/libavformat/aarch64/nal_init.c
> new file mode 100644
> index 0000000000..90160b882c
> --- /dev/null
> +++ b/libavformat/aarch64/nal_init.c
> @@ -0,0 +1,42 @@
> +/*
> + * ARM NEON-optimized NAL functions
> + * Copyright (c) 2024
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "config.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/arm/cpu.h"
> +#include "libavutil/cpu.h"
> +
> +const uint8_t *ff_nal_find_startcode_neon(const uint8_t *p, const uint8_t *end);
> +
> +/* External function pointer from nal.c */
> +extern const uint8_t *(*ff_nal_find_startcode_internal)(const uint8_t *p, const uint8_t *end);
It’s not thread-safe.
> +
> +void ff_nal_init_arm(void);
Declare the function in header file then include header file.
arm suffix is for arm32. Use aarch64.
> +
> +void ff_nal_init_arm(void)
> +{
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (have_neon(cpu_flags))
> + ff_nal_find_startcode_internal = ff_nal_find_startcode_neon;
> +}
> diff --git a/libavformat/nal.c b/libavformat/nal.c
> index 26dc5fe688..2e293c0225 100644
> --- a/libavformat/nal.c
> +++ b/libavformat/nal.c
> @@ -21,14 +21,20 @@
> #include <stdint.h>
> #include <string.h>
>
> +#include "libavutil/attributes.h"
> #include "libavutil/mem.h"
> #include "libavutil/error.h"
> #include "libavcodec/defs.h"
> #include "avio.h"
> #include "avio_internal.h"
> +#include "config.h"
> #include "nal.h"
>
> -static const uint8_t *nal_find_startcode_internal(const uint8_t *p, const uint8_t *end)
> +/* Pointer to the active implementation */
> +const uint8_t *(*ff_nal_find_startcode_internal)(const uint8_t *p, const uint8_t *end);
> +
> +/* C implementation */
> +static const uint8_t *ff_nal_find_startcode_c(const uint8_t *p, const uint8_t *end)
> {
> const uint8_t *a = p + 4 - ((intptr_t)p & 3);
>
> @@ -66,7 +72,16 @@ static const uint8_t *nal_find_startcode_internal(const uint8_t *p, const uint8_
> }
>
> const uint8_t *ff_nal_find_startcode(const uint8_t *p, const uint8_t *end){
> - const uint8_t *out = nal_find_startcode_internal(p, end);
> + static int initialized = 0;
> + if (!initialized) {
> + ff_nal_find_startcode_internal = ff_nal_find_startcode_c;
This is a race condition. Please note av_get_cpu_flags can be changed at any time.
> +#if ARCH_AARCH64
> + extern void ff_nal_init_arm(void);
> + ff_nal_init_arm();
> +#endif
> + initialized = 1;
> + }
> + const uint8_t *out = ff_nal_find_startcode_internal(p, end);
> if(p<out && out<end && !out[-1]) out--;
> return out;
> }
> --
> 2.47.3
>
> _______________________________________________
> ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
> To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] 4+ messages in thread* [FFmpeg-devel] Re: [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode
2026-01-13 2:03 [FFmpeg-devel] [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode hezuoqiang--- via ffmpeg-devel
2026-01-13 2:48 ` [FFmpeg-devel] " Zhao Zhili via ffmpeg-devel
@ 2026-01-13 10:26 ` Rémi Denis-Courmont via ffmpeg-devel
2026-01-13 17:20 ` [FFmpeg-devel] 回复:Re: [PATCH] libavformat/nal: add ARM NEON optimization forff_nal_find_startcode hezuoqiang via ffmpeg-devel
1 sibling, 1 reply; 4+ messages in thread
From: Rémi Denis-Courmont via ffmpeg-devel @ 2026-01-13 10:26 UTC (permalink / raw)
To: hezuoqiang--- via ffmpeg-devel; +Cc: Zuoqiang He, Rémi Denis-Courmont
Nihao,
There already is a hook for this purpose under h264dsp, and it's already used on some other ISAs. So there should be no need to add a new one.
It's also probably faster to just look for a nul byte in assembler and let the C code manually check for the full 32-bit start code. This is basically just `strnlen()`.
Br,
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] 4+ messages in thread
* [FFmpeg-devel] 回复:Re: [PATCH] libavformat/nal: add ARM NEON optimization forff_nal_find_startcode
2026-01-13 10:26 ` Rémi Denis-Courmont via ffmpeg-devel
@ 2026-01-13 17:20 ` hezuoqiang via ffmpeg-devel
0 siblings, 0 replies; 4+ messages in thread
From: hezuoqiang via ffmpeg-devel @ 2026-01-13 17:20 UTC (permalink / raw)
To: FFmpeg development discussions and patches
Cc: Rémi Denis-Courmont, hezuoqiang
Hi James,
Thank you for your review. I'd like to clarify the difference between the two approaches:
**Clarification:**
My patch optimizes `ff_nal_find_startcode` in libavformat/nal.c, which is different from the `ff_startcode_find_candidate` hook you mentioned under libavcodec/h264dsp.c.
- `ff_startcode_find_candidate`: Returns offset to first zero byte, requires upper layer validation
- `ff_nal_find_startcode`: Returns pointer to complete startcode (00 00 01), used by H.264 demuxer
**Test Environment:**
- Platform: Raspberry Pi 5 (ARM Cortex-A76, AArch64)
- Compiler: GCC 14.2.0 with -O3 -march=armv8-a
- Test file: 1080p H.264 video, 22.88 MB
- Total NALU startcodes found: 1,224
**Test Methodology:**
I compared two approaches:
**Method 1 (baseline):** Use `ff_startcode_find_candidate` + C validation (current FFmpeg approach)
```c
// Simplified pseudo-code
std::vector<size_t> find_all_startcode_positions(const uint8_t* data, size_t size) {
std::vector<size_t> positions;
size_t i = 0;
while (i < size) {
// Step 1: Fast search for zero byte
int offset = ff_startcode_find_candidate(data + i, size - i);
if (offset >= size - i) break;
i += offset;
// Step 2: Validate if it's a complete startcode (00 00 01)
if (i + 2 < size && data[i] == 0 && data[i+1] == 0) {
if (data[i+2] == 1) {
positions.push_back(i);
i += 3;
continue;
} else if (i + 3 < size && data[i+2] == 0 && data[i+3] == 1) {
positions.push_back(i);
i += 4;
continue;
}
}
i++;
}
return positions;
}
```
Method 2 (NEON optimized): Use ff_nal_find_startcode_neon directly
```cpp
std::vector<size_t> find_all_startcode_positions_neon(const uint8_t* data, size_t size) {
std::vector<size_t> positions;
const uint8_t* p = data;
const uint8_t* end = data + size;
while (p < end) {
// Directly find complete startcode
const uint8_t* start = ff_nal_find_startcode_neon(p, end);
// Skip zero bytes before NALU header
while (start < end && *start == 0) start++;
if (start >= end) break;
positions.push_back(start - data);
p = start;
}
return positions;
}
```
Performance Results (1000 iterations):
- Method 1 (find zero + validate): 5,454,680 μs
- Method 2 (NEON direct search): 1,741,280 μs
- Speedup: 3.13x
Why this optimization is effective:
The NEON version detects "00" pattern (two consecutive zeros) instead of single zeros:
Test file analysis (22.88 MB 1080p H.264):
- Single zero bytes: 95,673 (98.1% false positive rate)
- Valid startcodes: 1,224
- With "00" pattern: Only 22.8% of 64-byte blocks need detailed checking
- 77.2% of blocks can be skipped entirely
This optimization specifically improves H.264 demuxing performance on ARM platforms.
Should I modify the commit message to better clarify this distinction?
Best regards,
He Zuoqiang
原始邮件
发件人:Rémi Denis-Courmont via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
发件时间:2026年1月13日 18:26
收件人:hezuoqiang--- via ffmpeg-devel <ffmpeg-devel@ffmpeg.org>
抄送:Zuoqiang He <hezuoqiang@foxmail.com>, Rémi Denis-Courmont <remi@remlab.net>
主题:[FFmpeg-devel] Re: [PATCH] libavformat/nal: add ARM NEON optimization forff_nal_find_startcode
Nihao,
There already is a hook for this purpose under h264dsp, and it's already used on some other ISAs. So there should be no need to add a new one.
It's also probably faster to just look for a nul byte in assembler and let the C code manually check for the full 32-bit start code. This is basically just `strnlen()`.
Br,
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-01-13 17:21 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-13 2:03 [FFmpeg-devel] [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode hezuoqiang--- via ffmpeg-devel
2026-01-13 2:48 ` [FFmpeg-devel] " Zhao Zhili via ffmpeg-devel
2026-01-13 10:26 ` Rémi Denis-Courmont via ffmpeg-devel
2026-01-13 17:20 ` [FFmpeg-devel] 回复:Re: [PATCH] libavformat/nal: add ARM NEON optimization forff_nal_find_startcode hezuoqiang via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git