[FFmpeg-devel] [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed

* [FFmpeg-devel] [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode
@ 2026-01-13  2:03 hezuoqiang--- via ffmpeg-devel
  2026-01-13  2:48 ` [FFmpeg-devel] " Zhao Zhili via ffmpeg-devel
  0 siblings, 1 reply; 2+ messages in thread
From: hezuoqiang--- via ffmpeg-devel @ 2026-01-13  2:03 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zuoqiang He

From: Zuoqiang He <hezuoqiang@foxmail.com>

This adds an ARM NEON optimized implementation of the NAL startcode
search function. Performance testing shows approximately 3.7-4x speedup
on ARMv8-A platforms with NEON support.

The optimization uses 64-byte NEON vector blocks to quickly scan for
the 00 00 01 startcode pattern, falling back to the existing C code
for smaller buffers or when NEON is not available.

Performance improvement on ARMv8-A (Cortex-A76): ~3.7-4x faster

Tested with FATE suite and custom H.264 streams.

Signed-off-by: Zuoqiang He <hezuoqiang@foxmail.com>
---
 libavformat/aarch64/Makefile   |   2 +
 libavformat/aarch64/nal.S      | 172 +++++++++++++++++++++++++++++++++
 libavformat/aarch64/nal_init.c |  42 ++++++++
 libavformat/nal.c              |  19 +++-
 4 files changed, 233 insertions(+), 2 deletions(-)
 create mode 100644 libavformat/aarch64/Makefile
 create mode 100644 libavformat/aarch64/nal.S
 create mode 100644 libavformat/aarch64/nal_init.c

diff --git a/libavformat/aarch64/Makefile b/libavformat/aarch64/Makefile
new file mode 100644
index 0000000000..f1dc99de09
--- /dev/null
+++ b/libavformat/aarch64/Makefile
@@ -0,0 +1,2 @@
+OBJS += aarch64/nal_init.o
+NEON-OBJS += aarch64/nal.o
diff --git a/libavformat/aarch64/nal.S b/libavformat/aarch64/nal.S
new file mode 100644
index 0000000000..6dc1570d39
--- /dev/null
+++ b/libavformat/aarch64/nal.S
@@ -0,0 +1,172 @@
+/*
+ * ARM NEON-optimized NAL startcode search
+ * Copyright (c) 2024
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+        .arch armv8-a
+        .text
+
+function ff_nal_find_startcode_neon, export=1
+        and     x2, x0, #-4              // align to 4-byte boundary
+        sub     x7, x1, #3               // end -= 3
+        add     x2, x2, #4               // align4 = aligned_p + 4
+        mov     x3, x0                   // p = orig_p
+        cmp     x0, x2
+        ccmp    x7, x0, #0, cc
+        bls     2f                       // skip alignment phase
+
+        // Phase 1: align to 4-byte boundary
+1:      ldrb    w0, [x3]
+        cbnz    w0, 3f
+        ldrb    w0, [x3, #1]
+        cbnz    w0, 3f
+        ldrb    w0, [x3, #2]
+        cmp     w0, #1
+        beq     22f                      // found 00 00 01
+3:      add     x3, x3, #1
+        cmp     x2, x3
+        ccmp    x7, x3, #0, hi
+        bhi     1b
+
+2:      sub     x0, x7, x3               // remaining = end - p
+        cmp     x0, #63
+        bgt     43f                      // enter NEON phase if >= 64 bytes
+
+        // Phase 3: byte-by-byte check for remaining data
+4:      cmp     x7, x3
+        bls     8f
+5:      ldrb    w0, [x3]
+        cbnz    w0, 6f
+        ldrb    w0, [x3, #1]
+        cbnz    w0, 6f
+        ldrb    w0, [x3, #2]
+        cmp     w0, #1
+        beq     22f
+6:      add     x3, x3, #1
+        cmp     x7, x3
+        bne     5b
+8:      add     x0, x1, #3               // return orig_end + 3
+        ret
+
+        // Phase 2: NEON acceleration (64-byte blocks)
+43:     sub     x8, x1, #66              // end64 = end - 66
+        cmp     x8, x3
+        bls     4b
+        mov     w6, #65279               // 0xFEFF
+        add     x5, x3, #64              // chunk_end = p + 64
+        movk    w6, #0xfefe, lsl #16     // 0xFEFEFEFF
+        b       10f
+
+9:      add     x3, x3, #64              // p += 64
+        add     x5, x5, #64              // chunk_end += 64
+        cmp     x8, x3
+        bls     4b
+
+10:     // Load 64 bytes (4x16-byte vectors)
+        ldp     q31, q30, [x3]           // load first 32 bytes
+        ldp     q29, q28, [x3, #32]      // load next 32 bytes
+        prfm    PLDL1KEEP, [x3, #192]    // prefetch
+
+        // Check for zero bytes (data == 0)
+        cmeq    v31.16b, v31.16b, #0     // z0
+        cmeq    v30.16b, v30.16b, #0     // z1
+        cmeq    v29.16b, v29.16b, #0     // z2
+        cmeq    v28.16b, v28.16b, #0     // z3
+
+        // Check for 00 pattern (current byte is 0 AND next byte is 0)
+        ext     v24.16b, v31.16b, v31.16b, #1    // zs0
+        ext     v27.16b, v30.16b, v30.16b, #1    // zs1
+        ext     v26.16b, v29.16b, v29.16b, #1    // zs2
+        ext     v25.16b, v28.16b, v28.16b, #1    // zs3
+
+        // pattern00 = zero & zero_shift
+        and     v24.16b, v24.16b, v31.16b        // p0
+        and     v27.16b, v27.16b, v30.16b        // p1
+        and     v26.16b, v26.16b, v29.16b        // p2
+        and     v25.16b, v25.16b, v28.16b        // p3
+
+        // Check if any 00 pattern exists (fast ORR test)
+        orr     v27.16b, v24.16b, v27.16b
+        orr     v25.16b, v26.16b, v25.16b
+        orr     v25.16b, v25.16b, v27.16b
+        dup     d31, v25.d[1]
+        orr     v31.8b, v31.8b, v25.8b
+        fmov    x0, d31
+        cbz     x0, 9b                   // no 00 pattern, skip to next chunk
+
+        // Detailed check of this 64-byte chunk
+        mov     x0, x3
+11:     ldr     w2, [x0]
+        add     w4, w2, w6               // x - 0x01010101
+        bic     w2, w4, w2               // (~x) & (x - 0x01010101)
+        tst     w2, #-2139062144         // & 0x80808080
+        beq     12f
+
+        ldrb    w2, [x0, #1]
+        cbnz    w2, 13f
+        ldrb    w4, [x0]
+        ldrb    w2, [x0, #2]
+        cbnz    w4, 14f
+        cmp     w2, #1
+        beq     18f                      // found 00 00 01
+14:     ldrb    w4, [x0, #3]
+        cbnz    w2, 15f
+        cmp     w4, #1
+        beq     44f                      // found 00 00 01 (offset +1)
+        cbnz    w4, 12f
+16:     ldrb    w2, [x0, #4]
+        cmp     w2, #1
+        beq     45f                      // found 00 00 01 (offset +2)
+17:     cbnz    w2, 12f
+        ldrb    w2, [x0, #5]
+        cmp     w2, #1
+        beq     46f                      // found 00 00 01 (offset +3)
+
+12:     add     x0, x0, #4
+        cmp     x0, x5
+        bne     11b
+        b       9b
+
+13:     ldrb    w2, [x0, #3]
+        cbnz    w2, 12b
+        ldrb    w2, [x0, #2]
+        cbz     w2, 16b
+        ldrb    w2, [x0, #4]
+        b       17b
+
+15:     cbnz    w4, 12b
+        ldrb    w2, [x0, #4]
+        b       17b
+
+22:     mov     x0, x3
+        ret
+
+45:     add     x0, x0, #2
+        ret
+
+44:     add     x0, x0, #1
+        ret
+
+46:     add     x0, x0, #3
+        ret
+
+18:     ret
+endfunc
diff --git a/libavformat/aarch64/nal_init.c b/libavformat/aarch64/nal_init.c
new file mode 100644
index 0000000000..90160b882c
--- /dev/null
+++ b/libavformat/aarch64/nal_init.c
@@ -0,0 +1,42 @@
+/*
+ * ARM NEON-optimized NAL functions
+ * Copyright (c) 2024
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavutil/cpu.h"
+
+const uint8_t *ff_nal_find_startcode_neon(const uint8_t *p, const uint8_t *end);
+
+/* External function pointer from nal.c */
+extern const uint8_t *(*ff_nal_find_startcode_internal)(const uint8_t *p, const uint8_t *end);
+
+void ff_nal_init_arm(void);
+
+void ff_nal_init_arm(void)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags))
+        ff_nal_find_startcode_internal = ff_nal_find_startcode_neon;
+}
diff --git a/libavformat/nal.c b/libavformat/nal.c
index 26dc5fe688..2e293c0225 100644
--- a/libavformat/nal.c
+++ b/libavformat/nal.c
@@ -21,14 +21,20 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "libavutil/attributes.h"
 #include "libavutil/mem.h"
 #include "libavutil/error.h"
 #include "libavcodec/defs.h"
 #include "avio.h"
 #include "avio_internal.h"
+#include "config.h"
 #include "nal.h"
 
-static const uint8_t *nal_find_startcode_internal(const uint8_t *p, const uint8_t *end)
+/* Pointer to the active implementation */
+const uint8_t *(*ff_nal_find_startcode_internal)(const uint8_t *p, const uint8_t *end);
+
+/* C implementation */
+static const uint8_t *ff_nal_find_startcode_c(const uint8_t *p, const uint8_t *end)
 {
     const uint8_t *a = p + 4 - ((intptr_t)p & 3);
 
@@ -66,7 +72,16 @@ static const uint8_t *nal_find_startcode_internal(const uint8_t *p, const uint8_
 }
 
 const uint8_t *ff_nal_find_startcode(const uint8_t *p, const uint8_t *end){
-    const uint8_t *out = nal_find_startcode_internal(p, end);
+    static int initialized = 0;
+    if (!initialized) {
+        ff_nal_find_startcode_internal = ff_nal_find_startcode_c;
+#if ARCH_AARCH64
+        extern void ff_nal_init_arm(void);
+        ff_nal_init_arm();
+#endif
+        initialized = 1;
+    }
+    const uint8_t *out = ff_nal_find_startcode_internal(p, end);
     if(p<out && out<end && !out[-1]) out--;
     return out;
 }
-- 
2.47.3

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [FFmpeg-devel] Re: [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode
  2026-01-13  2:03 [FFmpeg-devel] [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode hezuoqiang--- via ffmpeg-devel
@ 2026-01-13  2:48 ` Zhao Zhili via ffmpeg-devel
  0 siblings, 0 replies; 2+ messages in thread
From: Zhao Zhili via ffmpeg-devel @ 2026-01-13  2:48 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Zuoqiang He, Zhao Zhili



> On Jan 13, 2026, at 10:03, hezuoqiang--- via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> wrote:
> 
> From: Zuoqiang He <hezuoqiang@foxmail.com>
> 
> This adds an ARM NEON optimized implementation of the NAL startcode
> search function. Performance testing shows approximately 3.7-4x speedup
> on ARMv8-A platforms with NEON support.
> 
> The optimization uses 64-byte NEON vector blocks to quickly scan for
> the 00 00 01 startcode pattern, falling back to the existing C code
> for smaller buffers or when NEON is not available.
> 
> Performance improvement on ARMv8-A (Cortex-A76): ~3.7-4x faster
> 
> Tested with FATE suite and custom H.264 streams.

Could you send a PR on https://code.ffmpeg.org/FFmpeg/FFmpeg ?

Please add a checkasm test under tests/checkasm/.

Some comments inline:

> 
> Signed-off-by: Zuoqiang He <hezuoqiang@foxmail.com>
> ---
> libavformat/aarch64/Makefile   |   2 +
> libavformat/aarch64/nal.S      | 172 +++++++++++++++++++++++++++++++++
> libavformat/aarch64/nal_init.c |  42 ++++++++
> libavformat/nal.c              |  19 +++-
> 4 files changed, 233 insertions(+), 2 deletions(-)
> create mode 100644 libavformat/aarch64/Makefile
> create mode 100644 libavformat/aarch64/nal.S
> create mode 100644 libavformat/aarch64/nal_init.c
> 
> diff --git a/libavformat/aarch64/Makefile b/libavformat/aarch64/Makefile
> new file mode 100644
> index 0000000000..f1dc99de09
> --- /dev/null
> +++ b/libavformat/aarch64/Makefile
> @@ -0,0 +1,2 @@
> +OBJS += aarch64/nal_init.o
> +NEON-OBJS += aarch64/nal.o
> diff --git a/libavformat/aarch64/nal.S b/libavformat/aarch64/nal.S
> new file mode 100644
> index 0000000000..6dc1570d39
> --- /dev/null
> +++ b/libavformat/aarch64/nal.S
> @@ -0,0 +1,172 @@
> +/*
> + * ARM NEON-optimized NAL startcode search
> + * Copyright (c) 2024
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +        .arch armv8-a
> +        .text

Remove this part. It’s handled by asm.S.

> +
> +function ff_nal_find_startcode_neon, export=1
> +        and     x2, x0, #-4              // align to 4-byte boundary
> +        sub     x7, x1, #3               // end -= 3
> +        add     x2, x2, #4               // align4 = aligned_p + 4
> +        mov     x3, x0                   // p = orig_p
> +        cmp     x0, x2
> +        ccmp    x7, x0, #0, cc
> +        bls     2f                       // skip alignment phase

The indent doesn’t match our coding style. There is a script at tools/check_arm_indent.sh

> +
> +        // Phase 1: align to 4-byte boundary
> +1:      ldrb    w0, [x3]
> +        cbnz    w0, 3f
> +        ldrb    w0, [x3, #1]
> +        cbnz    w0, 3f
> +        ldrb    w0, [x3, #2]
> +        cmp     w0, #1
> +        beq     22f                      // found 00 00 01
> +3:      add     x3, x3, #1
> +        cmp     x2, x3
> +        ccmp    x7, x3, #0, hi
> +        bhi     1b
> +
> +2:      sub     x0, x7, x3               // remaining = end - p
> +        cmp     x0, #63
> +        bgt     43f                      // enter NEON phase if >= 64 bytes
> +
> +        // Phase 3: byte-by-byte check for remaining data
> +4:      cmp     x7, x3
> +        bls     8f
> +5:      ldrb    w0, [x3]
> +        cbnz    w0, 6f
> +        ldrb    w0, [x3, #1]
> +        cbnz    w0, 6f
> +        ldrb    w0, [x3, #2]
> +        cmp     w0, #1
> +        beq     22f
> +6:      add     x3, x3, #1
> +        cmp     x7, x3
> +        bne     5b
> +8:      add     x0, x1, #3               // return orig_end + 3
> +        ret
> +
> +        // Phase 2: NEON acceleration (64-byte blocks)
> +43:     sub     x8, x1, #66              // end64 = end - 66
> +        cmp     x8, x3
> +        bls     4b
> +        mov     w6, #65279               // 0xFEFF
> +        add     x5, x3, #64              // chunk_end = p + 64
> +        movk    w6, #0xfefe, lsl #16     // 0xFEFEFEFF
> +        b       10f
> +
> +9:      add     x3, x3, #64              // p += 64
> +        add     x5, x5, #64              // chunk_end += 64
> +        cmp     x8, x3
> +        bls     4b
> +
> +10:     // Load 64 bytes (4x16-byte vectors)
> +        ldp     q31, q30, [x3]           // load first 32 bytes
> +        ldp     q29, q28, [x3, #32]      // load next 32 bytes
> +        prfm    PLDL1KEEP, [x3, #192]    // prefetch
> +
> +        // Check for zero bytes (data == 0)
> +        cmeq    v31.16b, v31.16b, #0     // z0
> +        cmeq    v30.16b, v30.16b, #0     // z1
> +        cmeq    v29.16b, v29.16b, #0     // z2
> +        cmeq    v28.16b, v28.16b, #0     // z3
> +
> +        // Check for 00 pattern (current byte is 0 AND next byte is 0)
> +        ext     v24.16b, v31.16b, v31.16b, #1    // zs0
> +        ext     v27.16b, v30.16b, v30.16b, #1    // zs1
> +        ext     v26.16b, v29.16b, v29.16b, #1    // zs2
> +        ext     v25.16b, v28.16b, v28.16b, #1    // zs3
> +
> +        // pattern00 = zero & zero_shift
> +        and     v24.16b, v24.16b, v31.16b        // p0
> +        and     v27.16b, v27.16b, v30.16b        // p1
> +        and     v26.16b, v26.16b, v29.16b        // p2
> +        and     v25.16b, v25.16b, v28.16b        // p3
> +
> +        // Check if any 00 pattern exists (fast ORR test)
> +        orr     v27.16b, v24.16b, v27.16b
> +        orr     v25.16b, v26.16b, v25.16b
> +        orr     v25.16b, v25.16b, v27.16b
> +        dup     d31, v25.d[1]
> +        orr     v31.8b, v31.8b, v25.8b
> +        fmov    x0, d31
> +        cbz     x0, 9b                   // no 00 pattern, skip to next chunk
> +
> +        // Detailed check of this 64-byte chunk
> +        mov     x0, x3
> +11:     ldr     w2, [x0]
> +        add     w4, w2, w6               // x - 0x01010101
> +        bic     w2, w4, w2               // (~x) & (x - 0x01010101)
> +        tst     w2, #-2139062144         // & 0x80808080
> +        beq     12f
> +
> +        ldrb    w2, [x0, #1]
> +        cbnz    w2, 13f
> +        ldrb    w4, [x0]
> +        ldrb    w2, [x0, #2]
> +        cbnz    w4, 14f
> +        cmp     w2, #1
> +        beq     18f                      // found 00 00 01
> +14:     ldrb    w4, [x0, #3]
> +        cbnz    w2, 15f
> +        cmp     w4, #1
> +        beq     44f                      // found 00 00 01 (offset +1)
> +        cbnz    w4, 12f
> +16:     ldrb    w2, [x0, #4]
> +        cmp     w2, #1
> +        beq     45f                      // found 00 00 01 (offset +2)
> +17:     cbnz    w2, 12f
> +        ldrb    w2, [x0, #5]
> +        cmp     w2, #1
> +        beq     46f                      // found 00 00 01 (offset +3)
> +
> +12:     add     x0, x0, #4
> +        cmp     x0, x5
> +        bne     11b
> +        b       9b
> +
> +13:     ldrb    w2, [x0, #3]
> +        cbnz    w2, 12b
> +        ldrb    w2, [x0, #2]
> +        cbz     w2, 16b
> +        ldrb    w2, [x0, #4]
> +        b       17b
> +
> +15:     cbnz    w4, 12b
> +        ldrb    w2, [x0, #4]
> +        b       17b
> +
> +22:     mov     x0, x3
> +        ret
> +
> +45:     add     x0, x0, #2
> +        ret
> +
> +44:     add     x0, x0, #1
> +        ret
> +
> +46:     add     x0, x0, #3
> +        ret
> +
> +18:     ret
> +endfunc
> diff --git a/libavformat/aarch64/nal_init.c b/libavformat/aarch64/nal_init.c
> new file mode 100644
> index 0000000000..90160b882c
> --- /dev/null
> +++ b/libavformat/aarch64/nal_init.c
> @@ -0,0 +1,42 @@
> +/*
> + * ARM NEON-optimized NAL functions
> + * Copyright (c) 2024
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "config.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/arm/cpu.h"
> +#include "libavutil/cpu.h"
> +
> +const uint8_t *ff_nal_find_startcode_neon(const uint8_t *p, const uint8_t *end);
> +
> +/* External function pointer from nal.c */
> +extern const uint8_t *(*ff_nal_find_startcode_internal)(const uint8_t *p, const uint8_t *end);

It’s not thread-safe.

> +
> +void ff_nal_init_arm(void);

Declare the function in header file then include header file.

arm suffix is for arm32. Use aarch64.

> +
> +void ff_nal_init_arm(void)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (have_neon(cpu_flags))
> +        ff_nal_find_startcode_internal = ff_nal_find_startcode_neon;
> +}
> diff --git a/libavformat/nal.c b/libavformat/nal.c
> index 26dc5fe688..2e293c0225 100644
> --- a/libavformat/nal.c
> +++ b/libavformat/nal.c
> @@ -21,14 +21,20 @@
> #include <stdint.h>
> #include <string.h>
> 
> +#include "libavutil/attributes.h"
> #include "libavutil/mem.h"
> #include "libavutil/error.h"
> #include "libavcodec/defs.h"
> #include "avio.h"
> #include "avio_internal.h"
> +#include "config.h"
> #include "nal.h"
> 
> -static const uint8_t *nal_find_startcode_internal(const uint8_t *p, const uint8_t *end)
> +/* Pointer to the active implementation */
> +const uint8_t *(*ff_nal_find_startcode_internal)(const uint8_t *p, const uint8_t *end);
> +
> +/* C implementation */
> +static const uint8_t *ff_nal_find_startcode_c(const uint8_t *p, const uint8_t *end)
> {
>     const uint8_t *a = p + 4 - ((intptr_t)p & 3);
> 
> @@ -66,7 +72,16 @@ static const uint8_t *nal_find_startcode_internal(const uint8_t *p, const uint8_
> }
> 
> const uint8_t *ff_nal_find_startcode(const uint8_t *p, const uint8_t *end){
> -    const uint8_t *out = nal_find_startcode_internal(p, end);
> +    static int initialized = 0;
> +    if (!initialized) {
> +        ff_nal_find_startcode_internal = ff_nal_find_startcode_c;

This is a race condition. Please note av_get_cpu_flags can be changed at any time.

> +#if ARCH_AARCH64
> +        extern void ff_nal_init_arm(void);
> +        ff_nal_init_arm();
> +#endif
> +        initialized = 1;
> +    }
> +    const uint8_t *out = ff_nal_find_startcode_internal(p, end);
>     if(p<out && out<end && !out[-1]) out--;
>     return out;
> }
> -- 
> 2.47.3
> 
> _______________________________________________
> ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
> To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-01-13  2:49 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-13  2:03 [FFmpeg-devel] [PATCH] libavformat/nal: add ARM NEON optimization for ff_nal_find_startcode hezuoqiang--- via ffmpeg-devel
2026-01-13  2:48 ` [FFmpeg-devel] " Zhao Zhili via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git