Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PR] feat/sme2_alf_filter_luma_8bit (PR #21373)
@ 2026-01-04 16:33 george.zaguri via ffmpeg-devel
  0 siblings, 0 replies; only message in thread
From: george.zaguri via ffmpeg-devel @ 2026-01-04 16:33 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: george.zaguri

PR #21373 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21373
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21373.patch

NOTE:
1) Local sme_entry / sme_exit macro definitions are added because this MR (https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21194) is not merged yet.
2) There is an issue with detecting the i16i64 extension: its name, "sme-i16i64", contains a dash, and configure fails because of it. I would appreciate help solving this issue.

Apple M4:
vvc_alf_filter_luma_8x8_8_c:                           443.0 ( 1.00x)
vvc_alf_filter_luma_8x8_8_neon:                        118.6 ( 3.74x)
vvc_alf_filter_luma_8x8_8_sme:                         257.6 ( 1.72x)
vvc_alf_filter_luma_16x16_8_c:                        1231.0 ( 1.00x)
vvc_alf_filter_luma_16x16_8_neon:                      503.8 ( 2.44x)
vvc_alf_filter_luma_16x16_8_sme:                       523.0 ( 2.35x)
vvc_alf_filter_luma_32x32_8_c:                        5576.3 ( 1.00x)
vvc_alf_filter_luma_32x32_8_neon:                     2149.3 ( 2.59x)
vvc_alf_filter_luma_32x32_8_sme:                      1108.1 ( 5.03x)
vvc_alf_filter_luma_64x64_8_c:                       22764.9 ( 1.00x)
vvc_alf_filter_luma_64x64_8_neon:                     8587.8 ( 2.65x)
vvc_alf_filter_luma_64x64_8_sme:                      4313.7 ( 5.28x)
vvc_alf_filter_luma_128x128_8_c:                     88816.2 ( 1.00x)
vvc_alf_filter_luma_128x128_8_neon:                  33542.2 ( 2.65x)
vvc_alf_filter_luma_128x128_8_sme:                   17227.9 ( 5.16x)


>From e22a3fd8c37f5d1066c22c6cd961cde3ede21bd9 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <george.zaguri@gmail.com>
Date: Sun, 4 Jan 2026 14:54:14 +0000
Subject: [PATCH 1/2] configure: aarch64/sme2 support

---
 Makefile                  |  2 +-
 configure                 |  8 +++++++-
 ffbuild/arch.mak          |  1 +
 libavutil/aarch64/asm.S   |  9 +++++++++
 libavutil/aarch64/cpu.c   | 13 +++++++++++++
 libavutil/aarch64/cpu.h   |  1 +
 libavutil/cpu.h           |  1 +
 libavutil/tests/cpu.c     |  1 +
 tests/checkasm/checkasm.c |  1 +
 9 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index f563a37fca..c290ad0a04 100644
--- a/Makefile
+++ b/Makefile
@@ -111,7 +111,7 @@ SUBDIR_VARS := CLEANFILES FFLIBS HOSTPROGS TESTPROGS TOOLS               \
                MIPSFPU-OBJS MIPSDSPR2-OBJS MIPSDSP-OBJS MSA-OBJS         \
                MMI-OBJS LSX-OBJS LASX-OBJS RV-OBJS RVV-OBJS RVVB-OBJS    \
                OBJS SHLIBOBJS STLIBOBJS HOSTOBJS TESTOBJS SIMD128-OBJS   \
-               SVE-OBJS SVE2-OBJS SME-OBJS
+               SVE-OBJS SVE2-OBJS SME-OBJS SME2-OBJS
 
 define RESET
 $(1) :=
diff --git a/configure b/configure
index 083a30972a..2115c447f2 100755
--- a/configure
+++ b/configure
@@ -480,6 +480,7 @@ Optimization options (experts only):
   --disable-sve            disable SVE optimizations
   --disable-sve2           disable SVE2 optimizations
   --disable-sme            disable SME optimizations
+  --disable-sme2           disable SME2 optimizations
   --disable-inline-asm     disable use of inline assembly
   --disable-x86asm         disable use of standalone x86 assembly
   --disable-mipsdsp        disable MIPS DSP ASE R1 optimizations
@@ -2230,6 +2231,7 @@ ARCH_EXT_LIST_ARM="
     sve
     sve2
     sme
+    sme2
 "
 
 ARCH_EXT_LIST_MIPS="
@@ -2498,6 +2500,7 @@ TOOLCHAIN_FEATURES="
     as_archext_sve_directive
     as_archext_sve2_directive
     as_archext_sme_directive
+    as_archext_sme2_directive
     as_dn_directive
     as_fpu_directive
     as_func
@@ -2831,6 +2834,7 @@ i8mm_deps="aarch64 neon"
 sve_deps="aarch64 neon"
 sve2_deps="aarch64 neon sve"
 sme_deps="aarch64 neon sve sve2"
+sme2_deps="aarch64 neon sve sve2 sme"
 
 map 'eval ${v}_inline_deps=inline_asm' $ARCH_EXT_LIST_ARM
 
@@ -6455,12 +6459,13 @@ if enabled aarch64; then
     # internal assembler in clang 3.3 does not support this instruction
     enabled neon && check_insn neon 'ext   v0.8B, v0.8B, v1.8B, #1'
 
-    archext_list="dotprod i8mm sve sve2 sme"
+    archext_list="dotprod i8mm sve sve2 sme sme2"
     enabled dotprod && check_archext_insn dotprod 'udot v0.4s, v0.16b, v0.16b'
     enabled i8mm    && check_archext_insn i8mm    'usdot v0.4s, v0.16b, v0.16b'
     enabled sve     && check_archext_insn sve     'whilelt p0.s, x0, x1'
     enabled sve2    && check_archext_insn sve2    'sqrdmulh z0.s, z0.s, z0.s'
     enabled sme     && check_archext_insn sme     'smstart' 'cntb x0'
+    enabled sme2    && check_archext_insn sme2    'smstart' 'sdot za.s[w10, 0], {z0.b-z3.b}, {z4.b-z7.b}'
 
     # Disable the main feature (e.g. HAVE_NEON) if neither inline nor external
     # assembly support the feature out of the box. Skip this for the features
@@ -8222,6 +8227,7 @@ if enabled aarch64; then
     echo "SVE enabled               ${sve-no}"
     echo "SVE2 enabled              ${sve2-no}"
     echo "SME enabled               ${sme-no}"
+    echo "SME2 enabled              ${sme2-no}"
 fi
 if enabled arm; then
     echo "ARMv5TE enabled           ${armv5te-no}"
diff --git a/ffbuild/arch.mak b/ffbuild/arch.mak
index 83d6bf276f..13e1eb33bc 100644
--- a/ffbuild/arch.mak
+++ b/ffbuild/arch.mak
@@ -6,6 +6,7 @@ OBJS-$(HAVE_NEON)    += $(NEON-OBJS)    $(NEON-OBJS-yes)
 OBJS-$(HAVE_SVE)     += $(SVE-OBJS)     $(SVE-OBJS-yes)
 OBJS-$(HAVE_SVE2)    += $(SVE2-OBJS)    $(SVE2-OBJS-yes)
 OBJS-$(HAVE_SME)     += $(SME-OBJS)     $(SME-OBJS-yes)
+OBJS-$(HAVE_SME2)    += $(SME2-OBJS)    $(SME2-OBJS-yes)
 
 OBJS-$(HAVE_MIPSFPU)   += $(MIPSFPU-OBJS)    $(MIPSFPU-OBJS-yes)
 OBJS-$(HAVE_MIPSDSP)   += $(MIPSDSP-OBJS)    $(MIPSDSP-OBJS-yes)
diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S
index 77cea57cfc..7d59ed8199 100644
--- a/libavutil/aarch64/asm.S
+++ b/libavutil/aarch64/asm.S
@@ -80,11 +80,20 @@
 #define DISABLE_SME
 #endif
 
+#if HAVE_AS_ARCHEXT_SME2_DIRECTIVE
+#define ENABLE_SME2   .arch_extension sme2
+#define DISABLE_SME2  .arch_extension nosme2
+#else
+#define ENABLE_SME2
+#define DISABLE_SME2
+#endif
+
 DISABLE_DOTPROD
 DISABLE_I8MM
 DISABLE_SVE
 DISABLE_SVE2
 DISABLE_SME
+DISABLE_SME2
 
 
 /* Support macros for
diff --git a/libavutil/aarch64/cpu.c b/libavutil/aarch64/cpu.c
index f93ff08fb5..1ad417c141 100644
--- a/libavutil/aarch64/cpu.c
+++ b/libavutil/aarch64/cpu.c
@@ -29,6 +29,7 @@
 #define HWCAP2_AARCH64_SVE2   (1 << 1)
 #define HWCAP2_AARCH64_I8MM   (1 << 13)
 #define HWCAP2_AARCH64_SME    (1 << 23)
+#define HWCAP2_AARCH64_SME2   (1UL << 37) /* UL: bit 37 does not fit in a 32-bit int */
 
 static int detect_flags(void)
 {
@@ -47,6 +48,8 @@ static int detect_flags(void)
         flags |= AV_CPU_FLAG_I8MM;
     if (hwcap2 & HWCAP2_AARCH64_SME)
         flags |= AV_CPU_FLAG_SME;
+    if (hwcap2 & HWCAP2_AARCH64_SME2)
+        flags |= AV_CPU_FLAG_SME2;
 
     return flags;
 }
@@ -72,6 +75,8 @@ static int detect_flags(void)
         flags |= AV_CPU_FLAG_I8MM;
     if (have_feature("hw.optional.arm.FEAT_SME"))
         flags |= AV_CPU_FLAG_SME;
+    if (have_feature("hw.optional.arm.FEAT_SME2"))
+        flags |= AV_CPU_FLAG_SME2;
 
     return flags;
 }
@@ -143,6 +148,11 @@ static int detect_flags(void)
     if (IsProcessorFeaturePresent(PF_ARM_SME_INSTRUCTIONS_AVAILABLE))
         flags |= AV_CPU_FLAG_SME;
 #endif
+
+#ifdef PF_ARM_SME2_INSTRUCTIONS_AVAILABLE
+    if (IsProcessorFeaturePresent(PF_ARM_SME2_INSTRUCTIONS_AVAILABLE))
+        flags |= AV_CPU_FLAG_SME2;
+#endif
     return flags;
 }
 #else
@@ -174,6 +184,9 @@ int ff_get_cpu_flags_aarch64(void)
 #ifdef __ARM_FEATURE_SME
     flags |= AV_CPU_FLAG_SME;
 #endif
+#ifdef __ARM_FEATURE_SME2
+    flags |= AV_CPU_FLAG_SME2;
+#endif
 
     flags |= detect_flags();
 
diff --git a/libavutil/aarch64/cpu.h b/libavutil/aarch64/cpu.h
index 62d5eb768f..731a0a1a35 100644
--- a/libavutil/aarch64/cpu.h
+++ b/libavutil/aarch64/cpu.h
@@ -30,6 +30,7 @@
 #define have_sve(flags)     CPUEXT(flags, SVE)
 #define have_sve2(flags)    CPUEXT(flags, SVE2)
 #define have_sme(flags)     CPUEXT(flags, SME)
+#define have_sme2(flags)    CPUEXT(flags, SME2)
 
 #if HAVE_SVE
 int ff_aarch64_sve_length(void);
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index 87cecd0424..4c7a6edd37 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -77,6 +77,7 @@
 #define AV_CPU_FLAG_SVE          (1 <<10)
 #define AV_CPU_FLAG_SVE2         (1 <<11)
 #define AV_CPU_FLAG_SME          (1 <<12)
+#define AV_CPU_FLAG_SME2         (1 <<14)
 #define AV_CPU_FLAG_SETEND       (1 <<16)
 
 #define AV_CPU_FLAG_MMI          (1 << 0)
diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c
index c63b7e7d53..acbe67d388 100644
--- a/libavutil/tests/cpu.c
+++ b/libavutil/tests/cpu.c
@@ -49,6 +49,7 @@ static const struct {
     { AV_CPU_FLAG_SVE,       "sve"        },
     { AV_CPU_FLAG_SVE2,      "sve2"       },
     { AV_CPU_FLAG_SME,       "sme"        },
+    { AV_CPU_FLAG_SME2,      "sme2"       },
 #elif ARCH_ARM
     { AV_CPU_FLAG_ARMV5TE,   "armv5te"    },
     { AV_CPU_FLAG_ARMV6,     "armv6"      },
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 268e600346..13a335592a 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -363,6 +363,7 @@ static const struct {
     { "SVE",      "sve",      AV_CPU_FLAG_SVE },
     { "SVE2",     "sve2",     AV_CPU_FLAG_SVE2 },
     { "SME",      "sme",      AV_CPU_FLAG_SME },
+    { "SME2",     "sme2",     AV_CPU_FLAG_SME2 },
 #elif ARCH_ARM
     { "ARMV5TE",  "armv5te",  AV_CPU_FLAG_ARMV5TE },
     { "ARMV6",    "armv6",    AV_CPU_FLAG_ARMV6 },
-- 
2.49.1


>From 922130ea9eb4e294581668da55541f3243312f73 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <george.zaguri@gmail.com>
Date: Sun, 4 Jan 2026 16:13:43 +0000
Subject: [PATCH 2/2] aarch64/vvc: sme2 optimisation of alf_filter_luma() 8bit

Apple M4:
vvc_alf_filter_luma_8x8_8_c:                           443.0 ( 1.00x)
vvc_alf_filter_luma_8x8_8_neon:                        118.6 ( 3.74x)
vvc_alf_filter_luma_8x8_8_sme:                         257.6 ( 1.72x)
vvc_alf_filter_luma_16x16_8_c:                        1231.0 ( 1.00x)
vvc_alf_filter_luma_16x16_8_neon:                      503.8 ( 2.44x)
vvc_alf_filter_luma_16x16_8_sme:                       523.0 ( 2.35x)
vvc_alf_filter_luma_32x32_8_c:                        5576.3 ( 1.00x)
vvc_alf_filter_luma_32x32_8_neon:                     2149.3 ( 2.59x)
vvc_alf_filter_luma_32x32_8_sme:                      1108.1 ( 5.03x)
vvc_alf_filter_luma_64x64_8_c:                       22764.9 ( 1.00x)
vvc_alf_filter_luma_64x64_8_neon:                     8587.8 ( 2.65x)
vvc_alf_filter_luma_64x64_8_sme:                      4313.7 ( 5.28x)
vvc_alf_filter_luma_128x128_8_c:                     88816.2 ( 1.00x)
vvc_alf_filter_luma_128x128_8_neon:                  33542.2 ( 2.65x)
vvc_alf_filter_luma_128x128_8_sme:                   17227.9 ( 5.16x)

---
 libavcodec/aarch64/vvc/Makefile     |   1 +
 libavcodec/aarch64/vvc/dsp_init.c   |  25 ++
 libavcodec/aarch64/vvc/inter_sme2.S | 390 ++++++++++++++++++++++++++++
 3 files changed, 416 insertions(+)
 create mode 100644 libavcodec/aarch64/vvc/inter_sme2.S

diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile
index ed80338969..7c336bc031 100644
--- a/libavcodec/aarch64/vvc/Makefile
+++ b/libavcodec/aarch64/vvc/Makefile
@@ -8,3 +8,4 @@ NEON-OBJS-$(CONFIG_VVC_DECODER)         += aarch64/vvc/alf.o \
                                            aarch64/h26x/epel_neon.o \
                                            aarch64/h26x/qpel_neon.o \
                                            aarch64/h26x/sao_neon.o
+SME2-OBJS-$(CONFIG_VVC_DECODER)         += aarch64/vvc/inter_sme2.o
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index aa75d22b78..02f7e3be5e 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -45,6 +45,27 @@ void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrd
 
 void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
 
+/* SME2 ALF luma filter, 8-bit. strides = (src_stride << 32) | dst_stride and
+ * dims = (height << 32) | width, both halves unsigned 32-bit. */
+void ff_vvc_alf_filter_luma_8_sme2(uint8_t *dst, const uint8_t *src, const uint64_t strides,
+                                   const uint64_t dims, const int16_t *filter, const int16_t *clip,
+                                   const int vb_pos);
+
+/* Round x up to a multiple of 4 (for x >= 1). */
+#define ALF_ALIGN_BY_4(x) (4 * (((x) - 1) >> 2u) + 4)
+
+/* Adapts the c->alf.filter[LUMA] signature to the packed-argument asm entry point. */
+static void alf_filter_luma_8_sme2(uint8_t *_dst, ptrdiff_t dst_stride,
+                                   const uint8_t *_src, ptrdiff_t src_stride,
+                                   const int width, const int height,
+                                   const int16_t *filter, const int16_t *clip, const int vb_pos)
+{
+    /* Mask the low halves so a sign-extended value can never spill into the high half. */
+    const uint64_t dims    = ((uint64_t)height << 32) | (uint32_t)ALF_ALIGN_BY_4(width);
+    const uint64_t strides = ((uint64_t)src_stride << 32) | (uint32_t)dst_stride;
+    ff_vvc_alf_filter_luma_8_sme2(_dst, _src, strides, dims, filter, clip, vb_pos);
+}
+
 #define BIT_DEPTH 8
 #include "alf_template.c"
 #undef BIT_DEPTH
@@ -57,6 +78,7 @@ void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshi
 #include "alf_template.c"
 #undef BIT_DEPTH
 
+
 int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
                     const int block_w, const int block_h);
 
@@ -251,6 +273,9 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
             c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
         }
+        if (have_sme2(cpu_flags)) {
+            c->alf.filter[LUMA] = alf_filter_luma_8_sme2;
+        }
     } else if (bd == 10) {
         c->inter.avg = ff_vvc_avg_10_neon;
         c->inter.w_avg = vvc_w_avg_10;
diff --git a/libavcodec/aarch64/vvc/inter_sme2.S b/libavcodec/aarch64/vvc/inter_sme2.S
new file mode 100644
index 0000000000..e0fbacf7b2
--- /dev/null
+++ b/libavcodec/aarch64/vvc/inter_sme2.S
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2025 Georgii Zagoruiko <george.zaguri@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define VVC_MAX_PB_SIZE 128
+
+ENABLE_SME2
+//#if HAVE_SME_I16I64
+//ENABLE_SME_I16I64
+.arch_extension sme-i16i64
+.ifndef sme_entry // NOTE(review): temporary local copy; confirm .ifndef (a symbol test) really guards against an already-defined macro
+.macro sme_entry
+        stp             x29, x30, [sp, #-80]!           // open an 80-byte frame, save fp/lr
+        mov             x29, sp
+        stp             d8, d9, [sp, #16]               // save d8-d15 before streaming mode is entered
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+        stp             d14, d15, [sp, #64]
+        smstart                                         // enter streaming mode; ZA is used by the caller of this macro
+.endm
+.endif
+.ifndef sme_exit // NOTE(review): temporary local copy; confirm .ifndef (a symbol test) really guards against an already-defined macro
+.macro sme_exit
+        smstop                                          // leave streaming mode before touching d8-d15
+        ldp             d8, d9, [sp, #16]               // restore the registers saved by sme_entry
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp, #64]
+        ldp             x29, x30, [sp], #80             // pop the 80-byte frame opened by sme_entry
+.endm
+.endif
+
+
+.macro first_group_filter_luma_8_offsets breg, shift
+        // Byte offsets (x11 = src stride) in x20-x23: p5[0],p3[-1],p1[0],p0[3]
+        //                              and in x24-x27: p6[0],p4[1],p2[0],p0[-3]
+        neg             x26, x11                        // x26 = -stride
+        ubfx            x20, \breg, #(3+\shift), #2     // M: row distance of p5/p6
+        ubfx            x21, \breg, #(1+\shift), #2     // N: row distance of p3/p4
+        mul             x24, x20, x26                   // p6 row = -M*stride
+        mul             x20, x20, x11                   // p5 row = +M*stride
+        mul             x25, x21, x26                   // p4 row = -N*stride
+        mul             x21, x21, x11                   // p3 row = +N*stride
+        ubfx            x22, \breg, #(\shift), #1       // K: row distance of p1/p2
+        sub             x21, x21, #1                    // p3[-1]
+        mul             x26, x22, x26                   // p2 row = -K*stride (x26 no longer -stride)
+        mul             x22, x22, x11                   // p1 row = +K*stride
+        mov             x23, #3                         // p0[3]
+        add             x25, x25, #1                    // p4[1]
+        mov             x27, #-3                        // p0[-3]
+.endm
+
+.macro second_group_filter_luma_8_offsets breg, shift
+        // Byte offsets (x11 = src stride) in x20-x23: p3[ 1],p1[ 2],p1[-1],p0[ 2]
+        //                              and in x24-x27: p4[-1],p2[-2],p2[ 1],p0[-2]
+        neg             x26, x11                        // x26 = -stride
+        ubfx            x20, \breg, #(1+\shift), #2     // N: row distance of p3/p4
+        ubfx            x21, \breg, #(\shift), #1       // K: row distance of p1/p2
+        mul             x24, x20, x26                   // p4 row = -N*stride
+        mul             x20, x20, x11                   // p3 row = +N*stride
+        mul             x25, x21, x26                   // p2 row = -K*stride
+        mul             x26, x21, x26                   // p2 row = -K*stride (x26 no longer -stride)
+        mul             x21, x21, x11                   // p1 row = +K*stride
+        add             x20, x20, #1                    // p3[1]
+        sub             x22, x21, #1                    // p1[-1]
+        add             x21, x21, #2                    // p1[2]
+        mov             x23, #2                         // p0[2]
+        sub             x24, x24, #1                    // p4[-1]
+        sub             x25, x25, #2                    // p2[-2]
+        add             x26, x26, #1                    // p2[1]
+        mov             x27, #-2                        // p0[-2]
+.endm
+
+.macro third_group_filter_luma_8_offsets breg, shift
+        // Byte offsets (x11 = src stride) in x20-x23: p3[0],p1[ 1],p1[-2],p0[ 1]
+        //                              and in x24-x27: p4[0],p2[-1],p2[ 2],p0[-1]
+        neg             x26, x11                        // x26 = -stride
+        ubfx            x21, \breg, #(\shift), #1       // K: row distance of p1/p2
+        ubfx            x20, \breg, #(1+\shift), #2     // N: row distance of p3/p4
+        mul             x24, x20, x26                   // p4[0] = -N*stride; must read x26 before it is overwritten
+        mul             x25, x21, x26                   // p2 row = -K*stride
+        mul             x26, x21, x26                   // p2 row = -K*stride (x26 no longer -stride)
+        mul             x21, x21, x11                   // p1 row = +K*stride
+        mul             x20, x20, x11                   // p3[0] = +N*stride
+        sub             x22, x21, #2                    // p1[-2]
+        add             x21, x21, #1                    // p1[1]
+        mov             x23, #1                         // p0[1]
+        sub             x25, x25, #1                    // p2[-1]
+        mov             x27, #-1                        // p0[-1]
+        add             x26, x26, #2                    // p2[2]
+.endm
+
+.macro first_group_filter_luma_8_sme2 src, zreg, idx // used for all three tap groups; offsets come from x20-x27
+        ld1b            z20.h, p0/z, [\src, x20]        // load 8 neighbour taps, bytes widened to 16-bit lanes
+        ld1b            z21.h, p0/z, [\src, x21]
+        ld1b            z22.h, p0/z, [\src, x22]
+        ld1b            z23.h, p0/z, [\src, x23]
+        ld1b            z24.h, p0/z, [\src, x24]
+        ld1b            z25.h, p0/z, [\src, x25]
+        neg             z8.h, p0/m, \zreg // -p0
+        ld1b            z26.h, p0/z, [\src, x26]
+        ld1b            z27.h, p0/z, [\src, x27]
+        add             {z20.h-z23.h}, {z20.h-z23.h}, z8.h // tap - current pixel
+        add             {z24.h-z27.h}, {z24.h-z27.h}, z8.h
+        // transpose data vectors
+        zip             {z20.h-z23.h}, {z20.h-z23.h}
+        zip             {z24.h-z27.h}, {z24.h-z27.h}
+        // clip data
+        sclamp          z20.h, z16.h, z12.h             // clamp to [z16..z19, z12..z15]; the caller sets z16-z19 = -(z12-z15)
+        sclamp          z24.h, z16.h, z12.h
+        sclamp          z21.h, z17.h, z13.h
+        sclamp          z25.h, z17.h, z13.h
+        sclamp          z22.h, z18.h, z14.h
+        sclamp          z26.h, z18.h, z14.h
+        sclamp          z23.h, z19.h, z15.h
+        sclamp          z27.h, z19.h, z15.h
+        sdot            za.d[w10, \idx], {z20.h-z23.h}, {z28.h-z31.h} // accumulate taps * z28-z31 into ZA group \idx
+        sdot            za.d[w10, \idx], {z24.h-z27.h}, {z28.h-z31.h}
+.endm
+
+function ff_vvc_alf_filter_luma_8_sme2, export=1
+        // dst           .req x0
+        // src           .req x1
+        // strides       .req x2   (src stride << 32) | dst stride
+        // dims          .req x3   (height << 32) | width
+        // filter        .req x4
+        // clip          .req x5
+        // vb            .req x6
+        sme_entry
+        stp             x29, x30, [sp, #-96]!           // second frame for callee-saved x19-x28
+        mov             x29, sp
+        stp             x19, x20, [sp, #16]
+        stp             x21, x22, [sp, #32]
+        stp             x23, x24, [sp, #48]
+        stp             x25, x26, [sp, #64]
+        stp             x27, x28, [sp, #80]
+
+        lsr             x7, x3, #32                     // x7 = height = rows left
+        cnth            x11                             // x11 = 16-bit lanes per vector
+        mov             w8, w3                          // w8 = width = columns left
+        sub             w9, w8, #1
+        sdiv            w9, w9, w11
+        msub            w9, w9, w11, w8                 // w9 = width - ((width-1)/lanes)*lanes: first chunk width
+        whilelo         p10.h, xzr, x9                  // p10 = predicate covering that first chunk
+        ptrue           p1.h
+        lsr             x11, x2, #32 // src stride
+        lsr             w2, w2, #0 // leave dst stride only
+        mov             w10, #0                         // ZA slice index base (also source of the zero vector)
+        mov             w12, #255
+        dup             z9.h, w10                       // z9 = 0, lower clamp bound for the output
+        dup             z10.h, w12                      // z10 = 255, upper clamp bound for the output
+1:                                                      // row loop: 4 rows per iteration
+        lsr             x20, x3, #32
+        mov             p0.b, p10.b                     // restart each row group with the first-chunk predicate
+        sub             w20, w20, w7                    // w20 = y = height - rows left
+        mov             w12, w9                         // w12 = current chunk width
+        sub             w6, w6, #6                      // w6 = vb_pos - 6, stepped by 1 after each compare below
+        // offsets are packed into the format: (M<<3)|(N<<1)|K, where M is p5/p6 offset (multiply), N is p3/p4 offset, K is p1/p2 offset
+        mov             w21, #0                         // M = N = K = 0
+        mov             w22, #0xB                       // (1<<3)|(1<<1)|1
+        mov             w23, #0x15                      // (2<<3)|(2<<1)|1
+        mov             w13, #0x1D // 0x1D == (3<<3)|(2<<1)|1
+        mov             w14, #0x1D                      // w13..w16: packed offsets for rows y..y+3
+        mov             w15, #0x1D
+        mov             w16, #0x1D
+        // y == vb_pos - 6
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w16, w16, w23, ne
+        // y == vb_pos - 5
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w15, w15, w23, ne
+        csel            w16, w16, w22, ne
+        // y == vb_pos - 4
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w14, w14, w23, ne
+        csel            w15, w15, w22, ne
+        csel            w16, w16, w21, ne
+        // y == vb_pos - 3
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w23, ne
+        csel            w14, w14, w22, ne
+        csel            w15, w15, w21, ne
+        csel            w16, w16, w21, ne
+        // y == vb_pos - 2
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w22, ne
+        csel            w14, w14, w21, ne
+        csel            w15, w15, w21, ne
+        csel            w16, w16, w22, ne
+        // y == vb_pos - 1
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w21, ne
+        csel            w14, w14, w21, ne
+        csel            w15, w15, w22, ne
+        csel            w16, w16, w23, ne
+        // y == vb_pos
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w21, ne
+        csel            w14, w14, w22, ne
+        csel            w15, w15, w23, ne
+        // y == vb_pos + 1
+        cmp             w20, w6
+        add             w6, w6, #1
+        csel            w13, w13, w22, ne
+        csel            w14, w14, w23, ne
+        // y == vb_pos + 2
+        cmp             w20, w6
+        sub             w6, w6, #2                      // net change of w6 over this ladder is zero
+        csel            w13, w13, w23, ne
+        orr             w13, w13, w14, lsl #8           // pack the four rows' offsets into x13, 8 bits per row
+        orr             w13, w13, w15, lsl #16
+        orr             w13, w13, w16, lsl #24
+        mov             x14, x1                         // x14 = src cursor, row 0 of this group
+        mov             x19, x0                         // x19 = dst cursor, row 0 of this group
+2:                                                      // column loop: one vector-wide chunk per iteration
+        // Load clip [12=>3x4 memory layout]
+        ld3h            {z0.h-z2.h}, p0/z, [x5]
+        // Load filter [12=>3x4 memory layout]
+        ld3h            {z3.h-z5.h}, p0/z, [x4]
+        add             x15, x14, x11                   // x15 = src row 1
+        add             x16, x14, x11, lsl #1           // x16 = src row 2
+        add             x17, x15, x11, lsl #1           // x17 = src row 3
+        add             x30, x19, x2, lsl #1            // x30 = dst row 2
+
+        mov             z12.d, z0.d
+        mov             z13.d, z0.d
+        mov             z14.d, z0.d
+        mov             z15.d, z0.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z3.d
+        mov             z29.d, z3.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z3.d
+        mov             z31.d, z3.d
+        neg             z16.h, p1/m, z12.h              // -clip
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+        // p0 (curr)
+        ld1b            z6.h, p0/z, [x14]               // current pixels, rows 0..3 -> z6, z7, z0, z3
+        ld1b            z7.h, p0/z, [x15]
+        ld1b            z0.h, p0/z, [x16]               // z0 is free again: clip group 0 was copied to z12-z15
+        ld1b            z3.h, p0/z, [x17]               // z3 is free again: filter group 0 was copied to z28-z31
+        // clip & filter (first group): a0,a3,a6,a9, a12...
+        // {p5[0],p3[-1],p1[0],p0[3]} -> left operand in clip
+        // {p6[0],p4[1],p2[0],p0[-3]} -> right operand in clip
+        first_group_filter_luma_8_offsets x13, 0
+        first_group_filter_luma_8_sme2 x14, z6.h, 0
+        first_group_filter_luma_8_offsets x13, 8
+        first_group_filter_luma_8_sme2 x15, z7.h, 1
+        first_group_filter_luma_8_offsets x13, 16
+        first_group_filter_luma_8_sme2 x16, z0.h, 2
+        first_group_filter_luma_8_offsets x13, 24
+        first_group_filter_luma_8_sme2 x17, z3.h, 3
+
+        mov             z12.d, z1.d
+        mov             z13.d, z1.d
+        mov             z14.d, z1.d
+        mov             z15.d, z1.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z4.d
+        mov             z29.d, z4.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z4.d
+        mov             z31.d, z4.d
+        // -clip
+        neg             z16.h, p1/m, z12.h
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+        // clip & filter (second group): a1,a4,a7,a10,a13...
+        // left:  {p3[ 1],p1[ 2],p1[-1],p0[ 2]}
+        // right: {p4[-1],p2[-2],p2[ 1],p0[-2]}
+        second_group_filter_luma_8_offsets x13, 0
+        first_group_filter_luma_8_sme2 x14, z6.h, 0
+        second_group_filter_luma_8_offsets x13, 8
+        first_group_filter_luma_8_sme2 x15, z7.h, 1
+        second_group_filter_luma_8_offsets x13, 16
+        first_group_filter_luma_8_sme2 x16, z0.h, 2
+        second_group_filter_luma_8_offsets x13, 24
+        first_group_filter_luma_8_sme2 x17, z3.h, 3
+
+        mov             z12.d, z2.d
+        mov             z13.d, z2.d
+        mov             z14.d, z2.d
+        mov             z15.d, z2.d
+        // copy filter into 4 vectors and then zip
+        mov             z28.d, z5.d
+        mov             z29.d, z5.d
+        zip             {z12.d-z15.d}, {z12.d-z15.d}
+        mov             z30.d, z5.d
+        mov             z31.d, z5.d
+        // -clip
+        neg             z16.h, p1/m, z12.h
+        neg             z17.h, p1/m, z13.h
+        neg             z18.h, p1/m, z14.h
+        neg             z19.h, p1/m, z15.h
+        zip             {z28.d-z31.d}, {z28.d-z31.d}
+        // clip & filter (third group): a2,a5,a8,a11,a14...
+        // left:  {p3[0],p1[ 1],p1[-2],p0[ 1]}
+        // right: {p4[0],p2[-1],p2[ 2],p0[-1]}
+        third_group_filter_luma_8_offsets x13, 0
+        first_group_filter_luma_8_sme2 x14, z6.h, 0
+        third_group_filter_luma_8_offsets x13, 8
+        first_group_filter_luma_8_sme2 x15, z7.h, 1
+        third_group_filter_luma_8_offsets x13, 16
+        first_group_filter_luma_8_sme2 x16, z0.h, 2
+        third_group_filter_luma_8_offsets x13, 24
+        first_group_filter_luma_8_sme2 x17, z3.h, 3
+        mova            {z16.d-z19.d}, za.d[w10, 0]     // fetch the 64-bit sums of rows 0..3 from ZA
+        mova            {z20.d-z23.d}, za.d[w10, 1]
+        mova            {z24.d-z27.d}, za.d[w10, 2]
+        mova            {z28.d-z31.d}, za.d[w10, 3]
+        sqrshr          z12.h, {z16.d-z19.d}, #7        // default: rounding shift by 7, narrowed to 16 bit
+        sqrshr          z13.h, {z20.d-z23.d}, #7
+        sqrshr          z14.h, {z24.d-z27.d}, #7
+        sqrshr          z15.h, {z28.d-z31.d}, #7
+        tbnz            x13, #0, 10f                    // rows whose K bit is 0 are redone with shift 10
+        sqrshr          z12.h, {z16.d-z19.d}, #10
+10:
+        tbnz            x13, #8, 11f
+        sqrshr          z13.h, {z20.d-z23.d}, #10
+11:
+        tbnz            x13, #16, 12f
+        sqrshr          z14.h, {z24.d-z27.d}, #10
+12:
+        tbnz            x13, #24, 13f
+        sqrshr          z15.h, {z28.d-z31.d}, #10
+13:
+        add             z12.h, z12.h, z6.h              // add the current pixel back
+        add             z13.h, z13.h, z7.h
+        add             z14.h, z14.h, z0.h
+        add             z15.h, z15.h, z3.h
+        sclamp          {z12.h-z15.h}, z9.h, z10.h      // clamp to [0, 255]
+        st1b            z12.h, p0, [x19]                // store dst rows 0..3
+        st1b            z13.h, p0, [x19, x2]
+        st1b            z14.h, p0, [x30]
+        st1b            z15.h, p0, [x30, x2]
+        zero            {za}                            // clear the accumulators for the next chunk
+        add             x14, x14, x12                   // advance src/dst cursors by the chunk width
+        add             x19, x19, x12
+        ptrue           p0.h                            // every chunk after the first is a full vector
+        subs            w8, w8, w12                     // columns left; flags are consumed by b.gt below
+        add             w12, w12, w12, lsl #1           // w12 = 3 * chunk width
+        add             x4, x4, x12, lsl #1             // advance filter by 3 * chunk int16 values
+        add             x5, x5, x12, lsl #1             // advance clip by the same amount
+        cnth            x12                             // next chunk width = lanes per vector
+        b.gt            2b
+        mov             w8, w3                          // reset columns left for the next row group
+        subs            w7, w7, #4                      // four rows done
+        add             x1, x1, x11, lsl #2             // src += 4 * src stride
+        add             x0, x0, x2, lsl #2              // dst += 4 * dst stride
+        b.gt            1b
+
+        ldp             x19, x20, [sp, #16]
+        ldp             x21, x22, [sp, #32]
+        ldp             x23, x24, [sp, #48]
+        ldp             x25, x26, [sp, #64]
+        ldp             x27, x28, [sp, #80]
+        ldp             x29, x30, [sp], #96             // pop the second frame
+        sme_exit
+        ret
+endfunc
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2026-01-04 16:34 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-04 16:33 [FFmpeg-devel] [PR] feat/sme2_alf_filter_luma_8bit (PR #21373) george.zaguri via ffmpeg-devel

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git