* [FFmpeg-devel] [PR] feat/sme2_alf_filter_luma_8bit (PR #21373)
@ 2026-01-04 16:33 george.zaguri via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: george.zaguri via ffmpeg-devel @ 2026-01-04 16:33 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: george.zaguri
PR #21373 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21373
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21373.patch
NOTE:
1) The sme_entry / sme_exit macros are defined locally in this patch because the related PR (https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21194) is not merged yet.
2) There is an issue with detecting the i16i64 extension: the name "sme-i16i64" contains a dash, which makes configure fail. I would appreciate help solving this issue.
Apple M4:
vvc_alf_filter_luma_8x8_8_c: 443.0 ( 1.00x)
vvc_alf_filter_luma_8x8_8_neon: 118.6 ( 3.74x)
vvc_alf_filter_luma_8x8_8_sme: 257.6 ( 1.72x)
vvc_alf_filter_luma_16x16_8_c: 1231.0 ( 1.00x)
vvc_alf_filter_luma_16x16_8_neon: 503.8 ( 2.44x)
vvc_alf_filter_luma_16x16_8_sme: 523.0 ( 2.35x)
vvc_alf_filter_luma_32x32_8_c: 5576.3 ( 1.00x)
vvc_alf_filter_luma_32x32_8_neon: 2149.3 ( 2.59x)
vvc_alf_filter_luma_32x32_8_sme: 1108.1 ( 5.03x)
vvc_alf_filter_luma_64x64_8_c: 22764.9 ( 1.00x)
vvc_alf_filter_luma_64x64_8_neon: 8587.8 ( 2.65x)
vvc_alf_filter_luma_64x64_8_sme: 4313.7 ( 5.28x)
vvc_alf_filter_luma_128x128_8_c: 88816.2 ( 1.00x)
vvc_alf_filter_luma_128x128_8_neon: 33542.2 ( 2.65x)
vvc_alf_filter_luma_128x128_8_sme: 17227.9 ( 5.16x)
>From e22a3fd8c37f5d1066c22c6cd961cde3ede21bd9 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <george.zaguri@gmail.com>
Date: Sun, 4 Jan 2026 14:54:14 +0000
Subject: [PATCH 1/2] configure: aarch64/sme2 support
---
Makefile | 2 +-
configure | 8 +++++++-
ffbuild/arch.mak | 1 +
libavutil/aarch64/asm.S | 9 +++++++++
libavutil/aarch64/cpu.c | 13 +++++++++++++
libavutil/aarch64/cpu.h | 1 +
libavutil/cpu.h | 1 +
libavutil/tests/cpu.c | 1 +
tests/checkasm/checkasm.c | 1 +
9 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/Makefile b/Makefile
index f563a37fca..c290ad0a04 100644
--- a/Makefile
+++ b/Makefile
@@ -111,7 +111,7 @@ SUBDIR_VARS := CLEANFILES FFLIBS HOSTPROGS TESTPROGS TOOLS \
MIPSFPU-OBJS MIPSDSPR2-OBJS MIPSDSP-OBJS MSA-OBJS \
MMI-OBJS LSX-OBJS LASX-OBJS RV-OBJS RVV-OBJS RVVB-OBJS \
OBJS SHLIBOBJS STLIBOBJS HOSTOBJS TESTOBJS SIMD128-OBJS \
- SVE-OBJS SVE2-OBJS SME-OBJS
+ SVE-OBJS SVE2-OBJS SME-OBJS SME2-OBJS
define RESET
$(1) :=
diff --git a/configure b/configure
index 083a30972a..2115c447f2 100755
--- a/configure
+++ b/configure
@@ -480,6 +480,7 @@ Optimization options (experts only):
--disable-sve disable SVE optimizations
--disable-sve2 disable SVE2 optimizations
--disable-sme disable SME optimizations
+ --disable-sme2 disable SME2 optimizations
--disable-inline-asm disable use of inline assembly
--disable-x86asm disable use of standalone x86 assembly
--disable-mipsdsp disable MIPS DSP ASE R1 optimizations
@@ -2230,6 +2231,7 @@ ARCH_EXT_LIST_ARM="
sve
sve2
sme
+ sme2
"
ARCH_EXT_LIST_MIPS="
@@ -2498,6 +2500,7 @@ TOOLCHAIN_FEATURES="
as_archext_sve_directive
as_archext_sve2_directive
as_archext_sme_directive
+ as_archext_sme2_directive
as_dn_directive
as_fpu_directive
as_func
@@ -2831,6 +2834,7 @@ i8mm_deps="aarch64 neon"
sve_deps="aarch64 neon"
sve2_deps="aarch64 neon sve"
sme_deps="aarch64 neon sve sve2"
+sme2_deps="aarch64 neon sve sve2 sme"
map 'eval ${v}_inline_deps=inline_asm' $ARCH_EXT_LIST_ARM
@@ -6455,12 +6459,13 @@ if enabled aarch64; then
# internal assembler in clang 3.3 does not support this instruction
enabled neon && check_insn neon 'ext v0.8B, v0.8B, v1.8B, #1'
- archext_list="dotprod i8mm sve sve2 sme"
+ archext_list="dotprod i8mm sve sve2 sme sme2"
enabled dotprod && check_archext_insn dotprod 'udot v0.4s, v0.16b, v0.16b'
enabled i8mm && check_archext_insn i8mm 'usdot v0.4s, v0.16b, v0.16b'
enabled sve && check_archext_insn sve 'whilelt p0.s, x0, x1'
enabled sve2 && check_archext_insn sve2 'sqrdmulh z0.s, z0.s, z0.s'
enabled sme && check_archext_insn sme 'smstart' 'cntb x0'
+ enabled sme2 && check_archext_insn sme2 'smstart' 'sdot za.s[w10, 0], {z0.b-z3.b}, {z4.b-z7.b}'
# Disable the main feature (e.g. HAVE_NEON) if neither inline nor external
# assembly support the feature out of the box. Skip this for the features
@@ -8222,6 +8227,7 @@ if enabled aarch64; then
echo "SVE enabled ${sve-no}"
echo "SVE2 enabled ${sve2-no}"
echo "SME enabled ${sme-no}"
+ echo "SME2 enabled ${sme2-no}"
fi
if enabled arm; then
echo "ARMv5TE enabled ${armv5te-no}"
diff --git a/ffbuild/arch.mak b/ffbuild/arch.mak
index 83d6bf276f..13e1eb33bc 100644
--- a/ffbuild/arch.mak
+++ b/ffbuild/arch.mak
@@ -6,6 +6,7 @@ OBJS-$(HAVE_NEON) += $(NEON-OBJS) $(NEON-OBJS-yes)
OBJS-$(HAVE_SVE) += $(SVE-OBJS) $(SVE-OBJS-yes)
OBJS-$(HAVE_SVE2) += $(SVE2-OBJS) $(SVE2-OBJS-yes)
OBJS-$(HAVE_SME) += $(SME-OBJS) $(SME-OBJS-yes)
+OBJS-$(HAVE_SME2) += $(SME2-OBJS) $(SME2-OBJS-yes)
OBJS-$(HAVE_MIPSFPU) += $(MIPSFPU-OBJS) $(MIPSFPU-OBJS-yes)
OBJS-$(HAVE_MIPSDSP) += $(MIPSDSP-OBJS) $(MIPSDSP-OBJS-yes)
diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S
index 77cea57cfc..7d59ed8199 100644
--- a/libavutil/aarch64/asm.S
+++ b/libavutil/aarch64/asm.S
@@ -80,11 +80,20 @@
#define DISABLE_SME
#endif
+#if HAVE_AS_ARCHEXT_SME2_DIRECTIVE
+#define ENABLE_SME2 .arch_extension sme2
+#define DISABLE_SME2 .arch_extension nosme2
+#else
+#define ENABLE_SME2
+#define DISABLE_SME2
+#endif
+
DISABLE_DOTPROD
DISABLE_I8MM
DISABLE_SVE
DISABLE_SVE2
DISABLE_SME
+DISABLE_SME2
/* Support macros for
diff --git a/libavutil/aarch64/cpu.c b/libavutil/aarch64/cpu.c
index f93ff08fb5..1ad417c141 100644
--- a/libavutil/aarch64/cpu.c
+++ b/libavutil/aarch64/cpu.c
@@ -29,6 +29,7 @@
#define HWCAP2_AARCH64_SVE2 (1 << 1)
#define HWCAP2_AARCH64_I8MM (1 << 13)
#define HWCAP2_AARCH64_SME (1 << 23)
+#define HWCAP2_AARCH64_SME2 (1ULL << 37) /* bit 37 does not fit in a 32-bit int, so the constant must be 64-bit */
static int detect_flags(void)
{
@@ -47,6 +48,8 @@ static int detect_flags(void)
flags |= AV_CPU_FLAG_I8MM;
if (hwcap2 & HWCAP2_AARCH64_SME)
flags |= AV_CPU_FLAG_SME;
+ if (hwcap2 & HWCAP2_AARCH64_SME2)
+ flags |= AV_CPU_FLAG_SME2;
return flags;
}
@@ -72,6 +75,8 @@ static int detect_flags(void)
flags |= AV_CPU_FLAG_I8MM;
if (have_feature("hw.optional.arm.FEAT_SME"))
flags |= AV_CPU_FLAG_SME;
+ if (have_feature("hw.optional.arm.FEAT_SME2"))
+ flags |= AV_CPU_FLAG_SME2;
return flags;
}
@@ -143,6 +148,11 @@ static int detect_flags(void)
if (IsProcessorFeaturePresent(PF_ARM_SME_INSTRUCTIONS_AVAILABLE))
flags |= AV_CPU_FLAG_SME;
#endif
+
+#ifdef PF_ARM_SME2_INSTRUCTIONS_AVAILABLE
+ if (IsProcessorFeaturePresent(PF_ARM_SME2_INSTRUCTIONS_AVAILABLE))
+ flags |= AV_CPU_FLAG_SME2;
+#endif
return flags;
}
#else
@@ -174,6 +184,9 @@ int ff_get_cpu_flags_aarch64(void)
#ifdef __ARM_FEATURE_SME
flags |= AV_CPU_FLAG_SME;
#endif
+#ifdef __ARM_FEATURE_SME2
+ flags |= AV_CPU_FLAG_SME2;
+#endif
flags |= detect_flags();
diff --git a/libavutil/aarch64/cpu.h b/libavutil/aarch64/cpu.h
index 62d5eb768f..731a0a1a35 100644
--- a/libavutil/aarch64/cpu.h
+++ b/libavutil/aarch64/cpu.h
@@ -30,6 +30,7 @@
#define have_sve(flags) CPUEXT(flags, SVE)
#define have_sve2(flags) CPUEXT(flags, SVE2)
#define have_sme(flags) CPUEXT(flags, SME)
+#define have_sme2(flags) CPUEXT(flags, SME2)
#if HAVE_SVE
int ff_aarch64_sve_length(void);
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index 87cecd0424..4c7a6edd37 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -77,6 +77,7 @@
#define AV_CPU_FLAG_SVE (1 <<10)
#define AV_CPU_FLAG_SVE2 (1 <<11)
#define AV_CPU_FLAG_SME (1 <<12)
+#define AV_CPU_FLAG_SME2 (1 <<14)
#define AV_CPU_FLAG_SETEND (1 <<16)
#define AV_CPU_FLAG_MMI (1 << 0)
diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c
index c63b7e7d53..acbe67d388 100644
--- a/libavutil/tests/cpu.c
+++ b/libavutil/tests/cpu.c
@@ -49,6 +49,7 @@ static const struct {
{ AV_CPU_FLAG_SVE, "sve" },
{ AV_CPU_FLAG_SVE2, "sve2" },
{ AV_CPU_FLAG_SME, "sme" },
+ { AV_CPU_FLAG_SME2, "sme2" },
#elif ARCH_ARM
{ AV_CPU_FLAG_ARMV5TE, "armv5te" },
{ AV_CPU_FLAG_ARMV6, "armv6" },
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 268e600346..13a335592a 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -363,6 +363,7 @@ static const struct {
{ "SVE", "sve", AV_CPU_FLAG_SVE },
{ "SVE2", "sve2", AV_CPU_FLAG_SVE2 },
{ "SME", "sme", AV_CPU_FLAG_SME },
+ { "SME2", "sme2", AV_CPU_FLAG_SME2 },
#elif ARCH_ARM
{ "ARMV5TE", "armv5te", AV_CPU_FLAG_ARMV5TE },
{ "ARMV6", "armv6", AV_CPU_FLAG_ARMV6 },
--
2.49.1
>From 922130ea9eb4e294581668da55541f3243312f73 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <george.zaguri@gmail.com>
Date: Sun, 4 Jan 2026 16:13:43 +0000
Subject: [PATCH 2/2] aarch64/vvc: sme2 optimisation of alf_filter_luma() 8bit
Apple M4: vvc_alf_filter_luma_8x8_8_c: 443.0 (
1.00x) vvc_alf_filter_luma_8x8_8_neon: 118.6 ( 3.74x)
vvc_alf_filter_luma_8x8_8_sme: 257.6 ( 1.72x)
vvc_alf_filter_luma_16x16_8_c: 1231.0 ( 1.00x)
vvc_alf_filter_luma_16x16_8_neon: 503.8 ( 2.44x)
vvc_alf_filter_luma_16x16_8_sme: 523.0 ( 2.35x)
vvc_alf_filter_luma_32x32_8_c: 5576.3 ( 1.00x)
vvc_alf_filter_luma_32x32_8_neon: 2149.3 ( 2.59x)
vvc_alf_filter_luma_32x32_8_sme: 1108.1 ( 5.03x)
vvc_alf_filter_luma_64x64_8_c: 22764.9 ( 1.00x)
vvc_alf_filter_luma_64x64_8_neon: 8587.8 ( 2.65x)
vvc_alf_filter_luma_64x64_8_sme: 4313.7 ( 5.28x)
vvc_alf_filter_luma_128x128_8_c: 88816.2 ( 1.00x)
vvc_alf_filter_luma_128x128_8_neon: 33542.2 ( 2.65x)
vvc_alf_filter_luma_128x128_8_sme: 17227.9 ( 5.16x)
---
libavcodec/aarch64/vvc/Makefile | 1 +
libavcodec/aarch64/vvc/dsp_init.c | 25 ++
libavcodec/aarch64/vvc/inter_sme2.S | 390 ++++++++++++++++++++++++++++
3 files changed, 416 insertions(+)
create mode 100644 libavcodec/aarch64/vvc/inter_sme2.S
diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile
index ed80338969..7c336bc031 100644
--- a/libavcodec/aarch64/vvc/Makefile
+++ b/libavcodec/aarch64/vvc/Makefile
@@ -8,3 +8,4 @@ NEON-OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/alf.o \
aarch64/h26x/epel_neon.o \
aarch64/h26x/qpel_neon.o \
aarch64/h26x/sao_neon.o
+SME2-OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/inter_sme2.o
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index aa75d22b78..02f7e3be5e 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -45,6 +45,27 @@ void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrd
void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
+void ff_vvc_alf_filter_luma_8_sme2(uint8_t *dst, const uint8_t *src, const uint64_t strides,
+ const uint64_t dims, const int16_t *filter, const int16_t *clip,
+ const int vb_pos);
+
+#define ALF_ALIGN_BY_4(x) (4 * (((x) - 1) >> 2u) + 4) /* round x (>= 1) up to the next multiple of 4 */
+/* Pack width/height and both strides into two 64-bit words and call the SME2 luma ALF kernel. */
+static void alf_filter_luma_8_sme2(uint8_t *_dst,
+ ptrdiff_t dst_stride,
+ const uint8_t *_src,
+ ptrdiff_t src_stride,
+ const int width, const int height,
+ const int16_t *filter,
+ const int16_t *clip,
+ const int vb_pos)
+{
+ int aligned_width = ALF_ALIGN_BY_4(width); // round the width up to a multiple of 4
+ uint64_t dims = ((uint64_t)height << 32u) | (uint32_t)aligned_width; // height in bits 63:32, width in bits 31:0
+ uint64_t strides = ((uint64_t)src_stride << 32u) | (uint32_t)dst_stride; // low word is masked so dst_stride cannot spill into src_stride; NOTE(review): both strides are assumed to be non-negative and below 2^32
+ ff_vvc_alf_filter_luma_8_sme2(_dst, _src, strides, dims, filter, clip, vb_pos);
+}
+
#define BIT_DEPTH 8
#include "alf_template.c"
#undef BIT_DEPTH
@@ -57,6 +78,7 @@ void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshi
#include "alf_template.c"
#undef BIT_DEPTH
+
int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
const int block_w, const int block_h);
@@ -251,6 +273,9 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
}
+ if (have_sme2(cpu_flags)) {
+ c->alf.filter[LUMA] = alf_filter_luma_8_sme2;
+ }
} else if (bd == 10) {
c->inter.avg = ff_vvc_avg_10_neon;
c->inter.w_avg = vvc_w_avg_10;
diff --git a/libavcodec/aarch64/vvc/inter_sme2.S b/libavcodec/aarch64/vvc/inter_sme2.S
new file mode 100644
index 0000000000..e0fbacf7b2
--- /dev/null
+++ b/libavcodec/aarch64/vvc/inter_sme2.S
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2025 Georgii Zagoruiko <george.zaguri@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define VVC_MAX_PB_SIZE 128
+
+ENABLE_SME2
+//#if HAVE_SME_I16I64
+//ENABLE_SME_I16I64
+.arch_extension sme-i16i64
+.ifndef sme_entry // NOTE(review): .ifndef tests assembler symbols, not macro names; confirm this guard really prevents a redefinition once asm.S provides the macro
+.macro sme_entry
+ stp x29, x30, [sp, #-80]! // open an 80-byte frame: x29/x30 at offset 0
+ mov x29, sp
+ stp d8, d9, [sp, #16] // d8-d15 are stored at offsets 16..79
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ smstart // issued only after d8-d15 have been stored
+.endm
+.endif
+.ifndef sme_exit // NOTE(review): same concern as for the sme_entry guard
+.macro sme_exit
+ smstop // issued before d8-d15 are reloaded
+ ldp d8, d9, [sp, #16] // mirror image of sme_entry: reload d8-d15 from offsets 16..79
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+ ldp x29, x30, [sp], #80 // reload x29/x30 and release the 80-byte frame
+.endm
+.endif
+
+
+.macro first_group_filter_luma_8_offsets breg, shift
+ // x20-x23: p5[0],p3[-1],p1[0],p0[3]
+ // x24-x27: p6[0],p4[1],p2[0],p0[-3]
+ neg x26, x11 // x26 = -x11; x11 holds the src stride in the calling function
+ ubfx x20, \breg, #(3+\shift), #2 // M: 2-bit field at bit 3 of the code that starts at bit \shift
+ ubfx x21, \breg, #(1+\shift), #2 // N: 2-bit field at bit 1 of that code
+ mul x24, x20, x26 // x24 = -M*stride
+ mul x20, x20, x11 // x20 = M*stride
+ mul x25, x21, x26 // x25 = -N*stride
+ mul x21, x21, x11 // x21 = N*stride
+ ubfx x22, \breg, #(\shift), #1 // K: 1-bit field at bit 0 of that code
+ sub x21, x21, #1 // x21 = N*stride - 1
+ mul x26, x22, x26 // x26 = -K*stride; last use of the -stride temporary
+ mul x22, x22, x11 // x22 = K*stride
+ mov x23, #3 // same row, 3 bytes to the right
+ add x25, x25, #1 // x25 = -N*stride + 1
+ mov x27, #-3 // same row, 3 bytes to the left
+.endm
+
+.macro second_group_filter_luma_8_offsets breg, shift
+ // x20-x23: p3[ 1],p1[ 2],p1[-1],p0[ 2]
+ // x24-x27: p4[-1],p2[-2],p2[ 1],p0[-2]
+ neg x26, x11 // x26 = -x11; x11 holds the src stride in the calling function
+ ubfx x20, \breg, #(1+\shift), #2 // N: 2-bit field at bit 1 of the code that starts at bit \shift
+ ubfx x21, \breg, #(\shift), #1 // K: 1-bit field at bit 0 of that code
+ mul x24, x20, x26 // x24 = -N*stride
+ mul x20, x20, x11 // x20 = N*stride
+ mul x25, x21, x26 // x25 = -K*stride
+ mul x26, x21, x26 // x26 = -K*stride; last use of the -stride temporary
+ mul x21, x21, x11 // x21 = K*stride
+ add x20, x20, #1 // x20 = N*stride + 1
+ sub x22, x21, #1 // x22 = K*stride - 1, taken before x21 is adjusted
+ add x21, x21, #2 // x21 = K*stride + 2
+ mov x23, #2 // same row, 2 bytes to the right
+ sub x24, x24, #1 // x24 = -N*stride - 1
+ sub x25, x25, #2 // x25 = -K*stride - 2
+ add x26, x26, #1 // x26 = -K*stride + 1
+ mov x27, #-2 // same row, 2 bytes to the left
+.endm
+
+.macro third_group_filter_luma_8_offsets breg, shift
+ // x20-x23: p3[0],p1[ 1],p1[-2],p0[ 1]
+ // x24-x27: p4[0],p2[-1],p2[ 2],p0[-1]
+ neg x26, x11 // x26 = -x11; x11 holds the src stride in the calling function
+ ubfx x21, \breg, #(\shift), #1 // K: 1-bit field at bit 0 of the code that starts at bit \shift
+ ubfx x20, \breg, #(1+\shift), #2 // N: 2-bit field at bit 1 of that code
+ mul x24, x20, x26 // x24 = -N*stride; must read x26 while it still holds -stride
+ mul x25, x21, x26 // x25 = -K*stride
+ mul x26, x21, x26 // x26 = -K*stride; the -stride temporary is gone after this line
+ mul x21, x21, x11 // x21 = K*stride
+ mul x20, x20, x11 // x20 = N*stride
+ sub x22, x21, #2 // x22 = K*stride - 2, taken before x21 is adjusted
+ add x21, x21, #1 // x21 = K*stride + 1
+ mov x23, #1 // same row, 1 byte to the right
+ sub x25, x25, #1 // x25 = -K*stride - 1
+ mov x27, #-1 // same row, 1 byte to the left
+ add x26, x26, #2 // x26 = -K*stride + 2
+.endm
+
+.macro first_group_filter_luma_8_sme2 src, zreg, idx
+ ld1b z20.h, p0/z, [\src, x20] // load eight neighbour vectors (bytes widened to halfwords)
+ ld1b z21.h, p0/z, [\src, x21] // from \src plus the byte offsets held in x20-x27
+ ld1b z22.h, p0/z, [\src, x22]
+ ld1b z23.h, p0/z, [\src, x23]
+ ld1b z24.h, p0/z, [\src, x24]
+ ld1b z25.h, p0/z, [\src, x25]
+ neg z8.h, p0/m, \zreg // -p0
+ ld1b z26.h, p0/z, [\src, x26]
+ ld1b z27.h, p0/z, [\src, x27]
+ add {z20.h-z23.h}, {z20.h-z23.h}, z8.h // neighbour minus centre pixel
+ add {z24.h-z27.h}, {z24.h-z27.h}, z8.h
+ // transpose data vectors
+ zip {z20.h-z23.h}, {z20.h-z23.h}
+ zip {z24.h-z27.h}, {z24.h-z27.h}
+ // clip data
+ sclamp z20.h, z16.h, z12.h // bounds come from z16-z19 and z12-z15, prepared by the caller
+ sclamp z24.h, z16.h, z12.h
+ sclamp z21.h, z17.h, z13.h
+ sclamp z25.h, z17.h, z13.h
+ sclamp z22.h, z18.h, z14.h
+ sclamp z26.h, z18.h, z14.h
+ sclamp z23.h, z19.h, z15.h
+ sclamp z27.h, z19.h, z15.h
+ sdot za.d[w10, \idx], {z20.h-z23.h}, {z28.h-z31.h} // accumulate both halves against z28-z31
+ sdot za.d[w10, \idx], {z24.h-z27.h}, {z28.h-z31.h} // into the same ZA group \idx
+.endm
+
+function ff_vvc_alf_filter_luma_8_sme2, export=1
+ // dst .req x0
+ // src .req x1
+ // strides .req x2 (src stride in bits 63:32, dst stride in bits 31:0)
+ // dims .req x3 (height in bits 63:32, width in bits 31:0)
+ // filter .req x4
+ // clip .req x5
+ // vb .req x6
+ sme_entry
+ stp x29, x30, [sp, #-96]! // second frame: x19-x28 are used as scratch below
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp x25, x26, [sp, #64]
+ stp x27, x28, [sp, #80]
+
+ lsr x7, x3, #32 // x7 = height, used as the rows-left counter
+ cnth x11 // halfword lanes per vector
+ mov w8, w3 // w8 = width, used as the columns-left counter
+ sub w9, w8, #1
+ sdiv w9, w9, w11
+ msub w9, w9, w11, w8 // w9 = width - ((width - 1) / lanes) * lanes: size of the first column chunk
+ whilelo p10.h, xzr, x9 // p10 enables the first w9 lanes
+ ptrue p1.h
+ lsr x11, x2, #32 // src stride
+ lsr w2, w2, #0 // leave dst stride only
+ mov w10, #0 // w10 = 0: ZA slice index and source of the lower clamp bound
+ mov w12, #255
+ dup z9.h, w10 // z9 = 0
+ dup z10.h, w12 // z10 = 255
+1:
+ lsr x20, x3, #32
+ mov p0.b, p10.b // every row band starts with the partial-chunk predicate
+ sub w20, w20, w7 // w20 = y = height - rows left
+ mov w12, w9 // w12 = width of the current chunk
+ sub w6, w6, #6 // w6 = vb_pos - 6; incremented after each compare below
+ // offsets are packed into the format: (M<<3)|(N<<1)|K, where M is p5/p6 offset (multiply), N is p3/p4 offset, K is p1/p2 offset
+ mov w21, #0 // code with M = N = K = 0
+ mov w22, #0xB // 0xB == (1<<3)|(1<<1)|1
+ mov w23, #0x15 // 0x15 == (2<<3)|(2<<1)|1
+ mov w13, #0x1D // 0x1D == (3<<3)|(2<<1)|1
+ mov w14, #0x1D // w13..w16 hold the codes for rows y..y+3
+ mov w15, #0x1D
+ mov w16, #0x1D
+ // y == vb_pos - 6
+ cmp w20, w6
+ add w6, w6, #1
+ csel w16, w16, w23, ne // each csel keeps the current code unless the compare matched
+ // y == vb_pos - 5
+ cmp w20, w6
+ add w6, w6, #1
+ csel w15, w15, w23, ne
+ csel w16, w16, w22, ne
+ // y == vb_pos - 4
+ cmp w20, w6
+ add w6, w6, #1
+ csel w14, w14, w23, ne
+ csel w15, w15, w22, ne
+ csel w16, w16, w21, ne
+ // y == vb_pos - 3
+ cmp w20, w6
+ add w6, w6, #1
+ csel w13, w13, w23, ne
+ csel w14, w14, w22, ne
+ csel w15, w15, w21, ne
+ csel w16, w16, w21, ne
+ // y == vb_pos - 2
+ cmp w20, w6
+ add w6, w6, #1
+ csel w13, w13, w22, ne
+ csel w14, w14, w21, ne
+ csel w15, w15, w21, ne
+ csel w16, w16, w22, ne
+ // y == vb_pos - 1
+ cmp w20, w6
+ add w6, w6, #1
+ csel w13, w13, w21, ne
+ csel w14, w14, w21, ne
+ csel w15, w15, w22, ne
+ csel w16, w16, w23, ne
+ // y == vb_pos
+ cmp w20, w6
+ add w6, w6, #1
+ csel w13, w13, w21, ne
+ csel w14, w14, w22, ne
+ csel w15, w15, w23, ne
+ // y == vb_pos + 1
+ cmp w20, w6
+ add w6, w6, #1
+ csel w13, w13, w22, ne
+ csel w14, w14, w23, ne
+ // y == vb_pos + 2
+ cmp w20, w6
+ sub w6, w6, #2 // w6 is back to vb_pos (-6, eight increments, -2)
+ csel w13, w13, w23, ne
+ orr w13, w13, w14, lsl #8 // pack the four row codes into w13, one byte per row
+ orr w13, w13, w15, lsl #16
+ orr w13, w13, w16, lsl #24
+ mov x14, x1 // x14 = src cursor for this row band
+ mov x19, x0 // x19 = dst cursor for this row band
+2:
+ // Load clip [12=>3x4 memory layout]
+ ld3h {z0.h-z2.h}, p0/z, [x5]
+ // Load filter [12=>3x4 memory layout]
+ ld3h {z3.h-z5.h}, p0/z, [x4]
+ add x15, x14, x11 // x15/x16/x17 = src rows 1/2/3 of the band
+ add x16, x14, x11, lsl #1
+ add x17, x15, x11, lsl #1
+ add x30, x19, x2, lsl #1 // x30 = dst row 2 of the band
+
+ mov z12.d, z0.d // replicate the first clip vector four times, then zip
+ mov z13.d, z0.d
+ mov z14.d, z0.d
+ mov z15.d, z0.d
+ // copy filter into 4 vectors and then zip
+ mov z28.d, z3.d
+ mov z29.d, z3.d
+ zip {z12.d-z15.d}, {z12.d-z15.d}
+ mov z30.d, z3.d
+ mov z31.d, z3.d
+ neg z16.h, p1/m, z12.h // z16-z19 = -clip
+ neg z17.h, p1/m, z13.h
+ neg z18.h, p1/m, z14.h
+ neg z19.h, p1/m, z15.h
+ zip {z28.d-z31.d}, {z28.d-z31.d}
+ // p0 (curr)
+ ld1b z6.h, p0/z, [x14]
+ ld1b z7.h, p0/z, [x15]
+ ld1b z0.h, p0/z, [x16] // z0 and z3 are reused for pixels; their clip/filter data was copied above
+ ld1b z3.h, p0/z, [x17]
+ // clip & filter (first group): a0,a3,a6,a9, a12...
+ // {p5[0],p3[-1],p1[0],p0[3]} -> left operand in clip
+ // {p6[0],p4[1],p2[0],p0[-3]} -> right operand in clip
+ first_group_filter_luma_8_offsets x13, 0 // row 0: code in bits 7:0 of x13
+ first_group_filter_luma_8_sme2 x14, z6.h, 0
+ first_group_filter_luma_8_offsets x13, 8 // row 1: code in bits 15:8
+ first_group_filter_luma_8_sme2 x15, z7.h, 1
+ first_group_filter_luma_8_offsets x13, 16 // row 2: code in bits 23:16
+ first_group_filter_luma_8_sme2 x16, z0.h, 2
+ first_group_filter_luma_8_offsets x13, 24 // row 3: code in bits 31:24
+ first_group_filter_luma_8_sme2 x17, z3.h, 3
+
+ mov z12.d, z1.d // same preparation with the second clip/filter vectors
+ mov z13.d, z1.d
+ mov z14.d, z1.d
+ mov z15.d, z1.d
+ // copy filter into 4 vectors and then zip
+ mov z28.d, z4.d
+ mov z29.d, z4.d
+ zip {z12.d-z15.d}, {z12.d-z15.d}
+ mov z30.d, z4.d
+ mov z31.d, z4.d
+ // -clip
+ neg z16.h, p1/m, z12.h
+ neg z17.h, p1/m, z13.h
+ neg z18.h, p1/m, z14.h
+ neg z19.h, p1/m, z15.h
+ zip {z28.d-z31.d}, {z28.d-z31.d}
+ // clip & filter (second group): a1,a4,a7,a10,a13...
+ // left: {p3[ 1],p1[ 2],p1[-1],p0[ 2]}
+ // right: {p4[-1],p2[-2],p2[ 1],p0[-2]}
+ second_group_filter_luma_8_offsets x13, 0
+ first_group_filter_luma_8_sme2 x14, z6.h, 0 // the load/clip/accumulate macro is shared by all three groups
+ second_group_filter_luma_8_offsets x13, 8
+ first_group_filter_luma_8_sme2 x15, z7.h, 1
+ second_group_filter_luma_8_offsets x13, 16
+ first_group_filter_luma_8_sme2 x16, z0.h, 2
+ second_group_filter_luma_8_offsets x13, 24
+ first_group_filter_luma_8_sme2 x17, z3.h, 3
+
+ mov z12.d, z2.d // same preparation with the third clip/filter vectors
+ mov z13.d, z2.d
+ mov z14.d, z2.d
+ mov z15.d, z2.d
+ // copy filter into 4 vectors and then zip
+ mov z28.d, z5.d
+ mov z29.d, z5.d
+ zip {z12.d-z15.d}, {z12.d-z15.d}
+ mov z30.d, z5.d
+ mov z31.d, z5.d
+ // -clip
+ neg z16.h, p1/m, z12.h
+ neg z17.h, p1/m, z13.h
+ neg z18.h, p1/m, z14.h
+ neg z19.h, p1/m, z15.h
+ zip {z28.d-z31.d}, {z28.d-z31.d}
+ // clip & filter (third group): a2,a5,a8,a11,a14...
+ // left: {p3[0],p1[ 1],p1[-2],p0[ 1]}
+ // right: {p4[0],p2[-1],p2[ 2],p0[-1]}
+ third_group_filter_luma_8_offsets x13, 0
+ first_group_filter_luma_8_sme2 x14, z6.h, 0
+ third_group_filter_luma_8_offsets x13, 8
+ first_group_filter_luma_8_sme2 x15, z7.h, 1
+ third_group_filter_luma_8_offsets x13, 16
+ first_group_filter_luma_8_sme2 x16, z0.h, 2
+ third_group_filter_luma_8_offsets x13, 24
+ first_group_filter_luma_8_sme2 x17, z3.h, 3
+ mova {z16.d-z19.d}, za.d[w10, 0] // read back the four accumulator groups, one per row
+ mova {z20.d-z23.d}, za.d[w10, 1]
+ mova {z24.d-z27.d}, za.d[w10, 2]
+ mova {z28.d-z31.d}, za.d[w10, 3]
+ sqrshr z12.h, {z16.d-z19.d}, #7 // default: rounding shift by 7, narrowed to halfwords
+ sqrshr z13.h, {z20.d-z23.d}, #7
+ sqrshr z14.h, {z24.d-z27.d}, #7
+ sqrshr z15.h, {z28.d-z31.d}, #7
+ tbnz x13, #0, 10f // rows whose K bit is clear are redone with a shift of 10
+ sqrshr z12.h, {z16.d-z19.d}, #10
+10:
+ tbnz x13, #8, 11f
+ sqrshr z13.h, {z20.d-z23.d}, #10
+11:
+ tbnz x13, #16, 12f
+ sqrshr z14.h, {z24.d-z27.d}, #10
+12:
+ tbnz x13, #24, 13f
+ sqrshr z15.h, {z28.d-z31.d}, #10
+13:
+ add z12.h, z12.h, z6.h // add the centre pixel of each row
+ add z13.h, z13.h, z7.h
+ add z14.h, z14.h, z0.h
+ add z15.h, z15.h, z3.h
+ sclamp {z12.h-z15.h}, z9.h, z10.h // clamp to 0..255
+ st1b z12.h, p0, [x19] // store the four rows of this chunk
+ st1b z13.h, p0, [x19, x2]
+ st1b z14.h, p0, [x30]
+ st1b z15.h, p0, [x30, x2]
+ zero {za} // clear the accumulators for the next chunk
+ add x14, x14, x12 // advance src and dst cursors by the chunk width
+ add x19, x19, x12
+ ptrue p0.h // chunks after the first one use all lanes
+ subs w8, w8, w12 // columns left; these flags feed the b.gt below
+ add w12, w12, w12, lsl #1 // w12 = 3 * chunk width
+ add x4, x4, x12, lsl #1 // advance filter and clip by 3 halfwords per column
+ add x5, x5, x12, lsl #1
+ cnth x12 // following chunks are one full vector wide
+ b.gt 2b
+ mov w8, w3 // reload the width for the next row band
+ subs w7, w7, #4 // four rows done
+ add x1, x1, x11, lsl #2 // advance src and dst by four rows
+ add x0, x0, x2, lsl #2
+ b.gt 1b
+
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp x25, x26, [sp, #64]
+ ldp x27, x28, [sp, #80]
+ ldp x29, x30, [sp], #96
+ sme_exit
+ ret
+endfunc
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2026-01-04 16:34 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-04 16:33 [FFmpeg-devel] [PR] feat/sme2_alf_filter_luma_8bit (PR #21373) george.zaguri via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git