* [FFmpeg-devel] [PATCH] aarch64/vvc: SME optimisations of put_luma_h(64x64,128x128) functions for 8-bit (PR #21194)
@ 2025-12-14 16:00 george.zaguri via ffmpeg-devel
0 siblings, 0 replies; only message in thread
From: george.zaguri via ffmpeg-devel @ 2025-12-14 16:00 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: george.zaguri
PR #21194 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21194
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21194.patch
Apple M4:
put_luma_h_8_64x64_c: 644.5 ( 1.00x)
put_luma_h_8_64x64_neon: 520.3 ( 1.24x)
put_luma_h_8_64x64_i8mm: 440.9 ( 1.46x)
put_luma_h_8_64x64_sme: 405.7 ( 1.59x)
put_luma_h_8_128x128_c: 2340.3 ( 1.00x)
put_luma_h_8_128x128_neon: 2078.7 ( 1.13x)
put_luma_h_8_128x128_i8mm: 1711.9 ( 1.37x)
put_luma_h_8_128x128_sme: 1604.5 ( 1.46x)
>From 151199038279cbe8b7100ce2c41a73791f71bd45 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <george.zaguri@gmail.com>
Date: Sun, 14 Dec 2025 15:58:39 +0000
Subject: [PATCH] aarch64/vvc: SME optimisations of put_luma_h(64x64,128x128)
functions for 8-bit
Apple M4:
put_luma_h_8_64x64_c: 644.5 ( 1.00x)
put_luma_h_8_64x64_neon: 520.3 ( 1.24x)
put_luma_h_8_64x64_i8mm: 440.9 ( 1.46x)
put_luma_h_8_64x64_sme: 405.7 ( 1.59x)
put_luma_h_8_128x128_c: 2340.3 ( 1.00x)
put_luma_h_8_128x128_neon: 2078.7 ( 1.13x)
put_luma_h_8_128x128_i8mm: 1711.9 ( 1.37x)
put_luma_h_8_128x128_sme: 1604.5 ( 1.46x)
---
libavcodec/aarch64/vvc/Makefile | 1 +
libavcodec/aarch64/vvc/dsp_init.c | 6 ++
libavcodec/aarch64/vvc/inter_sme.S | 132 +++++++++++++++++++++++++++++
3 files changed, 139 insertions(+)
create mode 100644 libavcodec/aarch64/vvc/inter_sme.S
diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile
index ed80338969..56282478a7 100644
--- a/libavcodec/aarch64/vvc/Makefile
+++ b/libavcodec/aarch64/vvc/Makefile
@@ -8,3 +8,4 @@ NEON-OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/alf.o \
aarch64/h26x/epel_neon.o \
aarch64/h26x/qpel_neon.o \
aarch64/h26x/sao_neon.o
+SME-OBJS-$(CONFIG_VVC_DECODER) += aarch64/vvc/inter_sme.o
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index aa75d22b78..d86e431215 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -42,6 +42,8 @@ void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdif
const int height, const int8_t *hf, const int8_t *vf, const int width);
void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h_8_sme(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+ const int height, const int8_t *hf, const int8_t *vf, const int width);
void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
@@ -251,6 +253,10 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
}
+ if (have_sme(cpu_flags)) {
+ c->inter.put[0][5][0][1] =
+ c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_8_sme;
+ }
} else if (bd == 10) {
c->inter.avg = ff_vvc_avg_10_neon;
c->inter.w_avg = vvc_w_avg_10;
diff --git a/libavcodec/aarch64/vvc/inter_sme.S b/libavcodec/aarch64/vvc/inter_sme.S
new file mode 100644
index 0000000000..d3592518cb
--- /dev/null
+++ b/libavcodec/aarch64/vvc/inter_sme.S
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2025 Georgii Zagoruiko <georgii.zaguri@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define VVC_MAX_PB_SIZE 128
+
+#if HAVE_SME
+ENABLE_SME
+
+// void ff_vvc_put_luma_h_8_sme(int16_t *dst, const uint8_t *_src,
+//                              ptrdiff_t _src_stride, int height,
+//                              const int8_t *hf, const int8_t *vf, int width)
+//
+// Horizontal 8-tap luma filter for 8-bit samples built on SME outer products.
+// Rows of dst are VVC_MAX_PB_SIZE int16_t apart; two rows are written per
+// inner iteration (NOTE: assumes height is even - confirm against callers).
+//
+// In:    x0 = dst, x1 = _src, x2 = _src_stride, w3 = height,
+//        x4 = hf (8 taps), x5 = vf (not read), w6 = width
+// Clobb: x0, x1, x3, x6, x8-x16, every Z/P register and ZA, flags
+function ff_vvc_put_luma_h_8_sme, export=1
+        // smstart/smstop reset all SIMD&FP/SVE registers, so the AAPCS64
+        // callee-saved d8-d15 must be spilled around the streaming section.
+        stp             d8,  d9,  [sp, #-64]!
+        stp             d10, d11, [sp, #16]
+        stp             d12, d13, [sp, #32]
+        stp             d14, d15, [sp, #48]
+        // int arguments leave the upper register half unspecified; clear it
+        // before height/width are consumed as 64-bit values (msub, whilelo).
+        mov             w3,  w3
+        mov             w6,  w6
+        smstart
+        cntb            x8                              // x8 = streaming vector length in bytes
+        mov             x9,  #(VVC_MAX_PB_SIZE * 2)     // dst row pitch in bytes
+        mov             w13, #0                         // ZA slice index used by mova
+        mov             w14, #1                         // byte offsets for the shifted loads
+        mov             w15, #2
+        mov             w16, #3
+        ptrue           p0.b, VL8
+        ptrue           p1.s
+        ld1b            z30.b, p0/z, [x4]               // z30 = hf[0..7], all other bytes zero
+        eor             z0.b, z0.b, z0.b
+        mov             z31.d, z30.d
+        sub             x1,  x1,  #3                    // first tap reads _src[-3]
+        ext             z31.b, z31.b, z0.b, #4          // z31 = hf[4..7], all other bytes zero
+1:      // one strip of up to cntb columns
+        cmp             w6,  w8
+        csel            w11, w6,  w8,  ls               // w11 = min(columns left, x8)
+        whilelo         p0.b, xzr, x6
+        mov             w10, w3                         // w10 = rows left in this strip
+        asr             w12, w11, #1
+        whilelo         p2.h, xzr, x12                  // st2h stores 2 * (w11 / 2) halfwords
+2:      // two rows of the current strip
+        ld1b            z0.b, p0/z, [x1]
+        ld1b            z1.b, p0/z, [x1, x14]
+        ld1b            z2.b, p0/z, [x1, x15]
+        ld1b            z3.b, p0/z, [x1, x16]
+        add             x1,  x1,  #4
+        ld1b            z4.b, p0/z, [x1]
+        ld1b            z5.b, p0/z, [x1, x14]
+        ld1b            z6.b, p0/z, [x1, x15]
+        ld1b            z7.b, p0/z, [x1, x16]
+        sub             x1,  x1,  #4
+        // Column 0 of tile zaN accumulates the 8-tap sums of pixels 4*i + N:
+        // z30 contributes taps 0-3, z31 taps 4-7 on the source shifted by 4.
+        usmopa          za0.s, p0/m, p0/m, z0.b, z30.b
+        usmopa          za1.s, p0/m, p0/m, z1.b, z30.b
+        usmopa          za2.s, p0/m, p0/m, z2.b, z30.b
+        usmopa          za3.s, p0/m, p0/m, z3.b, z30.b
+        usmopa          za0.s, p0/m, p0/m, z4.b, z31.b
+        usmopa          za1.s, p0/m, p0/m, z5.b, z31.b
+        usmopa          za2.s, p0/m, p0/m, z6.b, z31.b
+        usmopa          za3.s, p0/m, p0/m, z7.b, z31.b
+        mova            z22.s, p1/m, za0v.s[w13, 0]
+        mova            z24.s, p1/m, za1v.s[w13, 0]
+        mova            z26.s, p1/m, za2v.s[w13, 0]
+        mova            z28.s, p1/m, za3v.s[w13, 0]
+        add             x1,  x1,  x2                    // next source row
+        zero            {za}
+        ld1b            z0.b, p0/z, [x1]
+        ld1b            z1.b, p0/z, [x1, x14]
+        ld1b            z2.b, p0/z, [x1, x15]
+        ld1b            z3.b, p0/z, [x1, x16]
+        add             x1,  x1,  #4
+        ld1b            z4.b, p0/z, [x1]
+        ld1b            z5.b, p0/z, [x1, x14]
+        ld1b            z6.b, p0/z, [x1, x15]
+        ld1b            z7.b, p0/z, [x1, x16]
+        sub             x1,  x1,  #4
+        sqxtnb          z21.h, z22.s                    // z21 = pixels 4i (even lanes) ...
+        sqxtnb          z22.h, z24.s                    // z22 = pixels 4i+1 (even lanes) ...
+        sqxtnt          z21.h, z26.s                    // ... and 4i+2 (odd lanes)
+        sqxtnt          z22.h, z28.s                    // ... and 4i+3 (odd lanes)
+        st2h            {z21.h-z22.h}, p2, [x0]         // interleaving restores pixel order
+        add             x1,  x1,  x2
+        add             x0,  x0,  x9
+
+        usmopa          za0.s, p0/m, p0/m, z0.b, z30.b
+        usmopa          za1.s, p0/m, p0/m, z1.b, z30.b
+        usmopa          za2.s, p0/m, p0/m, z2.b, z30.b
+        usmopa          za3.s, p0/m, p0/m, z3.b, z30.b
+        usmopa          za0.s, p0/m, p0/m, z4.b, z31.b
+        usmopa          za1.s, p0/m, p0/m, z5.b, z31.b
+        usmopa          za2.s, p0/m, p0/m, z6.b, z31.b
+        usmopa          za3.s, p0/m, p0/m, z7.b, z31.b
+        mova            z22.s, p1/m, za0v.s[w13, 0]
+        mova            z24.s, p1/m, za1v.s[w13, 0]
+        mova            z26.s, p1/m, za2v.s[w13, 0]
+        mova            z28.s, p1/m, za3v.s[w13, 0]
+        sqxtnb          z21.h, z22.s
+        sqxtnb          z22.h, z24.s
+        sqxtnt          z21.h, z26.s
+        sqxtnt          z22.h, z28.s
+        zero            {za}
+        st2h            {z21.h-z22.h}, p2, [x0]
+        subs            w10, w10, #2
+        add             x0,  x0,  x9
+        b.gt            2b
+        msub            x0,  x3,  x9,  x0               // rewind dst by height rows
+        msub            x1,  x3,  x2,  x1               // rewind src by height rows
+        add             x0,  x0,  x11, lsl #1           // step dst right by w11 int16_t
+        subs            w6,  w6,  w11
+        add             x1,  x1,  x11                   // step src right by w11 bytes
+        b.gt            1b
+        smstop
+        ldp             d10, d11, [sp, #16]
+        ldp             d12, d13, [sp, #32]
+        ldp             d14, d15, [sp, #48]
+        ldp             d8,  d9,  [sp], #64
+        ret
+endfunc
+
+DISABLE_SME
+#endif
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-12-14 16:00 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-14 16:00 [FFmpeg-devel] [PATCH] aarch64/vvc: SME optimisations of put_luma_h(64x64,128x128) functions for 8-bit (PR #21194) george.zaguri via ffmpeg-devel
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git