* [FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi
@ 2022-10-11 7:32 J. Dekker
2022-10-24 12:01 ` Martin Storsjö
0 siblings, 1 reply; 3+ messages in thread
From: J. Dekker @ 2022-10-11 7:32 UTC (permalink / raw)
To: ffmpeg-devel
checkasm benchmark on Ampere Altra (Neoverse N1):
put_hevc_qpel_bi_h4_8_c: 170.7
put_hevc_qpel_bi_h4_8_neon: 64.5
put_hevc_qpel_bi_h6_8_c: 373.7
put_hevc_qpel_bi_h6_8_neon: 130.2
put_hevc_qpel_bi_h8_8_c: 662.0
put_hevc_qpel_bi_h8_8_neon: 138.5
put_hevc_qpel_bi_h12_8_c: 1529.5
put_hevc_qpel_bi_h12_8_neon: 422.0
put_hevc_qpel_bi_h16_8_c: 2735.5
put_hevc_qpel_bi_h16_8_neon: 560.5
put_hevc_qpel_bi_h24_8_c: 6015.7
put_hevc_qpel_bi_h24_8_neon: 1636.0
put_hevc_qpel_bi_h32_8_c: 10779.0
put_hevc_qpel_bi_h32_8_neon: 2204.5
put_hevc_qpel_bi_h48_8_c: 24375.0
put_hevc_qpel_bi_h48_8_neon: 4984.0
put_hevc_qpel_bi_h64_8_c: 42768.0
put_hevc_qpel_bi_h64_8_neon: 8795.7
put_hevc_qpel_h4_8_c: 149.0
put_hevc_qpel_h4_8_neon: 55.7
put_hevc_qpel_h6_8_c: 321.2
put_hevc_qpel_h6_8_neon: 106.0
put_hevc_qpel_h8_8_c: 578.7
put_hevc_qpel_h8_8_neon: 133.2
put_hevc_qpel_h12_8_c: 1279.0
put_hevc_qpel_h12_8_neon: 391.7
put_hevc_qpel_h16_8_c: 2286.2
put_hevc_qpel_h16_8_neon: 519.7
put_hevc_qpel_h24_8_c: 5100.7
put_hevc_qpel_h24_8_neon: 1546.2
put_hevc_qpel_h32_8_c: 9022.0
put_hevc_qpel_h32_8_neon: 2060.2
put_hevc_qpel_h48_8_c: 20293.5
put_hevc_qpel_h48_8_neon: 4656.7
put_hevc_qpel_h64_8_c: 36037.0
put_hevc_qpel_h64_8_neon: 8262.7
put_hevc_qpel_uni_h4_8_c: 162.2
put_hevc_qpel_uni_h4_8_neon: 61.7
put_hevc_qpel_uni_h6_8_c: 355.2
put_hevc_qpel_uni_h6_8_neon: 114.2
put_hevc_qpel_uni_h8_8_c: 651.0
put_hevc_qpel_uni_h8_8_neon: 135.7
put_hevc_qpel_uni_h12_8_c: 1412.5
put_hevc_qpel_uni_h12_8_neon: 402.7
put_hevc_qpel_uni_h16_8_c: 2551.0
put_hevc_qpel_uni_h16_8_neon: 533.5
put_hevc_qpel_uni_h24_8_c: 5782.2
put_hevc_qpel_uni_h24_8_neon: 1578.7
put_hevc_qpel_uni_h32_8_c: 10586.5
put_hevc_qpel_uni_h32_8_neon: 2102.2
put_hevc_qpel_uni_h48_8_c: 23812.0
put_hevc_qpel_uni_h48_8_neon: 4739.5
put_hevc_qpel_uni_h64_8_c: 42958.7
put_hevc_qpel_uni_h64_8_neon: 8366.5
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
Summary of changes since last iteration:
- Interleaved stores
- Changed tiling to loop more naturally
- Increased code reuse (.text reduction by ~60%)
- Simplified function variations through .req
libavcodec/aarch64/Makefile | 1 +
libavcodec/aarch64/hevcdsp_init_aarch64.c | 67 +++
libavcodec/aarch64/hevcdsp_qpel_neon.S | 484 ++++++++++++++++++++++
3 files changed, 552 insertions(+)
create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 9ce21566c6..02fb51c3ab 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -67,4 +67,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9mc_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
+ aarch64/hevcdsp_qpel_neon.o \
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 644cc17715..44399b05d8 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -69,6 +69,46 @@ void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrd
const int16_t *sao_offset_val, int eo, int width, int height);
void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
const int16_t *sao_offset_val, int eo, int width, int height);
+void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+ int width);
+void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+ int width);
+void ff_hevc_put_hevc_qpel_uni_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
+ int width);
+void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
+ my, int width);
+void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
+ my, int width);
+void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
+ ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
+ mx, intptr_t my, int width);
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -95,6 +135,33 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->sao_edge_filter[2] =
c->sao_edge_filter[3] =
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon;
+ c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_neon;
+ c->put_hevc_qpel[2][0][1] = ff_hevc_put_hevc_qpel_h6_8_neon;
+ c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_neon;
+ c->put_hevc_qpel[4][0][1] =
+ c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h12_8_neon;
+ c->put_hevc_qpel[5][0][1] =
+ c->put_hevc_qpel[7][0][1] =
+ c->put_hevc_qpel[8][0][1] =
+ c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h16_8_neon;
+ c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
+ c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
+ c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
+ c->put_hevc_qpel_uni[4][0][1] =
+ c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
+ c->put_hevc_qpel_uni[5][0][1] =
+ c->put_hevc_qpel_uni[7][0][1] =
+ c->put_hevc_qpel_uni[8][0][1] =
+ c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
+ c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
+ c->put_hevc_qpel_bi[2][0][1] = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
+ c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
+ c->put_hevc_qpel_bi[4][0][1] =
+ c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
+ c->put_hevc_qpel_bi[5][0][1] =
+ c->put_hevc_qpel_bi[7][0][1] =
+ c->put_hevc_qpel_bi[8][0][1] =
+ c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
}
if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..7974b8529e
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,484 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+const qpel_filters, align=4
+ .byte 0, 0, 0, 0, 0, 0, 0, 0
+ .byte -1, 4,-10, 58, 17, -5, 1, 0
+ .byte -1, 4,-11, 40, 40,-11, 4, -1
+ .byte 0, 1, -5, 17, 58,-10, 4, -1
+endconst
+
+.macro load_filter m
+ movrel x15, qpel_filters
+ add x15, x15, \m, lsl #3
+ ld1 {v0.8b}, [x15]
+ sxtl v0.8h, v0.8b
+.endm
+
+.macro put_hevc type
+.ifc \type, qpel
+ // void put_hevc_qpel_h(int16_t *dst,
+ // uint8_t *_src, ptrdiff_t _srcstride,
+ // int height, intptr_t mx, intptr_t my, int width)
+ dst .req x0
+ dststride .req x7
+ src .req x1
+ srcstride .req x2
+ height .req x3
+ heightw .req w3
+ mx .req x4
+ width .req w6
+.endif
+.ifc \type, qpel_uni
+ // void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
+ // uint8_t *_src, ptrdiff_t _srcstride,
+ // int height, intptr_t mx, intptr_t my, int width)
+ dst .req x0
+ dststride .req x1
+ src .req x2
+ srcstride .req x3
+ height .req x4
+ heightw .req w4
+ mx .req x5
+ width .req w7
+.endif
+.ifc \type, qpel_bi
+ // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
+ // uint8_t *_src, ptrdiff_t _srcstride,
+ // int16_t *src2, int height, intptr_t mx,
+ // intptr_t my, int width)
+ dst .req x0
+ dststride .req x1
+ src .req x2
+ srcstride .req x3
+ height .req x5
+ heightw .req w5
+ mx .req x6
+ width .req w8
+.endif
+
+.ifc \type, qpel
+function ff_hevc_put_hevc_h4_8_neon, export=0
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+
+ mul v23.4h, v16.4h, v0.h[0]
+ mul v24.4h, v18.4h, v0.h[0]
+
+.irpc i, 1234567
+ ext v20.16b, v16.16b, v17.16b, #(2*\i)
+ ext v21.16b, v18.16b, v19.16b, #(2*\i)
+ mla v23.4h, v20.4h, v0.h[\i]
+ mla v24.4h, v21.4h, v0.h[\i]
+.endr
+ ret
+endfunc
+.endif
+
+function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
+ load_filter mx
+.ifc \type, qpel_bi
+ mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
+ add x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+ sub src, src, #3
+ mov mx, lr
+.ifc \type, qpel
+ mov dststride, #(MAX_PB_SIZE << 1)
+ lsl x13, srcstride, #1 // srcstridel
+ mov x14, #(MAX_PB_SIZE << 2)
+.else
+ lsl x14, dststride, #1 // dststridel
+ lsl x13, srcstride, #1 // srcstridel
+.endif
+ add x10, dst, dststride // dstb
+ add x12, src, srcstride // srcb
+0: ld1 {v16.8b, v17.8b}, [src], x13
+ ld1 {v18.8b, v19.8b}, [x12], x13
+.ifc \type, qpel_bi
+ ld1 {v25.8h}, [ x4], x16
+ ld1 {v26.8h}, [x15], x16
+.endif
+
+ bl ff_hevc_put_hevc_h4_8_neon
+ subs heightw, heightw, #2
+
+.ifc \type, qpel
+ st1 {v23.4h}, [dst], x14
+ st1 {v24.4h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+ sqadd v23.4h, v23.4h, v25.4h
+ sqadd v24.4h, v24.4h, v26.4h
+ sqrshrun v23.8b, v23.8h, #7
+ sqrshrun v24.8b, v24.8h, #7
+.else
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun v24.8b, v24.8h, #6
+.endif
+ st1 {v23.s}[0], [dst], x14
+ st1 {v24.s}[0], [x10], x14
+.endif
+ b.gt 0b // double line
+ ret mx
+endfunc
+
+.ifc \type, qpel
+function ff_hevc_put_hevc_h8_8_neon, export=0
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+
+ mul v23.8h, v16.8h, v0.h[0]
+ mul v24.8h, v18.8h, v0.h[0]
+
+.irpc i, 1234567
+ ext v20.16b, v16.16b, v17.16b, #(2*\i)
+ ext v21.16b, v18.16b, v19.16b, #(2*\i)
+ mla v23.8h, v20.8h, v0.h[\i]
+ mla v24.8h, v21.8h, v0.h[\i]
+.endr
+ ret
+endfunc
+.endif
+
+function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
+ load_filter mx
+.ifc \type, qpel_bi
+ mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
+ add x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+ sub src, src, #3
+ mov mx, lr
+.ifc \type, qpel
+ mov dststride, #(MAX_PB_SIZE << 1)
+ lsl x13, srcstride, #1 // srcstridel
+ mov x14, #((MAX_PB_SIZE << 2) - 8)
+.else
+ lsl x14, dststride, #1 // dststridel
+ lsl x13, srcstride, #1 // srcstridel
+ sub x14, x14, #4
+.endif
+ add x10, dst, dststride // dstb
+ add x12, src, srcstride // srcb
+0: ld1 {v16.8b, v17.8b}, [src], x13
+ ld1 {v18.8b, v19.8b}, [x12], x13
+.ifc \type, qpel_bi
+ ld1 {v25.8h}, [ x4], x16
+ ld1 {v26.8h}, [x15], x16
+.endif
+
+ bl ff_hevc_put_hevc_h8_8_neon
+ subs heightw, heightw, #2
+
+.ifc \type, qpel
+ st1 {v23.4h}, [dst], #8
+ st1 {v24.4h}, [x10], #8
+ st1 {v23.s}[2], [dst], x14
+ st1 {v24.s}[2], [x10], x14
+.else
+.ifc \type, qpel_bi
+ sqadd v23.8h, v23.8h, v25.8h
+ sqadd v24.8h, v24.8h, v26.8h
+ sqrshrun v23.8b, v23.8h, #7
+ sqrshrun v24.8b, v24.8h, #7
+.else
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun v24.8b, v24.8h, #6
+.endif
+ st1 {v23.s}[0], [dst], #4
+ st1 {v24.s}[0], [x10], #4
+ st1 {v23.h}[2], [dst], x14
+ st1 {v24.h}[2], [x10], x14
+.endif
+ b.gt 0b // double line
+ ret mx
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
+ load_filter mx
+.ifc \type, qpel_bi
+ mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
+ add x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+ sub src, src, #3
+ mov mx, lr
+.ifc \type, qpel
+ mov dststride, #(MAX_PB_SIZE << 1)
+ lsl x13, srcstride, #1 // srcstridel
+ mov x14, #(MAX_PB_SIZE << 2)
+.else
+ lsl x14, dststride, #1 // dststridel
+ lsl x13, srcstride, #1 // srcstridel
+.endif
+ add x10, dst, dststride // dstb
+ add x12, src, srcstride // srcb
+0: ld1 {v16.8b, v17.8b}, [src], x13
+ ld1 {v18.8b, v19.8b}, [x12], x13
+.ifc \type, qpel_bi
+ ld1 {v25.8h}, [ x4], x16
+ ld1 {v26.8h}, [x15], x16
+.endif
+
+ bl ff_hevc_put_hevc_h8_8_neon
+ subs heightw, heightw, #2
+
+.ifc \type, qpel
+ st1 {v23.8h}, [dst], x14
+ st1 {v24.8h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+ sqadd v23.8h, v23.8h, v25.8h
+ sqadd v24.8h, v24.8h, v26.8h
+ sqrshrun v23.8b, v23.8h, #7
+ sqrshrun v24.8b, v24.8h, #7
+.else
+ sqrshrun v23.8b, v23.8h, #6
+ sqrshrun v24.8b, v24.8h, #6
+.endif
+ st1 {v23.8b}, [dst], x14
+ st1 {v24.8b}, [x10], x14
+.endif
+ b.gt 0b // double line
+ ret mx
+endfunc
+
+.ifc \type, qpel
+function ff_hevc_put_hevc_h16_8_neon, export=0
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+
+ uxtl v19.8h, v19.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+
+ mul v26.8h, v16.8h, v0.h[0]
+ mul v27.8h, v17.8h, v0.h[0]
+ mul v28.8h, v19.8h, v0.h[0]
+ mul v29.8h, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v22.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v17.16b, v18.16b, #(2*\i)
+
+ ext v24.16b, v19.16b, v20.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+
+ mla v26.8h, v22.8h, v0.h[\i]
+ mla v27.8h, v23.8h, v0.h[\i]
+
+ mla v28.8h, v24.8h, v0.h[\i]
+ mla v29.8h, v25.8h, v0.h[\i]
+.endr
+ subs x9, x9, #2
+ ret
+endfunc
+.endif
+
+function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
+ load_filter mx
+ sxtw height, heightw
+.ifc \type, qpel_bi
+ ldrh w8, [sp] // width
+ mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
+ lsl x17, height, #7 // src2b reset (height * (MAX_PB_SIZE << 1))
+ add x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+ sub src, src, #3
+ mov mx, lr
+.ifc \type, qpel
+ mov dststride, #(MAX_PB_SIZE << 1)
+ lsl x13, srcstride, #1 // srcstridel
+ mov x14, #((MAX_PB_SIZE << 2) - 16)
+.else
+ lsl x14, dststride, #1 // dststridel
+ lsl x13, srcstride, #1 // srcstridel
+ sub x14, x14, #8
+.endif
+ add x10, dst, dststride // dstb
+ add x12, src, srcstride // srcb
+0: mov x9, height
+1: ld1 {v16.8b-v18.8b}, [src], x13
+ ld1 {v19.8b-v21.8b}, [x12], x13
+
+ bl ff_hevc_put_hevc_h16_8_neon
+
+.ifc \type, qpel
+ st1 {v26.8h}, [dst], #16
+ st1 {v28.8h}, [x10], #16
+ st1 {v27.4h}, [dst], x14
+ st1 {v29.4h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+ ld1 {v16.8h, v17.8h}, [ x4], x16
+ ld1 {v18.8h, v19.8h}, [x15], x16
+ sqadd v26.8h, v26.8h, v16.8h
+ sqadd v27.8h, v27.8h, v17.8h
+ sqadd v28.8h, v28.8h, v18.8h
+ sqadd v29.8h, v29.8h, v19.8h
+ sqrshrun v26.8b, v26.8h, #7
+ sqrshrun v27.8b, v27.8h, #7
+ sqrshrun v28.8b, v28.8h, #7
+ sqrshrun v29.8b, v29.8h, #7
+.else
+ sqrshrun v26.8b, v26.8h, #6
+ sqrshrun v27.8b, v27.8h, #6
+ sqrshrun v28.8b, v28.8h, #6
+ sqrshrun v29.8b, v29.8h, #6
+.endif
+ st1 {v26.8b}, [dst], #8
+ st1 {v28.8b}, [x10], #8
+ st1 {v27.s}[0], [dst], x14
+ st1 {v29.s}[0], [x10], x14
+.endif
+ b.gt 1b // double line
+ subs width, width, #12
+ // reset src
+ msub src, srcstride, height, src
+ msub x12, srcstride, height, x12
+ // reset dst
+ msub dst, dststride, height, dst
+ msub x10, dststride, height, x10
+.ifc \type, qpel_bi
+ // reset xsrc
+ sub x4, x4, x17
+ sub x15, x15, x17
+ add x4, x4, #24
+ add x15, x15, #24
+.endif
+ add src, src, #12
+ add x12, x12, #12
+.ifc \type, qpel
+ add dst, dst, #24
+ add x10, x10, #24
+.else
+ add dst, dst, #12
+ add x10, x10, #12
+.endif
+ b.gt 0b
+ ret mx
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
+ load_filter mx
+ sxtw height, heightw
+ mov mx, lr
+.ifc \type, qpel_bi
+ ldrh w8, [sp] // width
+ mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
+ lsl x17, x5, #7 // src2b reset
+ add x15, x4, #(MAX_PB_SIZE << 1) // src2b
+.endif
+ sub src, src, #3
+ mov mx, lr
+.ifc \type, qpel
+ mov dststride, #(MAX_PB_SIZE << 1)
+ lsl x13, srcstride, #1 // srcstridel
+ mov x14, #((MAX_PB_SIZE << 2) - 16)
+.else
+ lsl x14, dststride, #1 // dststridel
+ lsl x13, srcstride, #1 // srcstridel
+ sub x14, x14, #8
+.endif
+ add x10, dst, dststride // dstb
+ add x12, src, srcstride // srcb
+0: mov x9, height
+1: ld1 {v16.8b-v18.8b}, [src], x13
+ ld1 {v19.8b-v21.8b}, [x12], x13
+
+ bl ff_hevc_put_hevc_h16_8_neon
+
+.ifc \type, qpel
+ st1 {v26.8h}, [dst], #16
+ st1 {v28.8h}, [x10], #16
+ st1 {v27.8h}, [dst], x14
+ st1 {v29.8h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+ ld1 {v16.8h, v17.8h}, [ x4], x16
+ ld1 {v18.8h, v19.8h}, [x15], x16
+ sqadd v26.8h, v26.8h, v16.8h
+ sqadd v27.8h, v27.8h, v17.8h
+ sqadd v28.8h, v28.8h, v18.8h
+ sqadd v29.8h, v29.8h, v19.8h
+ sqrshrun v26.8b, v26.8h, #7
+ sqrshrun v27.8b, v27.8h, #7
+ sqrshrun v28.8b, v28.8h, #7
+ sqrshrun v29.8b, v29.8h, #7
+.else
+ sqrshrun v26.8b, v26.8h, #6
+ sqrshrun v27.8b, v27.8h, #6
+ sqrshrun v28.8b, v28.8h, #6
+ sqrshrun v29.8b, v29.8h, #6
+.endif
+ st1 {v26.8b}, [dst], #8
+ st1 {v28.8b}, [x10], #8
+ st1 {v27.8b}, [dst], x14
+ st1 {v29.8b}, [x10], x14
+.endif
+ b.gt 1b // double line
+ subs width, width, #16
+ // reset src
+ msub src, srcstride, height, src
+ msub x12, srcstride, height, x12
+ // reset dst
+ msub dst, dststride, height, dst
+ msub x10, dststride, height, x10
+.ifc \type, qpel_bi
+ // reset xsrc
+ sub x4, x4, x17
+ sub x15, x15, x17
+ add x4, x4, #32
+ add x15, x15, #32
+.endif
+ add src, src, #16
+ add x12, x12, #16
+.ifc \type, qpel
+ add dst, dst, #32
+ add x10, x10, #32
+.else
+ add dst, dst, #16
+ add x10, x10, #16
+.endif
+ b.gt 0b
+ ret mx
+endfunc
+
+.unreq height
+.unreq heightw
+.unreq width
+.unreq src
+.unreq dst
+.unreq srcstride
+.unreq dststride
+.unreq mx
+.endm
+
+put_hevc qpel
+put_hevc qpel_uni
+put_hevc qpel_bi
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi
2022-10-11 7:32 [FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi J. Dekker
@ 2022-10-24 12:01 ` Martin Storsjö
2022-10-25 13:12 ` J. Dekker
0 siblings, 1 reply; 3+ messages in thread
From: Martin Storsjö @ 2022-10-24 12:01 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Tue, 11 Oct 2022, J. Dekker wrote:
> checkasm benchmark on Ampere Altra (Neoverse N1):
>
> put_hevc_qpel_bi_h4_8_c: 170.7
> put_hevc_qpel_bi_h4_8_neon: 64.5
> put_hevc_qpel_bi_h6_8_c: 373.7
> put_hevc_qpel_bi_h6_8_neon: 130.2
> put_hevc_qpel_bi_h8_8_c: 662.0
> put_hevc_qpel_bi_h8_8_neon: 138.5
> put_hevc_qpel_bi_h12_8_c: 1529.5
> put_hevc_qpel_bi_h12_8_neon: 422.0
> put_hevc_qpel_bi_h16_8_c: 2735.5
> put_hevc_qpel_bi_h16_8_neon: 560.5
> put_hevc_qpel_bi_h24_8_c: 6015.7
> put_hevc_qpel_bi_h24_8_neon: 1636.0
> put_hevc_qpel_bi_h32_8_c: 10779.0
> put_hevc_qpel_bi_h32_8_neon: 2204.5
> put_hevc_qpel_bi_h48_8_c: 24375.0
> put_hevc_qpel_bi_h48_8_neon: 4984.0
> put_hevc_qpel_bi_h64_8_c: 42768.0
> put_hevc_qpel_bi_h64_8_neon: 8795.7
> put_hevc_qpel_h4_8_c: 149.0
> put_hevc_qpel_h4_8_neon: 55.7
> put_hevc_qpel_h6_8_c: 321.2
> put_hevc_qpel_h6_8_neon: 106.0
> put_hevc_qpel_h8_8_c: 578.7
> put_hevc_qpel_h8_8_neon: 133.2
> put_hevc_qpel_h12_8_c: 1279.0
> put_hevc_qpel_h12_8_neon: 391.7
> put_hevc_qpel_h16_8_c: 2286.2
> put_hevc_qpel_h16_8_neon: 519.7
> put_hevc_qpel_h24_8_c: 5100.7
> put_hevc_qpel_h24_8_neon: 1546.2
> put_hevc_qpel_h32_8_c: 9022.0
> put_hevc_qpel_h32_8_neon: 2060.2
> put_hevc_qpel_h48_8_c: 20293.5
> put_hevc_qpel_h48_8_neon: 4656.7
> put_hevc_qpel_h64_8_c: 36037.0
> put_hevc_qpel_h64_8_neon: 8262.7
> put_hevc_qpel_uni_h4_8_c: 162.2
> put_hevc_qpel_uni_h4_8_neon: 61.7
> put_hevc_qpel_uni_h6_8_c: 355.2
> put_hevc_qpel_uni_h6_8_neon: 114.2
> put_hevc_qpel_uni_h8_8_c: 651.0
> put_hevc_qpel_uni_h8_8_neon: 135.7
> put_hevc_qpel_uni_h12_8_c: 1412.5
> put_hevc_qpel_uni_h12_8_neon: 402.7
> put_hevc_qpel_uni_h16_8_c: 2551.0
> put_hevc_qpel_uni_h16_8_neon: 533.5
> put_hevc_qpel_uni_h24_8_c: 5782.2
> put_hevc_qpel_uni_h24_8_neon: 1578.7
> put_hevc_qpel_uni_h32_8_c: 10586.5
> put_hevc_qpel_uni_h32_8_neon: 2102.2
> put_hevc_qpel_uni_h48_8_c: 23812.0
> put_hevc_qpel_uni_h48_8_neon: 4739.5
> put_hevc_qpel_uni_h64_8_c: 42958.7
> put_hevc_qpel_uni_h64_8_neon: 8366.5
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> Summary of changes since last iteration:
> - Interleaved stores
> - Changed tiling to loop more naturally
> - Increased code reuse (.text reduction by ~60%)
> - Simplified function variations through .req
>
> libavcodec/aarch64/Makefile | 1 +
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 67 +++
> libavcodec/aarch64/hevcdsp_qpel_neon.S | 484 ++++++++++++++++++++++
> 3 files changed, 552 insertions(+)
> create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 9ce21566c6..02fb51c3ab 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -67,4 +67,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
> aarch64/vp9mc_neon.o
> NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_idct_neon.o \
> aarch64/hevcdsp_init_aarch64.o \
> + aarch64/hevcdsp_qpel_neon.o \
> aarch64/hevcdsp_sao_neon.o
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 644cc17715..44399b05d8 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -69,6 +69,46 @@ void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrd
> const int16_t *sao_offset_val, int eo, int width, int height);
> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
> const int16_t *sao_offset_val, int eo, int width, int height);
> +void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> + intptr_t mx, intptr_t my, int width);
The function pointers in the dsp context has gotten 'const' on the source
pointers now, which makes it emit a lot of warnings with GCC, and fail
with latest Clang. Please rebase and check that it builds without
warnings.
> +void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> + intptr_t mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> + intptr_t mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> + intptr_t mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
> + intptr_t mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
> + int width);
> +void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
> + int width);
> +void ff_hevc_put_hevc_qpel_uni_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
> + int width);
> +void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
> + my, int width);
> +void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
> + my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> + mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> + mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> + mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> + mx, intptr_t my, int width);
> +void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src,
> + ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t
> + mx, intptr_t my, int width);
>
> av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> {
> @@ -95,6 +135,33 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> c->sao_edge_filter[2] =
> c->sao_edge_filter[3] =
> c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon;
> + c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_neon;
> + c->put_hevc_qpel[2][0][1] = ff_hevc_put_hevc_qpel_h6_8_neon;
> + c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_neon;
> + c->put_hevc_qpel[4][0][1] =
> + c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h12_8_neon;
> + c->put_hevc_qpel[5][0][1] =
> + c->put_hevc_qpel[7][0][1] =
> + c->put_hevc_qpel[8][0][1] =
> + c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h16_8_neon;
> + c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
> + c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
> + c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
> + c->put_hevc_qpel_uni[4][0][1] =
> + c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
> + c->put_hevc_qpel_uni[5][0][1] =
> + c->put_hevc_qpel_uni[7][0][1] =
> + c->put_hevc_qpel_uni[8][0][1] =
> + c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
> + c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
> + c->put_hevc_qpel_bi[2][0][1] = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
> + c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
> + c->put_hevc_qpel_bi[4][0][1] =
> + c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
> + c->put_hevc_qpel_bi[5][0][1] =
> + c->put_hevc_qpel_bi[7][0][1] =
> + c->put_hevc_qpel_bi[8][0][1] =
> + c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
> }
> if (bit_depth == 10) {
> c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> new file mode 100644
> index 0000000000..7974b8529e
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
> @@ -0,0 +1,484 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +#define MAX_PB_SIZE 64
> +
> +const qpel_filters, align=4
> + .byte 0, 0, 0, 0, 0, 0, 0, 0
> + .byte -1, 4,-10, 58, 17, -5, 1, 0
> + .byte -1, 4,-11, 40, 40,-11, 4, -1
> + .byte 0, 1, -5, 17, 58,-10, 4, -1
> +endconst
> +
> +.macro load_filter m
> + movrel x15, qpel_filters
> + add x15, x15, \m, lsl #3
> + ld1 {v0.8b}, [x15]
> + sxtl v0.8h, v0.8b
> +.endm
> +
> +.macro put_hevc type
> +.ifc \type, qpel
> + // void put_hevc_qpel_h(int16_t *dst,
> + // uint8_t *_src, ptrdiff_t _srcstride,
> + // int height, intptr_t mx, intptr_t my, int width)
> + dst .req x0
> + dststride .req x7
> + src .req x1
> + srcstride .req x2
> + height .req x3
> + heightw .req w3
> + mx .req x4
> + width .req w6
> +.endif
> +.ifc \type, qpel_uni
> + // void put_hevc_qpel_uni_h(uint8_t *_dst, ptrdiff_t _dststride,
> + // uint8_t *_src, ptrdiff_t _srcstride,
> + // int height, intptr_t mx, intptr_t my, int width)
> + dst .req x0
> + dststride .req x1
> + src .req x2
> + srcstride .req x3
> + height .req x4
> + heightw .req w4
> + mx .req x5
> + width .req w7
> +.endif
> +.ifc \type, qpel_bi
> + // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
> + // uint8_t *_src, ptrdiff_t _srcstride,
> + // int16_t *src2, int height, intptr_t mx,
> + // intptr_t my, int width)
> + dst .req x0
> + dststride .req x1
> + src .req x2
> + srcstride .req x3
> + height .req x5
> + heightw .req w5
> + mx .req x6
> + width .req w8
> +.endif
> +
> +.ifc \type, qpel
> +function ff_hevc_put_hevc_h4_8_neon, export=0
> + uxtl v16.8h, v16.8b
> + uxtl v17.8h, v17.8b
> + uxtl v18.8h, v18.8b
> + uxtl v19.8h, v19.8b
> +
> + mul v23.4h, v16.4h, v0.h[0]
> + mul v24.4h, v18.4h, v0.h[0]
> +
> +.irpc i, 1234567
> + ext v20.16b, v16.16b, v17.16b, #(2*\i)
> + ext v21.16b, v18.16b, v19.16b, #(2*\i)
> + mla v23.4h, v20.4h, v0.h[\i]
> + mla v24.4h, v21.4h, v0.h[\i]
> +.endr
> + ret
> +endfunc
> +.endif
> +
> +function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
> + load_filter mx
> +.ifc \type, qpel_bi
> + mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
> + add x15, x4, #(MAX_PB_SIZE << 1) // src2b
Beware that you can't in general rely on x16/x17 keeping their values for
long. If you branch to a function which is implemented in a different
object file, it may end up linked at a place in the address space which is
too far away for a regular 'bl' branch, so the linker has to insert a
range extension thunk, which clobbers x16/x17. But as long as everything
here is branched within the same object file, it should be ok.
In general, if you need to use x16/x17, use it only for very short-lived
temporaries.
> +.endif
> + sub src, src, #3
> + mov mx, lr
Please use literal 'x30' instead of 'lr' - older binutils don't support
the 'lr' register name alias.
Other than that, the code seems to run correctly, and the code looks
mostly reasonable now. (I didn't do a very deep read-through this time,
but it looks like you've addressed my earlier concerns.)
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi
2022-10-24 12:01 ` Martin Storsjö
@ 2022-10-25 13:12 ` J. Dekker
0 siblings, 0 replies; 3+ messages in thread
From: J. Dekker @ 2022-10-25 13:12 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On 24 Oct 2022, at 14:01, Martin Storsjö wrote:
> On Tue, 11 Oct 2022, J. Dekker wrote:
>
>> [...]
>> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> index 644cc17715..44399b05d8 100644
>> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> @@ -69,6 +69,46 @@ void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrd
>> const int16_t *sao_offset_val, int eo, int width, int height);
>> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>> const int16_t *sao_offset_val, int eo, int width, int height);
>> +void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height,
>> + intptr_t mx, intptr_t my, int width);
>
> The function pointers in the dsp context has gotten 'const' on the source pointers now, which makes it emit a lot of warnings with GCC, and fail with latest Clang. Please rebase and check that it builds without warnings.
>
Fixed.
>> [...]
>> +.ifc \type, qpel
>> +function ff_hevc_put_hevc_h4_8_neon, export=0
>> + uxtl v16.8h, v16.8b
>> + uxtl v17.8h, v17.8b
>> + uxtl v18.8h, v18.8b
>> + uxtl v19.8h, v19.8b
>> +
>> + mul v23.4h, v16.4h, v0.h[0]
>> + mul v24.4h, v18.4h, v0.h[0]
>> +
>> +.irpc i, 1234567
>> + ext v20.16b, v16.16b, v17.16b, #(2*\i)
>> + ext v21.16b, v18.16b, v19.16b, #(2*\i)
>> + mla v23.4h, v20.4h, v0.h[\i]
>> + mla v24.4h, v21.4h, v0.h[\i]
>> +.endr
>> + ret
>> +endfunc
>> +.endif
>> +
>> +function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
>> + load_filter mx
>> +.ifc \type, qpel_bi
>> + mov x16, #(MAX_PB_SIZE << 2) // src2bstridel
>> + add x15, x4, #(MAX_PB_SIZE << 1) // src2b
>
> Beware that you can't in general rely on x16/x17 keeping their values for long. If you branch to a function which is implemented in a different object file, it may end up linked at a place in the address space which is too far away for a regular 'bl' branch, so the linker has to insert a range extension thunk, which clobbers x16/x17. But as long as everything here is branched within the same object file, it should be ok.
>
> In general, if you need to use x16/x17, use it only for very short-lived temporaries.
Alright, thanks for the consideration. Left as is since as you said we're not branching anywhere outside this file.
>> +.endif
>> + sub src, src, #3
>> + mov mx, lr
>
> Please use literal 'x30' instead of 'lr' - older binutils don't support the 'lr' register name alias.
Fixed.
> Other than that, the code seems to run correctly, and the code looks mostly reasonable now. (I didn't do a very deep read-through this time, but it looks like you've addressed my earlier concerns.)
Thanks for the review, pushed with above fixes.
--
jd
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2022-10-25 13:12 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-11 7:32 [FFmpeg-devel] [PATCH v3] lavc/aarch64: add hevc horizontal qpel/uni/bi J. Dekker
2022-10-24 12:01 ` Martin Storsjö
2022-10-25 13:12 ` J. Dekker
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git