Hi, Martin, I modified it according to your comments. Please review again. And here are the checkasm benchmark results of the related functions: put_hevc_epel_uni_w_h4_8_c: 126.1 put_hevc_epel_uni_w_h4_8_i8mm: 41.6 put_hevc_epel_uni_w_h6_8_c: 222.9 put_hevc_epel_uni_w_h6_8_i8mm: 91.4 put_hevc_epel_uni_w_h8_8_c: 374.4 put_hevc_epel_uni_w_h8_8_i8mm: 102.1 put_hevc_epel_uni_w_h12_8_c: 806.1 put_hevc_epel_uni_w_h12_8_i8mm: 225.6 put_hevc_epel_uni_w_h16_8_c: 1414.4 put_hevc_epel_uni_w_h16_8_i8mm: 333.4 put_hevc_epel_uni_w_h24_8_c: 3128.6 put_hevc_epel_uni_w_h24_8_i8mm: 713.1 put_hevc_epel_uni_w_h32_8_c: 5519.1 put_hevc_epel_uni_w_h32_8_i8mm: 1118.1 put_hevc_epel_uni_w_h48_8_c: 12364.4 put_hevc_epel_uni_w_h48_8_i8mm: 2541.1 put_hevc_epel_uni_w_h64_8_c: 21925.9 put_hevc_epel_uni_w_h64_8_i8mm: 4383.6 在 2023/6/12 15:59, Martin Storsjö 写道: > On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote: > >> From: Logan Lyu >> >> Signed-off-by: Logan Lyu >> --- >> libavcodec/aarch64/Makefile               |   1 + >> libavcodec/aarch64/hevcdsp_epel_neon.S    | 378 ++++++++++++++++++++++ >> libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +- >> 3 files changed, 385 insertions(+), 1 deletion(-) >> create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S >> >> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile >> index 216191640c..cb428b49e0 100644 >> --- a/libavcodec/aarch64/Makefile >> +++ b/libavcodec/aarch64/Makefile >> @@ -69,4 +69,5 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)        += >> aarch64/hevcdsp_deblock_neon.o      \ >> aarch64/hevcdsp_idct_neon.o         \ >> aarch64/hevcdsp_init_aarch64.o      \ >> aarch64/hevcdsp_qpel_neon.o         \ >> + aarch64/hevcdsp_epel_neon.o         \ >> aarch64/hevcdsp_sao_neon.o >> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S >> b/libavcodec/aarch64/hevcdsp_epel_neon.S >> new file mode 100644 >> index 0000000000..fe494dd843 >> --- /dev/null >> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S >> @@ -0,0 +1,378 @@ >> +/* -*-arm64-*- >> + * vim: syntax=arm64asm >> + * >> + * This file is part of FFmpeg. >> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. >> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU >> + * Lesser General Public License for more details. >> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> + */ >> + >> +#include "libavutil/aarch64/asm.S" >> +#define MAX_PB_SIZE 64 >> + >> +const epel_filters, align=4 >> +        .byte  0,  0,  0,  0 >> +        .byte -2, 58, 10, -2 >> +        .byte -4, 54, 16, -2 >> +        .byte -6, 46, 28, -4 >> +        .byte -4, 36, 36, -4 >> +        .byte -4, 28, 46, -6 >> +        .byte -2, 16, 54, -4 >> +        .byte -2, 10, 58, -2 >> +endconst >> + >> +#if HAVE_I8MM >> +.macro EPEL_UNI_W_H_HEADER >> +        ldr             x12, [sp] >> +        sub             x2, x2, #1 >> +        movrel          x9, epel_filters >> +        add             x9, x9, x12, lsl #2 >> +        ldr             w11, [x9] >> +        dup             v28.4s, w11 > > Why not just do "ld1r {v28.4s}, [x9]" here instead, avoiding the > indirection via GPRs? > > Other than that, I think this mostly looks reasonable. > > Btw, for any assembly patches like these, it would be appreciated if > you can provide benchmarks from checkasm, e.g. "checkasm > --test=hevc_pel --bench=put_hevc" (or maybe just "--bench") and > extract the relevant lines for the functions that you've > added/modified, and mention what system you've benchmarked it on. You > get the most useful benchmarks for micro-tuning if you can enable > userspace access to the timing registers and configure with > --disable-linux-perf. > > // Martin >