From: jinbo <jinbo@loongson.cn>
To: ffmpeg-devel@ffmpeg.org
Cc: jinbo <jinbo@loongson.cn>
Subject: [FFmpeg-devel] [PATCH v1 3/6] avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt
Date: Fri, 22 Dec 2023 18:52:11 +0800
Message-ID: <20231222105214.15168-4-jinbo@loongson.cn>
In-Reply-To: <20231222105214.15168-1-jinbo@loongson.cn>

tests/checkasm/checkasm:           C      LSX     LASX
put_hevc_pel_uni_w_pixels4_8_c:    2.7    1.0
put_hevc_pel_uni_w_pixels6_8_c:    6.2    2.0     1.5
put_hevc_pel_uni_w_pixels8_8_c:    10.7   2.5     1.7
put_hevc_pel_uni_w_pixels12_8_c:   23.0   5.5     5.0
put_hevc_pel_uni_w_pixels16_8_c:   41.0   8.2     5.0
put_hevc_pel_uni_w_pixels24_8_c:   91.0   19.7    13.2
put_hevc_pel_uni_w_pixels32_8_c:   161.7  32.5    16.2
put_hevc_pel_uni_w_pixels48_8_c:   354.5  73.7    43.0
put_hevc_pel_uni_w_pixels64_8_c:   641.5  130.0   64.2

Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads
is 1fps (47fps --> 48fps).
---
 libavcodec/loongarch/Makefile                 |   3 +-
 libavcodec/loongarch/hevc_mc.S                | 471 ++++++++++++++++++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |  43 ++
 libavcodec/loongarch/hevcdsp_lasx.h           |  53 ++
 libavcodec/loongarch/hevcdsp_lsx.h            |  27 +
 5 files changed, 596 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/loongarch/hevc_mc.S
 create mode 100644 libavcodec/loongarch/hevcdsp_lasx.h

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07ea97f803..ad98cd4054 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,7 +28,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_mc_bi_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
                                          loongarch/hevc_mc_uniw_lsx.o \
-                                         loongarch/hevc_add_res.o
+                                         loongarch/hevc_add_res.o \
+                                         loongarch/hevc_mc.o
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_loongarch.o \
                                          loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
new file mode 100644
index 0000000000..c5d553effe
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro LOAD_VAR bit
+    addi.w         t1,    a5,     6       //shift
+    addi.w         t3,    zero,   1       //one
+    sub.w          t4,    t1,     t3
+    sll.w          t3,    t3,     t4      //offset
+.if \bit == 128
+    vreplgr2vr.w   vr1,   a6              //wx
+    vreplgr2vr.w   vr2,   t3              //offset
+    vreplgr2vr.w   vr3,   t1              //shift
+    vreplgr2vr.w   vr4,   a7              //ox
+.else
+    xvreplgr2vr.w  xr1,   a6
+    xvreplgr2vr.w  xr2,   t3
+    xvreplgr2vr.w  xr3,   t1
+    xvreplgr2vr.w  xr4,   a7
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
+    vldrepl.d      vr0,   \src0,  0
+    vsllwil.hu.bu  vr0,   vr0,    0
+    vexth.wu.hu    vr5,   vr0
+    vsllwil.wu.hu  vr0,   vr0,    0
+    vslli.w        vr0,   vr0,    6
+    vslli.w        vr5,   vr5,    6
+    vmul.w         vr0,   vr0,    vr1
+    vmul.w         vr5,   vr5,    vr1
+    vadd.w         vr0,   vr0,    vr2
+    vadd.w         vr5,   vr5,    vr2
+    vsra.w         vr0,   vr0,    vr3
+    vsra.w         vr5,   vr5,    vr3
+    vadd.w         vr0,   vr0,    vr4
+    vadd.w         vr5,   vr5,    vr4
+    vssrani.h.w    vr5,   vr0,    0
+    vssrani.bu.h   vr5,   vr5,    0
+.if \w == 6
+    fst.s          f5,    \dst0,  0
+    vstelm.h       vr5,   \dst0,  4,     2
+.else
+    fst.d          f5,    \dst0,  0
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
+    vldrepl.d      vr0,   \src0,  0
+    add.d          t2,    \src0,  a3
+    vldrepl.d      vr5,   t2,     0
+    xvpermi.q      xr0,   xr5,    0x02
+    xvsllwil.hu.bu xr0,   xr0,    0
+    xvexth.wu.hu   xr5,   xr0
+    xvsllwil.wu.hu xr0,   xr0,    0
+    xvslli.w       xr0,   xr0,    6
+    xvslli.w       xr5,   xr5,    6
+    xvmul.w        xr0,   xr0,    xr1
+    xvmul.w        xr5,   xr5,    xr1
+    xvadd.w        xr0,   xr0,    xr2
+    xvadd.w        xr5,   xr5,    xr2
+    xvsra.w        xr0,   xr0,    xr3
+    xvsra.w        xr5,   xr5,    xr3
+    xvadd.w        xr0,   xr0,    xr4
+    xvadd.w        xr5,   xr5,    xr4
+    xvssrani.h.w   xr5,   xr0,    0
+    xvpermi.q      xr0,   xr5,    0x01
+    xvssrani.bu.h  xr0,   xr5,    0
+    add.d          t3,    \dst0,  a1
+.if \w == 6
+    vstelm.w       vr0,   \dst0,  0,     0
+    vstelm.h       vr0,   \dst0,  4,     2
+    vstelm.w       vr0,   t3,     0,     2
+    vstelm.h       vr0,   t3,     4,     6
+.else
+    vstelm.d       vr0,   \dst0,  0,     0
+    vstelm.d       vr0,   t3,     0,     1
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
+    vld            vr0,   \src0,  0
+    vexth.hu.bu    vr7,   vr0
+    vexth.wu.hu    vr8,   vr7
+    vsllwil.wu.hu  vr7,   vr7,    0
+    vsllwil.hu.bu  vr5,   vr0,    0
+    vexth.wu.hu    vr6,   vr5
+    vsllwil.wu.hu  vr5,   vr5,    0
+    vslli.w        vr5,   vr5,    6
+    vslli.w        vr6,   vr6,    6
+    vslli.w        vr7,   vr7,    6
+    vslli.w        vr8,   vr8,    6
+    vmul.w         vr5,   vr5,    vr1
+    vmul.w         vr6,   vr6,    vr1
+    vmul.w         vr7,   vr7,    vr1
+    vmul.w         vr8,   vr8,    vr1
+    vadd.w         vr5,   vr5,    vr2
+    vadd.w         vr6,   vr6,    vr2
+    vadd.w         vr7,   vr7,    vr2
+    vadd.w         vr8,   vr8,    vr2
+    vsra.w         vr5,   vr5,    vr3
+    vsra.w         vr6,   vr6,    vr3
+    vsra.w         vr7,   vr7,    vr3
+    vsra.w         vr8,   vr8,    vr3
+    vadd.w         vr5,   vr5,    vr4
+    vadd.w         vr6,   vr6,    vr4
+    vadd.w         vr7,   vr7,    vr4
+    vadd.w         vr8,   vr8,    vr4
+    vssrani.h.w    vr6,   vr5,    0
+    vssrani.h.w    vr8,   vr7,    0
+    vssrani.bu.h   vr8,   vr6,    0
+    vst            vr8,   \dst0,  0
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0
+    vld            vr0,   \src0,  0
+    xvpermi.d      xr0,   xr0,    0xd8
+    xvsllwil.hu.bu xr0,   xr0,    0
+    xvexth.wu.hu   xr6,   xr0
+    xvsllwil.wu.hu xr5,   xr0,    0
+    xvslli.w       xr5,   xr5,    6
+    xvslli.w       xr6,   xr6,    6
+    xvmul.w        xr5,   xr5,    xr1
+    xvmul.w        xr6,   xr6,    xr1
+    xvadd.w        xr5,   xr5,    xr2
+    xvadd.w        xr6,   xr6,    xr2
+    xvsra.w        xr5,   xr5,    xr3
+    xvsra.w        xr6,   xr6,    xr3
+    xvadd.w        xr5,   xr5,    xr4
+    xvadd.w        xr6,   xr6,    xr4
+    xvssrani.h.w   xr6,   xr5,    0
+    xvpermi.q      xr7,   xr6,    0x01
+    xvssrani.bu.h  xr7,   xr6,    0
+    vst            vr7,   \dst0,  0
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w
+.if \w == 16
+    vld            vr0,   \src0,  0
+    add.d          t2,    \src0,  a3
+    vld            vr5,   t2,     0
+    xvpermi.q      xr0,   xr5,    0x02
+.else //w=24/32
+    xvld           xr0,   \src0,  0
+.endif
+    xvexth.hu.bu   xr7,   xr0
+    xvexth.wu.hu   xr8,   xr7
+    xvsllwil.wu.hu xr7,   xr7,    0
+    xvsllwil.hu.bu xr5,   xr0,    0
+    xvexth.wu.hu   xr6,   xr5
+    xvsllwil.wu.hu xr5,   xr5,    0
+    xvslli.w       xr5,   xr5,    6
+    xvslli.w       xr6,   xr6,    6
+    xvslli.w       xr7,   xr7,    6
+    xvslli.w       xr8,   xr8,    6
+    xvmul.w        xr5,   xr5,    xr1
+    xvmul.w        xr6,   xr6,    xr1
+    xvmul.w        xr7,   xr7,    xr1
+    xvmul.w        xr8,   xr8,    xr1
+    xvadd.w        xr5,   xr5,    xr2
+    xvadd.w        xr6,   xr6,    xr2
+    xvadd.w        xr7,   xr7,    xr2
+    xvadd.w        xr8,   xr8,    xr2
+    xvsra.w        xr5,   xr5,    xr3
+    xvsra.w        xr6,   xr6,    xr3
+    xvsra.w        xr7,   xr7,    xr3
+    xvsra.w        xr8,   xr8,    xr3
+    xvadd.w        xr5,   xr5,    xr4
+    xvadd.w        xr6,   xr6,    xr4
+    xvadd.w        xr7,   xr7,    xr4
+    xvadd.w        xr8,   xr8,    xr4
+    xvssrani.h.w   xr6,   xr5,    0
+    xvssrani.h.w   xr8,   xr7,    0
+    xvssrani.bu.h  xr8,   xr6,    0
+.if \w == 16
+    vst            vr8,   \dst0,  0
+    add.d          t2,    \dst0,  a1
+    xvpermi.q      xr8,   xr8,    0x01
+    vst            vr8,   t2,     0
+.elseif \w == 24
+    vst            vr8,   \dst0,  0
+    xvstelm.d      xr8,   \dst0,  16,    2
+.else
+    xvst           xr8,   \dst0,  0
+.endif
+.endm
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
+    LOAD_VAR 128
+    srli.w         t0,    a4,     1
+.LOOP_PIXELS4:
+    vldrepl.w      vr0,   a2,     0
+    add.d          t1,    a2,     a3
+    vldrepl.w      vr5,   t1,     0
+    vsllwil.hu.bu  vr0,   vr0,    0
+    vsllwil.wu.hu  vr0,   vr0,    0
+    vsllwil.hu.bu  vr5,   vr5,    0
+    vsllwil.wu.hu  vr5,   vr5,    0
+    vslli.w        vr0,   vr0,    6
+    vslli.w        vr5,   vr5,    6
+    vmul.w         vr0,   vr0,    vr1
+    vmul.w         vr5,   vr5,    vr1
+    vadd.w         vr0,   vr0,    vr2
+    vadd.w         vr5,   vr5,    vr2
+    vsra.w         vr0,   vr0,    vr3
+    vsra.w         vr5,   vr5,    vr3
+    vadd.w         vr0,   vr0,    vr4
+    vadd.w         vr5,   vr5,    vr4
+    vssrani.h.w    vr5,   vr0,    0
+    vssrani.bu.h   vr5,   vr5,    0
+    fst.s          f5,    a0,     0
+    add.d          t2,    a0,     a1
+    vstelm.w       vr5,   t2,     0,     1
+    alsl.d         a2,    a3,     a2,    1
+    alsl.d         a0,    a1,     a0,    1
+    addi.w         t0,    t0,     -1
+    bnez           t0,    .LOOP_PIXELS4
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS6:
+    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 6
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS6
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx
+    LOAD_VAR 256
+    srli.w         t0,    a4,     1
+.LOOP_PIXELS6_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 6
+    alsl.d         a2,    a3,     a2,    1
+    alsl.d         a0,    a1,     a0,    1
+    addi.w         t0,    t0,     -1
+    bnez           t0,    .LOOP_PIXELS6_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS8:
+    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 8
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS8
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx
+    LOAD_VAR 256
+    srli.w         t0,    a4,     1
+.LOOP_PIXELS8_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 8
+    alsl.d         a2,    a3,     a2,    1
+    alsl.d         a0,    a1,     a0,    1
+    addi.w         t0,    t0,     -1
+    bnez           t0,    .LOOP_PIXELS8_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS12:
+    vld            vr0,   a2,     0
+    vexth.hu.bu    vr7,   vr0
+    vsllwil.wu.hu  vr7,   vr7,    0
+    vsllwil.hu.bu  vr5,   vr0,    0
+    vexth.wu.hu    vr6,   vr5
+    vsllwil.wu.hu  vr5,   vr5,    0
+    vslli.w        vr5,   vr5,    6
+    vslli.w        vr6,   vr6,    6
+    vslli.w        vr7,   vr7,    6
+    vmul.w         vr5,   vr5,    vr1
+    vmul.w         vr6,   vr6,    vr1
+    vmul.w         vr7,   vr7,    vr1
+    vadd.w         vr5,   vr5,    vr2
+    vadd.w         vr6,   vr6,    vr2
+    vadd.w         vr7,   vr7,    vr2
+    vsra.w         vr5,   vr5,    vr3
+    vsra.w         vr6,   vr6,    vr3
+    vsra.w         vr7,   vr7,    vr3
+    vadd.w         vr5,   vr5,    vr4
+    vadd.w         vr6,   vr6,    vr4
+    vadd.w         vr7,   vr7,    vr4
+    vssrani.h.w    vr6,   vr5,    0
+    vssrani.h.w    vr7,   vr7,    0
+    vssrani.bu.h   vr7,   vr6,    0
+    fst.d          f7,    a0,     0
+    vstelm.w       vr7,   a0,     8,     2
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS12
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS12_LASX:
+    vld            vr0,   a2,     0
+    xvpermi.d      xr0,   xr0,    0xd8
+    xvsllwil.hu.bu xr0,   xr0,    0
+    xvexth.wu.hu   xr6,   xr0
+    xvsllwil.wu.hu xr5,   xr0,    0
+    xvslli.w       xr5,   xr5,    6
+    xvslli.w       xr6,   xr6,    6
+    xvmul.w        xr5,   xr5,    xr1
+    xvmul.w        xr6,   xr6,    xr1
+    xvadd.w        xr5,   xr5,    xr2
+    xvadd.w        xr6,   xr6,    xr2
+    xvsra.w        xr5,   xr5,    xr3
+    xvsra.w        xr6,   xr6,    xr3
+    xvadd.w        xr5,   xr5,    xr4
+    xvadd.w        xr6,   xr6,    xr4
+    xvssrani.h.w   xr6,   xr5,    0
+    xvpermi.q      xr7,   xr6,    0x01
+    xvssrani.bu.h  xr7,   xr6,    0
+    fst.d          f7,    a0,     0
+    vstelm.w       vr7,   a0,     8,     2
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS12_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS16:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS16
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx
+    LOAD_VAR 256
+    srli.w         t0,    a4,     1
+.LOOP_PIXELS16_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 16
+    alsl.d         a2,    a3,     a2,    1
+    alsl.d         a0,    a1,     a0,    1
+    addi.w         t0,    t0,     -1
+    bnez           t0,    .LOOP_PIXELS16_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS24:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    addi.d         t0,    a2,     16
+    addi.d         t1,    a0,     16
+    HEVC_PEL_UNI_W_PIXELS8_LSX t0, t1, 8
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS24
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS24_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 24
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS24_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS32:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    addi.d         t0,    a2,     16
+    addi.d         t1,    a0,     16
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS32
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS32_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS32_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS48:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    addi.d         t0,    a2,     16
+    addi.d         t1,    a0,     16
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    addi.d         t0,    a2,     32
+    addi.d         t1,    a0,     32
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS48
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS48_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+    addi.d         t0,    a2,     32
+    addi.d         t1,    a0,     32
+    HEVC_PEL_UNI_W_PIXELS16_LASX t0, t1
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS48_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS64:
+    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+    addi.d         t0,    a2,     16
+    addi.d         t1,    a0,     16
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    addi.d         t0,    a2,     32
+    addi.d         t1,    a0,     32
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    addi.d         t0,    a2,     48
+    addi.d         t1,    a0,     48
+    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS64
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS64_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+    addi.d         t0,    a2,     32
+    addi.d         t1,    a0,     32
+    HEVC_PEL_UNI_W_PIXELS32_LASX t0, t1, 32
+    add.d          a2,    a2,     a3
+    add.d          a0,    a0,     a1
+    addi.w         a4,    a4,     -1
+    bnez           a4,    .LOOP_PIXELS64_LASX
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index a8f753dc86..d0ee99d6b5 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -22,6 +22,7 @@
 
 #include "libavutil/loongarch/cpu.h"
 #include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"
 
 void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
 {
@@ -160,6 +161,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
             c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
 
+            c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
             c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
             c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
             c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@@ -196,4 +217,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
         }
     }
+
+    if (have_lasx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+        }
+    }
 }
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
new file mode 100644
index 0000000000..819c3c3ecf
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define PEL_UNI_W(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lasx(uint8_t *dst,       \
+                                                          ptrdiff_t           \
+                                                          dst_stride,         \
+                                                          const uint8_t *src, \
+                                                          ptrdiff_t           \
+                                                          src_stride,         \
+                                                          int height,         \
+                                                          int denom,          \
+                                                          int wx,             \
+                                                          int ox,             \
+                                                          intptr_t mx,        \
+                                                          intptr_t my,        \
+                                                          int width)
+
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
+#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index ac509984fd..0d724a90ef 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -232,4 +232,31 @@ void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 
+#define PEL_UNI_W(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lsx(uint8_t *dst,       \
+                                                         ptrdiff_t           \
+                                                         dst_stride,         \
+                                                         const uint8_t *src, \
+                                                         ptrdiff_t           \
+                                                         src_stride,         \
+                                                         int height,         \
+                                                         int denom,          \
+                                                         int wx,             \
+                                                         int ox,             \
+                                                         intptr_t mx,        \
+                                                         intptr_t my,        \
+                                                         int width)
+
+PEL_UNI_W(pel, pixels, 4);
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
 #endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
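For readers following the assembly: every routine in this patch computes the
same per-pixel operation, HEVC unidirectional weighted prediction at 8-bit
depth. Below is a minimal C sketch of that operation, mirroring the constants
set up by LOAD_VAR (shift = denom + 6, offset = 1 << (shift - 1)) and the
vslli.w/vmul.w/vadd.w/vsra.w/vssrani.* sequence used in the macros. The
function and helper names here are illustrative only, not FFmpeg API.

#include <stddef.h>
#include <stdint.h>

/* Clamp a widened intermediate back to the 8-bit pixel range, which is
 * what the saturating narrows (vssrani.h.w + vssrani.bu.h) achieve. */
static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

/* Scalar model of put_hevc_pel_uni_w_pixelsW_8: promote each source pixel
 * to the 14-bit intermediate domain (<< 6), weight by wx, round by offset,
 * shift down, add ox, clip. */
static void pel_uni_w_pixels_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int width, int height,
                                 int denom, int wx, int ox)
{
    const int shift  = denom + 6;        /* LOAD_VAR: addi.w t1, a5, 6  */
    const int offset = 1 << (shift - 1); /* LOAD_VAR: sll.w  t3, t3, t4 */

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int v = (src[x] << 6) * wx;                     /* vslli.w + vmul.w       */
            dst[x] = clip_u8(((v + offset) >> shift) + ox); /* vadd.w, vsra.w, vadd.w */
        }
        src += src_stride;
        dst += dst_stride;
    }
}

Under the LoongArch LP64 calling convention the first eight integer arguments
arrive in registers a0-a7, so for the PEL_UNI_W prototype above a0/a1 hold dst
and its stride, a2/a3 hold src and its stride, a4 is height, a5 is denom, a6
is wx, and a7 is ox; this is why LOAD_VAR derives the shift from a5 and
broadcasts a6/a7 into vector registers.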