Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: jinbo <jinbo@loongson.cn>
To: ffmpeg-devel@ffmpeg.org
Cc: jinbo <jinbo@loongson.cn>
Subject: [FFmpeg-devel] [PATCH v3 2/7] avcodec/hevc: Add add_residual_4/8/16/32 asm opt
Date: Thu, 28 Dec 2023 16:21:00 +0800
Message-ID: <20231228082105.31311-2-jinbo@loongson.cn> (raw)
In-Reply-To: <20231228082105.31311-1-jinbo@loongson.cn>

After this patch, the performance of decoding H265 4K 30FPS 30Mbps
on 3A6000 with 8 threads improves by 2fps (45fps-->47fps).
---
 libavcodec/loongarch/Makefile                 |   3 +-
 libavcodec/loongarch/hevc_add_res.S           | 162 ++++++++++++++++++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   5 +
 libavcodec/loongarch/hevcdsp_lsx.h            |   5 +
 4 files changed, 174 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/loongarch/hevc_add_res.S

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 06cfab5c20..07ea97f803 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -27,7 +27,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_lpf_sao_lsx.o \
                                          loongarch/hevc_mc_bi_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
-                                         loongarch/hevc_mc_uniw_lsx.o
+                                         loongarch/hevc_mc_uniw_lsx.o \
+                                         loongarch/hevc_add_res.o
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_loongarch.o \
                                          loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_add_res.S b/libavcodec/loongarch/hevc_add_res.S
new file mode 100644
index 0000000000..dd2d820af8
--- /dev/null
+++ b/libavcodec/loongarch/hevc_add_res.S
@@ -0,0 +1,162 @@
+/*
+ * Loongson LSX optimized add_residual functions for HEVC decoding
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_4x4_8                     /* add 8 residual halfwords at a1 to two 4-byte rows at a0 and a0+a2 */
+    vldrepl.w      vr0,    a0,     0         /* 4 bytes of row 0, replicated across the vector */
+    add.d          t0,     a0,     a2        /* t0 = address of row 1 (a0 + a2) */
+    vldrepl.w      vr1,    t0,     0         /* 4 bytes of row 1, replicated across the vector */
+    vld            vr2,    a1,     0         /* 16 bytes = 8 halfwords of residual */
+
+    vilvl.w        vr1,    vr1,    vr0       /* low 64 bits now hold row 0 word then row 1 word */
+    vsllwil.hu.bu  vr1,    vr1,    0         /* zero-extend the low 8 bytes to halfwords (shift 0) */
+    vadd.h         vr1,    vr1,    vr2       /* pixel + residual, per halfword */
+    vssrani.bu.h   vr1,    vr1,    0         /* narrow back to bytes with unsigned saturation */
+
+    vstelm.w       vr1,    a0,     0,    0   /* word 0 -> row 0 */
+    vstelm.w       vr1,    t0,     0,    1   /* word 1 -> row 1 */
+.endm
+
+function ff_hevc_add_residual4x4_8_lsx       /* four rows, done as two passes of the 2-row macro */
+    ADD_RES_LSX_4x4_8                        /* rows 0-1 */
+    alsl.d         a0,     a2,     a0,   1   /* a0 += a2 << 1, i.e. advance two rows */
+    addi.d         a1,     a1,     16        /* step past the 16 residual bytes just consumed */
+    ADD_RES_LSX_4x4_8                        /* rows 2-3 */
+endfunc
+
+/*
+ * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_8x8_8                      /* add 64 bytes of residual at a1 to four 8-byte rows starting at a0, a2 bytes apart */
+    vldrepl.d      vr0,    a0,     0          /* 8 bytes of row 0, replicated */
+    add.d          t0,     a0,     a2         /* t0 = address of row 1 */
+    vldrepl.d      vr1,    t0,     0          /* 8 bytes of row 1, replicated */
+    add.d          t1,     t0,     a2         /* t1 = address of row 2 */
+    vldrepl.d      vr2,    t1,     0          /* 8 bytes of row 2, replicated */
+    add.d          t2,     t1,     a2         /* t2 = address of row 3 */
+    vldrepl.d      vr3,    t2,     0          /* 8 bytes of row 3, replicated */
+
+    vld            vr4,    a1,     0          /* residual bytes 0..15  (pairs with row 0) */
+    addi.d         t3,     zero,   16         /* t3 = 16, index register for vldx */
+    vldx           vr5,    a1,     t3         /* residual bytes 16..31 (pairs with row 1) */
+    addi.d         t4,     a1,     32         /* t4 = a1 + 32 */
+    vld            vr6,    t4,     0          /* residual bytes 32..47 (pairs with row 2) */
+    vldx           vr7,    t4,     t3         /* residual bytes 48..63 (pairs with row 3) */
+
+    vsllwil.hu.bu  vr0,    vr0,    0          /* zero-extend low 8 bytes of each row to halfwords */
+    vsllwil.hu.bu  vr1,    vr1,    0
+    vsllwil.hu.bu  vr2,    vr2,    0
+    vsllwil.hu.bu  vr3,    vr3,    0
+    vadd.h         vr0,    vr0,    vr4        /* pixel + residual, per halfword */
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+    vssrani.bu.h   vr1,    vr0,    0          /* saturating narrow: vr1 = row 0 (low half), row 1 (high half) */
+    vssrani.bu.h   vr3,    vr2,    0          /* saturating narrow: vr3 = row 2 (low half), row 3 (high half) */
+
+    vstelm.d       vr1,    a0,     0,     0   /* doubleword 0 of vr1 -> row 0 */
+    vstelm.d       vr1,    t0,     0,     1   /* doubleword 1 of vr1 -> row 1 */
+    vstelm.d       vr3,    t1,     0,     0   /* doubleword 0 of vr3 -> row 2 */
+    vstelm.d       vr3,    t2,     0,     1   /* doubleword 1 of vr3 -> row 3 */
+.endm
+
+function ff_hevc_add_residual8x8_8_lsx        /* eight rows, done as two passes of the 4-row macro */
+    ADD_RES_LSX_8x8_8                         /* rows 0-3 */
+    alsl.d         a0,     a2,     a0,    2   /* a0 += a2 << 2, i.e. advance four rows */
+    addi.d         a1,     a1,     64         /* step past the 64 residual bytes just consumed */
+    ADD_RES_LSX_8x8_8                         /* rows 4-7 */
+endfunc
+
+/*
+ * void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+function ff_hevc_add_residual16x16_8_lsx     /* 16 rows of 16 bytes, fully unrolled */
+.rept 8                                      /* each repetition handles two rows */
+    vld            vr0,    a0,     0         /* 16 bytes of the first row */
+    vldx           vr2,    a0,     a2        /* 16 bytes of the second row (a0 + a2) */
+
+    vld            vr4,    a1,     0         /* residual bytes 0..15  (first row, low 8 pixels) */
+    addi.d         t0,     zero,   16        /* t0 = 16, index register for vldx */
+    vldx           vr5,    a1,     t0        /* residual bytes 16..31 (first row, high 8 pixels) */
+    addi.d         t1,     a1,     32        /* t1 = a1 + 32 */
+    vld            vr6,    t1,     0         /* residual bytes 32..47 (second row, low 8 pixels) */
+    vldx           vr7,    t1,     t0        /* residual bytes 48..63 (second row, high 8 pixels) */
+
+    vexth.hu.bu    vr1,    vr0               /* high 8 bytes of first row -> halfwords; must precede the in-place widen of vr0 */
+    vsllwil.hu.bu  vr0,    vr0,    0         /* low 8 bytes of first row -> halfwords */
+    vexth.hu.bu    vr3,    vr2               /* high 8 bytes of second row -> halfwords */
+    vsllwil.hu.bu  vr2,    vr2,    0         /* low 8 bytes of second row -> halfwords */
+    vadd.h         vr0,    vr0,    vr4       /* pixel + residual, per halfword */
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+
+    vssrani.bu.h   vr1,    vr0,    0         /* saturating narrow: vr0 -> low 8 bytes, vr1 -> high 8 bytes */
+    vssrani.bu.h   vr3,    vr2,    0         /* same for the second row */
+
+    vst            vr1,    a0,     0         /* write back first row */
+    vstx           vr3,    a0,     a2        /* write back second row */
+
+    alsl.d         a0,     a2,     a0,   1   /* a0 += a2 << 1, i.e. advance two rows */
+    addi.d         a1,     a1,     64        /* step past the 64 residual bytes just consumed */
+.endr
+endfunc
+
+/*
+ * void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+function ff_hevc_add_residual32x32_8_lsx     /* 32 rows of 32 bytes, fully unrolled */
+.rept 32                                     /* each repetition handles one row */
+    vld            vr0,    a0,     0         /* bytes 0..15 of the row */
+    addi.w         t0,     zero,   16        /* t0 = 16, index register for vldx/vstx */
+    vldx           vr2,    a0,     t0        /* bytes 16..31 of the row */
+
+    vld            vr4,    a1,     0         /* residual bytes 0..15  (pixels 0..7) */
+    vldx           vr5,    a1,     t0        /* residual bytes 16..31 (pixels 8..15) */
+    addi.d         t1,     a1,     32        /* t1 = a1 + 32 */
+    vld            vr6,    t1,     0         /* residual bytes 32..47 (pixels 16..23) */
+    vldx           vr7,    t1,     t0        /* residual bytes 48..63 (pixels 24..31) */
+
+    vexth.hu.bu    vr1,    vr0               /* pixels 8..15 -> halfwords; must precede the in-place widen of vr0 */
+    vsllwil.hu.bu  vr0,    vr0,    0         /* pixels 0..7 -> halfwords */
+    vexth.hu.bu    vr3,    vr2               /* pixels 24..31 -> halfwords */
+    vsllwil.hu.bu  vr2,    vr2,    0         /* pixels 16..23 -> halfwords */
+    vadd.h         vr0,    vr0,    vr4       /* pixel + residual, per halfword */
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+
+    vssrani.bu.h   vr1,    vr0,    0         /* saturating narrow: vr0 -> low 8 bytes, vr1 -> high 8 bytes */
+    vssrani.bu.h   vr3,    vr2,    0         /* same for the upper 16 pixels */
+
+    vst            vr1,    a0,     0         /* write back bytes 0..15 */
+    vstx           vr3,    a0,     t0        /* write back bytes 16..31 */
+
+    add.d          a0,     a0,     a2        /* advance one row */
+    addi.d         a1,     a1,     64        /* step past the 64 residual bytes just consumed */
+.endr
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index 5a96f3a4c9..a8f753dc86 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -189,6 +189,11 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->idct[1] = ff_hevc_idct_8x8_lsx;
             c->idct[2] = ff_hevc_idct_16x16_lsx;
             c->idct[3] = ff_hevc_idct_32x32_lsx;
+
+            c->add_residual[0] = ff_hevc_add_residual4x4_8_lsx;
+            c->add_residual[1] = ff_hevc_add_residual8x8_8_lsx;
+            c->add_residual[2] = ff_hevc_add_residual16x16_8_lsx;
+            c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
         }
     }
 }
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 0d54196caf..ac509984fd 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -227,4 +227,9 @@ void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
 
+void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+
 #endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
-- 
2.20.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  reply	other threads:[~2023-12-28  8:22 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-28  8:20 [FFmpeg-devel] [PATCH v3 1/7] avcodec/hevc: Add init for sao_edge_filter jinbo
2023-12-28  8:21 ` jinbo [this message]
2023-12-28  8:21 ` [FFmpeg-devel] [PATCH v3 3/7] avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt jinbo
2023-12-28  8:21 ` [FFmpeg-devel] [PATCH v3 4/7] avcodec/hevc: Add qpel_uni_w_v|h4/6/8/12/16/24/32/48/64 " jinbo
2023-12-28  8:21 ` [FFmpeg-devel] [PATCH v3 5/7] avcodec/hevc: Add epel_uni_w_hv4/6/8/12/16/24/32/48/64 " jinbo
2023-12-28  8:21 ` [FFmpeg-devel] [PATCH v3 6/7] avcodec/hevc: Add asm opt for the following functions jinbo
2023-12-28  8:21 ` [FFmpeg-devel] [PATCH v3 7/7] avcodec/hevc: Add ff_hevc_idct_32x32_lasx asm opt jinbo
2024-01-03  1:03   ` yinshiyou-hf
2024-01-06  6:59     ` 金波
2024-01-12 22:42     ` Michael Niedermayer
2023-12-28  8:52 ` [FFmpeg-devel] [PATCH v3 1/7] avcodec/hevc: Add init for sao_edge_filter yinshiyou-hf
2023-12-28 23:25   ` Michael Niedermayer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231228082105.31311-2-jinbo@loongson.cn \
    --to=jinbo@loongson.cn \
    --cc=ffmpeg-devel@ffmpeg.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git