From: "Rémi Denis-Courmont via ffmpeg-devel" <ffmpeg-devel@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Cc: "Rémi Denis-Courmont" <code@ffmpeg.org>
Subject: [FFmpeg-devel] [PATCH] RVV: h264_luma_dc_dequant_idct (PR #21048)
Date: Sat, 29 Nov 2025 20:59:50 -0000
Message-ID: <176444999129.39.8575696394420034983@2cb04c0e5124> (raw)
PR #21048 opened by Rémi Denis-Courmont (Courmisch)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21048
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21048.patch
From d81b88782e181bdee9599e0fac1ca62915dfb723 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi@remlab.net>
Date: Sat, 29 Nov 2025 17:46:55 +0200
Subject: [PATCH 1/2] lavc/h264idct: R-V V 8-bit h264_luma_dc_dequant_idct
This does not improve performance with current hardware due to the poor
performance of segmented accesses. Performance should be slightly better
with expensive or near-future hardware that I don't have; however, it is
still limited by two other factors:
- There are only 4 elements.
- The final stores are necessarily indexed and hit multiple cache lines,
thus as slow as scalar.
---
libavcodec/riscv/Makefile | 2 +-
libavcodec/riscv/h264dsp_init.c | 7 +-
libavcodec/riscv/h264idct_dequant_rvv.S | 86 +++++++++++++++++++++++++
3 files changed, 93 insertions(+), 2 deletions(-)
create mode 100644 libavcodec/riscv/h264idct_dequant_rvv.S
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 736f873fe8..3d2a2b4b6f 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -32,7 +32,7 @@ OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
- riscv/h264idct_rvv.o
+ riscv/h264idct_rvv.o riscv/h264idct_dequant_rvv.o
OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index f214486bbe..7ab8d38698 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -80,7 +80,8 @@ void ff_h264_idct4_add8_##depth##_rvv(uint8_t **d, const int *soffset, \
const uint8_t nnzc[5 * 8]); \
void ff_h264_idct4_add8_422_##depth##_rvv(uint8_t **d, const int *soffset, \
int16_t *s, int stride, \
- const uint8_t nnzc[5 * 8]);
+ const uint8_t nnzc[5 * 8]); \
+void ff_h264_luma_dc_dequant_idct_##depth##_rvv(int16_t *d, int16_t *s, int q);
IDCT_DEPTH(8)
IDCT_DEPTH(9)
@@ -174,6 +175,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_idct_add8 = ff_h264_idct4_add8_422_8_rvv;
# endif
}
+
+ dsp->h264_luma_dc_dequant_idct =
+ ff_h264_luma_dc_dequant_idct_8_rvv;
+
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv;
if (flags & AV_CPU_FLAG_RVB)
diff --git a/libavcodec/riscv/h264idct_dequant_rvv.S b/libavcodec/riscv/h264idct_dequant_rvv.S
new file mode 100644
index 0000000000..73a68a28ab
--- /dev/null
+++ b/libavcodec/riscv/h264idct_dequant_rvv.S
@@ -0,0 +1,86 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright © 2025 Rémi Denis-Courmont.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const offsets_8, 1
+ .short 0, 64, 256, 320 # byte offsets used as vsuxei16 indices by the function below
+endconst
+
+func ff_h264_luma_dc_dequant_idct_8_rvv, zve32x # a0 = output, a1 = input, a2 = multiplier (C prototype: d, s, q)
+ lpad 0
+ csrwi vxrm, 0 # fixed-point rounding mode 0 (round-to-nearest-up), used by vnclip below
+ vsetivli zero, 4, e16, mf2, ta, ma # VL = 4, 16-bit elements
+ vlseg4e16.v v8, (a1) # de-interleaving load: v8..v11 element i = input[4i+0..3]
+ vwadd.vv v16, v8, v9 # z0 = in[4i+0] + in[4i+1], widened to 32 bits
+ addi t1, sp, 4 * 4 * -3 # t1..t3 = rows 1..3 of the 64-byte scratch reserved below
+ vwadd.vv v19, v10, v11 # z3 = in[4i+2] + in[4i+3]
+ addi t2, sp, 4 * 4 * -2
+ vwsub.vv v17, v8, v9 # z1 = in[4i+0] - in[4i+1]
+ addi t3, sp, 4 * 4 * -1
+ vwsub.vv v18, v10, v11 # z2 = in[4i+2] - in[4i+3]
+ vsetvli zero, zero, e32, m1, ta, ma # keep VL, switch to 32-bit elements
+ vadd.vv v8, v16, v19 # z0 + z3
+ addi sp, sp, 4 * 4 * -4 # reserve 4x4 32-bit words of stack scratch
+ vsub.vv v9, v16, v19 # z0 - z3
+ vsub.vv v10, v17, v18 # z1 - z2
+ vadd.vv v11, v17, v18 # z1 + z2
+ vsseg4e32.v v8, (sp) # interleaving store of the first-pass results
+ vle32.v v8, (sp) # unit-stride reloads of rows 0..3: net effect is a 4x4 transpose
+ vle32.v v9, (t1)
+ vle32.v v10, (t2)
+ vle32.v v11, (t3)
+ vadd.vv v16, v8, v10 # z0 = row 0 + row 2
+ addi sp, sp, 4 * 4 * 4 # release the scratch (all rows already reloaded)
+ vadd.vv v19, v9, v11 # z3 = row 1 + row 3
+ lla t0, offsets_8 # address of the store offset table
+ vsub.vv v17, v8, v10 # z1 = row 0 - row 2
+ vsub.vv v18, v9, v11 # z2 = row 1 - row 3
+ vadd.vv v8, v16, v19 # z0 + z3
+ vadd.vv v9, v17, v18 # z1 + z2
+ vsub.vv v10, v17, v18 # z1 - z2
+ vsub.vv v11, v16, v19 # z0 - z3
+ vle16.v v24, (t0) # v24 = {0, 64, 256, 320} as 16-bit indices
+ vmul.vx v8, v8, a2 # scale each result by the multiplier in a2
+ vmul.vx v9, v9, a2
+ vmul.vx v10, v10, a2
+ vmul.vx v11, v11, a2
+ vsetvli zero, zero, e16, mf2, ta, ma # keep VL, back to 16-bit elements for narrowing
+ vnclip.wi v16, v8, 8 # rounding shift right by 8, narrowed with saturation to 16 bits
+ addi t1, a0, 2 * 16 * 1 # store bases: output + 32, + 128 and + 160 bytes
+ vnclip.wi v17, v9, 8
+ addi t2, a0, 2 * 16 * 4
+ vnclip.wi v18, v10, 8
+ addi t3, a0, 2 * 16 * 5
+ vnclip.wi v19, v11, 8
+ vsuxei16.v v16, (a0), v24 # indexed store: element i is written at base + v24[i]
+ vsuxei16.v v17, (t1), v24
+ vsuxei16.v v18, (t2), v24
+ vsuxei16.v v19, (t3), v24
+ ret
+endfunc
--
2.49.1
From 9e42a42b229365250316774269948835e736020d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi@remlab.net>
Date: Sat, 29 Nov 2025 22:51:01 +0200
Subject: [PATCH 2/2] lavc/h264idct: R-V V 9-bit h264_luma_dc_dequant_idct
Note that, like the C reference, the same function can be used for
larger bit depths.
---
libavcodec/riscv/h264dsp_init.c | 5 ++-
libavcodec/riscv/h264idct_dequant_rvv.S | 55 +++++++++++++++++++++++++
2 files changed, 59 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 7ab8d38698..06cb3c59de 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -189,8 +189,11 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
#define IDCT_DEPTH(depth) \
if (bit_depth == depth) { \
- if (zvl128b) \
+ if (zvl128b) { \
dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \
+ dsp->h264_luma_dc_dequant_idct = \
+ ff_h264_luma_dc_dequant_idct_9_rvv; \
+ } \
if (flags & AV_CPU_FLAG_RVB) \
dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \
if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \
diff --git a/libavcodec/riscv/h264idct_dequant_rvv.S b/libavcodec/riscv/h264idct_dequant_rvv.S
index 73a68a28ab..bc49ca6ad4 100644
--- a/libavcodec/riscv/h264idct_dequant_rvv.S
+++ b/libavcodec/riscv/h264idct_dequant_rvv.S
@@ -84,3 +84,58 @@ func ff_h264_luma_dc_dequant_idct_8_rvv, zve32x
vsuxei16.v v19, (t3), v24
ret
endfunc
+
+const offsets_9, 1
+ .short 0, 128, 512, 640 # byte offsets used as vsuxei16 indices by the function below
+endconst
+
+func ff_h264_luma_dc_dequant_idct_9_rvv, zve32x # a0 = output, a1 = input, a2 = multiplier (C prototype: d, s, q)
+ lpad 0
+ csrwi vxrm, 0 # fixed-point rounding mode 0 (round-to-nearest-up), used by vssra below
+ vsetivli zero, 4, e32, m1, ta, ma # VL = 4, 32-bit elements throughout
+ vlseg4e32.v v8, (a1) # de-interleaving load: v8..v11 element i = 32-bit input[4i+0..3]
+ vadd.vv v16, v8, v9 # z0 = in[4i+0] + in[4i+1]
+ addi t1, sp, 4 * 4 * -3 # t1..t3 = rows 1..3 of the 64-byte scratch reserved below
+ vadd.vv v19, v10, v11 # z3 = in[4i+2] + in[4i+3]
+ addi t2, sp, 4 * 4 * -2
+ vsub.vv v17, v8, v9 # z1 = in[4i+0] - in[4i+1]
+ addi t3, sp, 4 * 4 * -1
+ vsub.vv v18, v10, v11 # z2 = in[4i+2] - in[4i+3]
+ vadd.vv v8, v16, v19 # z0 + z3
+ addi sp, sp, 4 * 4 * -4 # reserve 4x4 32-bit words of stack scratch
+ vsub.vv v9, v16, v19 # z0 - z3
+ vsub.vv v10, v17, v18 # z1 - z2
+ vadd.vv v11, v17, v18 # z1 + z2
+ vsseg4e32.v v8, (sp) # interleaving store of the first-pass results
+ vle32.v v8, (sp) # unit-stride reloads of rows 0..3: net effect is a 4x4 transpose
+ vle32.v v9, (t1)
+ vle32.v v10, (t2)
+ vle32.v v11, (t3)
+ vadd.vv v16, v8, v10 # z0 = row 0 + row 2
+ addi sp, sp, 4 * 4 * 4 # release the scratch (all rows already reloaded)
+ vadd.vv v19, v9, v11 # z3 = row 1 + row 3
+ lla t0, offsets_9 # address of the store offset table
+ vsub.vv v17, v8, v10 # z1 = row 0 - row 2
+ vsub.vv v18, v9, v11 # z2 = row 1 - row 3
+ vadd.vv v8, v16, v19 # z0 + z3
+ vadd.vv v9, v17, v18 # z1 + z2
+ vsub.vv v10, v17, v18 # z1 - z2
+ vsub.vv v11, v16, v19 # z0 - z3
+ vle16.v v24, (t0) # v24 = {0, 128, 512, 640} as 16-bit indices
+ vmul.vx v8, v8, a2 # scale each result by the multiplier in a2
+ vmul.vx v9, v9, a2
+ vmul.vx v10, v10, a2
+ vmul.vx v11, v11, a2
+ vssra.vi v16, v8, 8 # rounding arithmetic shift right by 8, result stays 32-bit
+ addi t1, a0, 4 * 16 * 1 # store bases: output + 64, + 256 and + 320 bytes
+ vssra.vi v17, v9, 8
+ addi t2, a0, 4 * 16 * 4
+ vssra.vi v18, v10, 8
+ addi t3, a0, 4 * 16 * 5
+ vssra.vi v19, v11, 8
+ vsuxei16.v v16, (a0), v24 # indexed store of 32-bit elements at base + v24[i]
+ vsuxei16.v v17, (t1), v24
+ vsuxei16.v v18, (t2), v24
+ vsuxei16.v v19, (t3), v24
+ ret
+endfunc
--
2.49.1
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org
reply other threads:[~2025-11-29 21:02 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=176444999129.39.8575696394420034983@2cb04c0e5124 \
--to=ffmpeg-devel@ffmpeg.org \
--cc=code@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git