* [FFmpeg-devel] [PATCH] avcodec/riscv: add h264 dc idct rvv
@ 2024-06-10 6:22 J. Dekker
2024-06-10 15:03 ` Rémi Denis-Courmont
0 siblings, 1 reply; 2+ messages in thread
From: J. Dekker @ 2024-06-10 6:22 UTC (permalink / raw)
To: ffmpeg-devel
checkasm: bench runs 131072 (1 << 17)
h264_idct4_add_dc_8bpp_c: 1.5
h264_idct4_add_dc_8bpp_rvv_i64: 0.7
h264_idct4_add_dc_9bpp_c: 1.5
h264_idct4_add_dc_9bpp_rvv_i64: 0.7
h264_idct4_add_dc_10bpp_c: 1.5
h264_idct4_add_dc_10bpp_rvv_i64: 0.7
h264_idct4_add_dc_12bpp_c: 1.2
h264_idct4_add_dc_12bpp_rvv_i64: 0.7
h264_idct4_add_dc_14bpp_c: 1.2
h264_idct4_add_dc_14bpp_rvv_i64: 0.7
h264_idct8_add_dc_8bpp_c: 5.2
h264_idct8_add_dc_8bpp_rvv_i64: 1.5
h264_idct8_add_dc_9bpp_c: 5.5
h264_idct8_add_dc_9bpp_rvv_i64: 1.2
h264_idct8_add_dc_10bpp_c: 5.5
h264_idct8_add_dc_10bpp_rvv_i64: 1.2
h264_idct8_add_dc_12bpp_c: 4.2
h264_idct8_add_dc_12bpp_rvv_i64: 1.2
h264_idct8_add_dc_14bpp_c: 4.2
h264_idct8_add_dc_14bpp_rvv_i64: 1.2
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/riscv/Makefile | 1 +
libavcodec/riscv/h264dsp_init.c | 40 +++++++++-
libavcodec/riscv/h264dsp_rvv.S | 129 ++++++++++++++++++++++++++++++++
3 files changed, 167 insertions(+), 3 deletions(-)
create mode 100644 libavcodec/riscv/h264dsp_rvv.S
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 590655f829..cb63631de4 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
+RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index dbbf3db400..fe3caf686d 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
* Copyright © 2024 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
@@ -24,22 +25,55 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
#include "libavcodec/h264dsp.h"
extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
+void ff_h264_idct4_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct4_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct4_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct4_dc_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct4_dc_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
-av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
+av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
#if HAVE_RV
int flags = av_get_cpu_flags();
if (flags & AV_CPU_FLAG_RVB_BASIC)
- dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
+ c->startcode_find_candidate = ff_startcode_find_candidate_rvb;
# if HAVE_RVV
if (flags & AV_CPU_FLAG_RVV_I32)
- dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
+ c->startcode_find_candidate = ff_startcode_find_candidate_rvv;
# endif
+ if ((flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) {
+ if (bit_depth == 8) {
+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
+ }
+ if (bit_depth == 9) {
+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
+ }
+ if (bit_depth == 10) {
+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
+ }
+ if (bit_depth == 12) {
+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
+ }
+ if (bit_depth == 14) {
+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
+ }
+ }
#endif
}
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
new file mode 100644
index 0000000000..c392403add
--- /dev/null
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -0,0 +1,129 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro clip min, max, regs:vararg
+.irp x, \regs
+ vmax.vx \x, \x, \min
+.endr
+.irp x, \regs
+ vmin.vx \x, \x, \max
+.endr
+.endm
+
+.macro idct_dc_add depth
+func ff_h264_idct_dc_add_\depth\()_rvv, zve64x, zbb
+.if \depth == 8
+ lh a3, 0(a1)
+.else
+ lw a3, 0(a1)
+.endif
+ addi a3, a3, 32
+ mv t1, a4 // lines = cols
+ srai a3, a3, 6
+.if \depth == 8
+ sh zero, 0(a1)
+.else
+ sw zero, 0(a1)
+.endif
+1: add t4, a0, a2 // &src[line * 1]
+ sh1add t5, a2, a0 // &src[line * 2]
+ sh1add t6, a2, t4 // &src[line * 3]
+.if \depth == 8
+ vsetvli zero, a4, e8, mf2, ta, ma
+ vle8.v v0, (a0)
+ vle8.v v1, (t4)
+ vle8.v v2, (t5)
+ vle8.v v3, (t6)
+ vwcvtu.x.x.v v8, v0
+ vwcvtu.x.x.v v9, v1
+ vwcvtu.x.x.v v10, v2
+ vwcvtu.x.x.v v11, v3
+ vsetvli zero, a4, e16, m1, ta, ma
+.else
+ vsetvli zero, a4, e16, m1, ta, ma
+ vle16.v v8, (a0)
+ vle16.v v9, (t4)
+ vle16.v v10, (t5)
+ vle16.v v11, (t6)
+.endif
+ vadd.vx v8, v8, a3
+ vadd.vx v9, v9, a3
+ vadd.vx v10, v10, a3
+ vadd.vx v11, v11, a3
+ clip zero, a5, v8, v9, v10, v11
+ addi t1, t1, -4
+.if \depth == 8
+ vsetvli zero, a4, e8, mf2, ta, ma
+ vncvt.x.x.w v0, v8
+ vncvt.x.x.w v1, v9
+ vncvt.x.x.w v2, v10
+ vncvt.x.x.w v3, v11
+ vse8.v v0, (a0)
+ vse8.v v1, (t4)
+ vse8.v v2, (t5)
+ vse8.v v3, (t6)
+.else
+ vse16.v v8, (a0)
+ vse16.v v9, (t4)
+ vse16.v v10, (t5)
+ vse16.v v11, (t6)
+.endif
+ sh2add a0, a2, a0 // src += stride * 4
+ bnez t1, 1b
+ ret
+endfunc
+.endm
+
+idct_dc_add 8
+idct_dc_add 16
+
+func ff_h264_idct4_dc_add_8_rvv, zve64x
+ li a4, 4
+ li a5, 255
+ j ff_h264_idct_dc_add_8_rvv
+endfunc
+func ff_h264_idct8_dc_add_8_rvv, zve64x
+ li a4, 8
+ li a5, 255
+ j ff_h264_idct_dc_add_8_rvv
+endfunc
+
+.irp depth,9,10,12,14
+func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
+ li a4, 4
+ li a5, (1 << \depth) - 1
+ j ff_h264_idct_dc_add_16_rvv
+endfunc
+
+func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
+ li a4, 8
+ li a5, (1 << \depth) - 1
+ j ff_h264_idct_dc_add_16_rvv
+endfunc
+.endr
--
2.45.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [FFmpeg-devel] [PATCH] avcodec/riscv: add h264 dc idct rvv
2024-06-10 6:22 [FFmpeg-devel] [PATCH] avcodec/riscv: add h264 dc idct rvv J. Dekker
@ 2024-06-10 15:03 ` Rémi Denis-Courmont
0 siblings, 0 replies; 2+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-10 15:03 UTC (permalink / raw)
To: ffmpeg-devel
Le maanantaina 10. kesäkuuta 2024, 9.22.14 EEST J. Dekker a écrit :
> checkasm: bench runs 131072 (1 << 17)
> h264_idct4_add_dc_8bpp_c: 1.5
> h264_idct4_add_dc_8bpp_rvv_i64: 0.7
RDCYCLE would exhibit larger and more meaningful numbers here.
> h264_idct4_add_dc_9bpp_c: 1.5
> h264_idct4_add_dc_9bpp_rvv_i64: 0.7
> h264_idct4_add_dc_10bpp_c: 1.5
> h264_idct4_add_dc_10bpp_rvv_i64: 0.7
> h264_idct4_add_dc_12bpp_c: 1.2
> h264_idct4_add_dc_12bpp_rvv_i64: 0.7
> h264_idct4_add_dc_14bpp_c: 1.2
> h264_idct4_add_dc_14bpp_rvv_i64: 0.7
> h264_idct8_add_dc_8bpp_c: 5.2
> h264_idct8_add_dc_8bpp_rvv_i64: 1.5
> h264_idct8_add_dc_9bpp_c: 5.5
> h264_idct8_add_dc_9bpp_rvv_i64: 1.2
> h264_idct8_add_dc_10bpp_c: 5.5
> h264_idct8_add_dc_10bpp_rvv_i64: 1.2
> h264_idct8_add_dc_12bpp_c: 4.2
> h264_idct8_add_dc_12bpp_rvv_i64: 1.2
> h264_idct8_add_dc_14bpp_c: 4.2
> h264_idct8_add_dc_14bpp_rvv_i64: 1.2
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/riscv/Makefile | 1 +
> libavcodec/riscv/h264dsp_init.c | 40 +++++++++-
> libavcodec/riscv/h264dsp_rvv.S | 129 ++++++++++++++++++++++++++++++++
> 3 files changed, 167 insertions(+), 3 deletions(-)
> create mode 100644 libavcodec/riscv/h264dsp_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 590655f829..cb63631de4 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
> OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
> OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
> +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
> OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
> RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
> OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
> diff --git a/libavcodec/riscv/h264dsp_init.c
> b/libavcodec/riscv/h264dsp_init.c index dbbf3db400..fe3caf686d 100644
> --- a/libavcodec/riscv/h264dsp_init.c
> +++ b/libavcodec/riscv/h264dsp_init.c
> @@ -1,4 +1,5 @@
> /*
> + * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
> * Copyright © 2024 Rémi Denis-Courmont.
> *
> * This file is part of FFmpeg.
> @@ -24,22 +25,55 @@
>
> #include "libavutil/attributes.h"
> #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> #include "libavcodec/h264dsp.h"
>
> extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
> extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
> +void ff_h264_idct4_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_10_rvv(uint8_t *dst, int16_t *block, int
> stride); +void ff_h264_idct4_dc_add_12_rvv(uint8_t *dst, int16_t *block,
> int stride); +void ff_h264_idct8_dc_add_12_rvv(uint8_t *dst, int16_t
> *block, int stride); +void ff_h264_idct4_dc_add_14_rvv(uint8_t *dst,
> int16_t *block, int stride); +void ff_h264_idct8_dc_add_14_rvv(uint8_t
> *dst, int16_t *block, int stride);
>
> -av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int
> bit_depth, +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int
> bit_depth, const int chroma_format_idc)
> {
> #if HAVE_RV
> int flags = av_get_cpu_flags();
>
> if (flags & AV_CPU_FLAG_RVB_BASIC)
> - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> + c->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> # if HAVE_RVV
> if (flags & AV_CPU_FLAG_RVV_I32)
> - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> + c->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> # endif
> + if ((flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) {
ff_rv_vlen_least()
> + if (bit_depth == 8) {
switch()
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
> + }
> + if (bit_depth == 9) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
> + }
> + if (bit_depth == 10) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
> + }
> + if (bit_depth == 12) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
> + }
> + if (bit_depth == 14) {
> + c->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv;
> + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
> + }
> + }
> #endif
> }
> diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
> new file mode 100644
> index 0000000000..c392403add
> --- /dev/null
> +++ b/libavcodec/riscv/h264dsp_rvv.S
> @@ -0,0 +1,129 @@
> +/*
> + * SPDX-License-Identifier: BSD-2-Clause
> + *
> + * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
> IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
> TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
> PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY
> THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT
> (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE
> OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro clip min, max, regs:vararg
> +.irp x, \regs
> + vmax.vx \x, \x, \min
> +.endr
> +.irp x, \regs
> + vmin.vx \x, \x, \max
> +.endr
> +.endm
In my experience, single use macros juse make the code harder to follow and
hide possible optimisations (indeed, this is the case here).
> +
> +.macro idct_dc_add depth
> +func ff_h264_idct_dc_add_\depth\()_rvv, zve64x, zbb
In this particular case, it seems to me that sharing the code between 8-bit
and 16-bit is counter-productive.
> +.if \depth == 8
> + lh a3, 0(a1)
> +.else
> + lw a3, 0(a1)
> +.endif
> + addi a3, a3, 32
> + mv t1, a4 // lines = cols
> + srai a3, a3, 6
> +.if \depth == 8
> + sh zero, 0(a1)
> +.else
> + sw zero, 0(a1)
> +.endif
> +1: add t4, a0, a2 // &src[line * 1]
> + sh1add t5, a2, a0 // &src[line * 2]
> + sh1add t6, a2, t4 // &src[line * 3]
> +.if \depth == 8
> + vsetvli zero, a4, e8, mf2, ta, ma
> + vle8.v v0, (a0)
> + vle8.v v1, (t4)
> + vle8.v v2, (t5)
> + vle8.v v3, (t6)
I doubt that you'll gain anything from unrolling 4x here. Maybe 2x will help
amortise consecutive latencies, but that's pretty much it.
> + vwcvtu.x.x.v v8, v0
> + vwcvtu.x.x.v v9, v1
> + vwcvtu.x.x.v v10, v2
> + vwcvtu.x.x.v v11, v3
As a general note, when up-converting integers, you have to choose between
VWCVT{,U} and V{S,Z}EXT. Here, VZEXT fits better because you could stick with
an element size of 16 bits.
> + vsetvli zero, a4, e16, m1, ta, ma
> +.else
> + vsetvli zero, a4, e16, m1, ta, ma
This resets the same type and length every time. Just set them once in the
calling code then leave VL and VTYPE alone.
> + vle16.v v8, (a0)
> + vle16.v v9, (t4)
> + vle16.v v10, (t5)
> + vle16.v v11, (t6)
> +.endif
> + vadd.vx v8, v8, a3
> + vadd.vx v9, v9, a3
> + vadd.vx v10, v10, a3
> + vadd.vx v11, v11, a3
In the 8-bit case, you could add -128 to a3. Effectively this converts the
samples to signed 8-bit.
> + clip zero, a5, v8, v9, v10, v11
> + addi t1, t1, -4
> +.if \depth == 8
> + vsetvli zero, a4, e8, mf2, ta, ma
It is idiomatic to use zero as source here.
> + vncvt.x.x.w v0, v8
> + vncvt.x.x.w v1, v9
> + vncvt.x.x.w v2, v10
> + vncvt.x.x.w v3, v11
If you added -128 above, then you can use VNCLIP.WI to clip *and* narrow,
eliminating VMAX, VMIN and VNCVT. However you need to add a VXOR.VX to flip the
sign bit back.
Alternatively, replace VMIN and VNCT with VNCLIPU.WI.
Either way, there won't much code left to share between 8-bit and HDR...
> + vse8.v v0, (a0)
> + vse8.v v1, (t4)
> + vse8.v v2, (t5)
> + vse8.v v3, (t6)
> +.else
> + vse16.v v8, (a0)
> + vse16.v v9, (t4)
> + vse16.v v10, (t5)
> + vse16.v v11, (t6)
> +.endif
> + sh2add a0, a2, a0 // src += stride * 4
> + bnez t1, 1b
> + ret
> +endfunc
> +.endm
> +
> +idct_dc_add 8
> +idct_dc_add 16
> +
> +func ff_h264_idct4_dc_add_8_rvv, zve64x
> + li a4, 4
> + li a5, 255
Using VNCLIP{,U} should make this redundant.
> + j ff_h264_idct_dc_add_8_rvv
> +endfunc
> +func ff_h264_idct8_dc_add_8_rvv, zve64x
> + li a4, 8
> + li a5, 255
> + j ff_h264_idct_dc_add_8_rvv
> +endfunc
> +
> +.irp depth,9,10,12,14
> +func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
VSETIVLI here. And while at it, pick 1/2 group multiplier which should be
enough for 4 elements per vector.
That trick won't work for 8-bit, and that's another reason not to share the
code between bit-widths.
> + li a4, 4
> + li a5, (1 << \depth) - 1
> + j ff_h264_idct_dc_add_16_rvv
> +endfunc
> +
> +func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
> + li a4, 8
> + li a5, (1 << \depth) - 1
> + j ff_h264_idct_dc_add_16_rvv
> +endfunc
> +.endr
Last but not least, in the case of 32-bit or 64-bit rows, and if they are
suitably aligned, it is probably faster to pack all values contiguously in
vector groups using strided loads and stores. Thus all looping and unrolling
would be eliminated.
That trick does not work for 128-bit rows unfortunately, and it also will not
work with more complex functions involving calculations between rows.
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2024-06-10 15:03 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-10 6:22 [FFmpeg-devel] [PATCH] avcodec/riscv: add h264 dc idct rvv J. Dekker
2024-06-10 15:03 ` Rémi Denis-Courmont
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git