* [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Arnie Chang @ 2023-05-17 7:13 UTC
To: ffmpeg-devel; +Cc: Arnie Chang
Optimize the put and avg filtering for 8x8 chroma blocks
Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
libavcodec/h264chroma.c | 2 +
libavcodec/h264chroma.h | 1 +
libavcodec/riscv/Makefile | 3 +
libavcodec/riscv/h264_chroma_init_riscv.c | 39 ++
libavcodec/riscv/h264_mc_chroma.S | 492 ++++++++++++++++++++++
libavcodec/riscv/h264_mc_chroma.h | 34 ++
6 files changed, 571 insertions(+)
create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
create mode 100644 libavcodec/riscv/h264_mc_chroma.S
create mode 100644 libavcodec/riscv/h264_mc_chroma.h
diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index 60b86b6fba..1eeab7bc40 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
ff_h264chroma_init_mips(c, bit_depth);
#elif ARCH_LOONGARCH64
ff_h264chroma_init_loongarch(c, bit_depth);
+#elif ARCH_RISCV
+ ff_h264chroma_init_riscv(c, bit_depth);
#endif
}
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index b8f9c8f4fc..9c81c18a76 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth);
#endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 965942f4df..08b76c93cb 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -19,3 +19,6 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
+
+OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
+RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
new file mode 100644
index 0000000000..b6f98ba693
--- /dev/null
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/h264chroma.h"
+#include "config.h"
+#include "h264_mc_chroma.h"
+
+av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
+{
+#if HAVE_RVV
+ const int high_bit_depth = bit_depth > 8;
+
+ if (!high_bit_depth) {
+ c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
+ c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+ }
+#endif
+}
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
new file mode 100644
index 0000000000..a02866f633
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.S
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+ .text
+
+ .globl h264_put_chroma_mc8_rvv
+ .p2align 1
+ .type h264_put_chroma_mc8_rvv,@function
+h264_put_chroma_mc8_rvv:
+ slliw t2, a5, 3
+ mulw t1, a5, a4
+ sh3add a5, a4, t2
+ slliw a4, a4, 3
+ subw a5, t1, a5
+ subw a7, a4, t1
+ addiw a6, a5, 64
+ subw t0, t2, t1
+ vsetivli t3, 8, e8, m1, ta, mu
+ beqz t1, .LBB0_4
+ blez a3, .LBB0_17
+ li t4, 0
+ li t2, 0
+ addi a5, t3, 1
+ slli t3, a2, 2
+.LBB0_3: # if (xy != 0)
+ add a4, a1, t4
+ vsetvli zero, a5, e8, m1, ta, ma
+ addiw t2, t2, 4
+ vle8.v v10, (a4)
+ add a4, a4, a2
+ vslidedown.vi v11, v10, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v8, v10, a6
+ vwmaccu.vx v8, a7, v11
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v12, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v8, t0, v12
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslidedown.vi v13, v12, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v10, v12, a6
+ vwmaccu.vx v8, t1, v13
+ vwmaccu.vx v10, a7, v13
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v10, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslidedown.vi v15, v14, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v12, v14, a6
+ vwmaccu.vx v10, t1, v15
+ vwmaccu.vx v12, a7, v15
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v12, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslidedown.vi v15, v14, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v16, v14, a6
+ vwmaccu.vx v12, t1, v15
+ vwmaccu.vx v16, a7, v15
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a0, t4
+ add t4, t4, t3
+ vwmaccu.vx v16, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslidedown.vi v14, v14, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vnclipu.wi v15, v8, 6
+ vwmaccu.vx v16, t1, v14
+ vse8.v v15, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v10, 6
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v12, 6
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v16, 6
+ vse8.v v8, (a4)
+ blt t2, a3, .LBB0_3
+ j .LBB0_17
+.LBB0_4:
+ bnez a4, .LBB0_9
+ beqz t2, .LBB0_9
+ blez a3, .LBB0_17
+ li a4, 0
+ li t1, 0
+ slli a7, a2, 2
+.LBB0_8: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
+ add a5, a1, a4
+ vsetvli zero, zero, e8, m1, ta, ma
+ addiw t1, t1, 4
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ add t2, a5, a2
+ vwmulu.vx v10, v8, a6
+ vle8.v v8, (a5)
+ vwmulu.vx v12, v8, a6
+ vle8.v v9, (t2)
+ add t2, t2, a2
+ add a5, t2, a2
+ vwmaccu.vx v10, t0, v8
+ vle8.v v8, (t2)
+ vle8.v v14, (a5)
+ add a5, a0, a4
+ add a4, a4, a7
+ vwmaccu.vx v12, t0, v9
+ vnclipu.wi v15, v10, 6
+ vwmulu.vx v10, v9, a6
+ vse8.v v15, (a5)
+ add a5, a5, a2
+ vnclipu.wi v9, v12, 6
+ vwmaccu.vx v10, t0, v8
+ vwmulu.vx v12, v8, a6
+ vse8.v v9, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vwmaccu.vx v12, t0, v14
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v12, 6
+ vse8.v v8, (a5)
+ blt t1, a3, .LBB0_8
+ j .LBB0_17
+.LBB0_9:
+ beqz a4, .LBB0_14
+ bnez t2, .LBB0_14
+ blez a3, .LBB0_17
+ li a4, 0
+ li t2, 0
+ addi t0, t3, 1
+ slli t1, a2, 2
+.LBB0_13: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
+ add a5, a1, a4
+ vsetvli zero, t0, e8, m1, ta, ma
+ addiw t2, t2, 4
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ vslidedown.vi v9, v8, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v10, v8, a6
+ vwmaccu.vx v10, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ vslidedown.vi v9, v8, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v12, v8, a6
+ vwmaccu.vx v12, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ vslidedown.vi v9, v8, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v14, v8, a6
+ vwmaccu.vx v14, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a5)
+ add a5, a0, a4
+ add a4, a4, t1
+ vslidedown.vi v9, v8, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vnclipu.wi v16, v10, 6
+ vse8.v v16, (a5)
+ add a5, a5, a2
+ vnclipu.wi v10, v12, 6
+ vwmulu.vx v12, v8, a6
+ vse8.v v10, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v14, 6
+ vwmaccu.vx v12, a7, v9
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v12, 6
+ vse8.v v8, (a5)
+ blt t2, a3, .LBB0_13
+ j .LBB0_17
+.LBB0_14:
+ blez a3, .LBB0_17
+ li a4, 0
+ li t2, 0
+ slli a7, a2, 2
+.LBB0_16: # the final else, none of the above conditions are met
+ add t0, a1, a4
+ vsetvli zero, zero, e8, m1, ta, ma
+ add a5, a0, a4
+ add a4, a4, a7
+ addiw t2, t2, 4
+ vle8.v v8, (t0)
+ add t0, t0, a2
+ add t1, t0, a2
+ vwmulu.vx v10, v8, a6
+ vle8.v v8, (t0)
+ add t0, t1, a2
+ vle8.v v9, (t1)
+ vle8.v v12, (t0)
+ vnclipu.wi v13, v10, 6
+ vwmulu.vx v10, v8, a6
+ vse8.v v13, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vwmulu.vx v10, v9, a6
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vwmulu.vx v10, v12, a6
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vse8.v v8, (a5)
+ blt t2, a3, .LBB0_16
+.LBB0_17: # Exit h264_put_chroma_mc8_rvv
+ ret
+.Lfunc_end0:
+ .size h264_put_chroma_mc8_rvv, .Lfunc_end0-h264_put_chroma_mc8_rvv
+
+ .globl h264_avg_chroma_mc8_rvv
+ .p2align 1
+ .type h264_avg_chroma_mc8_rvv,@function
+h264_avg_chroma_mc8_rvv:
+ slliw t2, a5, 3
+ mulw t1, a5, a4
+ sh3add a5, a4, t2
+ slliw a4, a4, 3
+ subw a5, t1, a5
+ subw a7, a4, t1
+ addiw a6, a5, 64
+ subw t0, t2, t1
+ vsetivli t3, 8, e8, m1, ta, mu
+ beqz t1, .LBB1_4
+ blez a3, .LBB1_17
+ li t4, 0
+ li t2, 0
+ addi a5, t3, 1
+ slli t3, a2, 2
+.LBB1_3: # if (xy != 0)
+ add a4, a1, t4
+ vsetvli zero, a5, e8, m1, ta, ma
+ addiw t2, t2, 4
+ vle8.v v10, (a4)
+ add a4, a4, a2
+ vslidedown.vi v11, v10, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v8, v10, a6
+ vwmaccu.vx v8, a7, v11
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v12, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v8, t0, v12
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslidedown.vi v13, v12, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v10, v12, a6
+ vwmaccu.vx v8, t1, v13
+ vwmaccu.vx v10, a7, v13
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v10, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslidedown.vi v15, v14, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v12, v14, a6
+ vwmaccu.vx v10, t1, v15
+ vwmaccu.vx v12, a7, v15
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v12, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslidedown.vi v15, v14, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v16, v14, a6
+ vwmaccu.vx v12, t1, v15
+ vwmaccu.vx v16, a7, v15
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a0, t4
+ add t4, t4, t3
+ vwmaccu.vx v16, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslidedown.vi v14, v14, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vnclipu.wi v15, v8, 6
+ vle8.v v8, (a4)
+ vwmaccu.vx v16, t1, v14
+ vaaddu.vv v8, v15, v8
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v10, 6
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v12, 6
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v16, 6
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ vse8.v v8, (a4)
+ blt t2, a3, .LBB1_3
+ j .LBB1_17
+.LBB1_4:
+ bnez a4, .LBB1_9
+ beqz t2, .LBB1_9
+ blez a3, .LBB1_17
+ li t2, 0
+ li t1, 0
+ slli a7, a2, 2
+.LBB1_8: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
+ add a4, a1, t2
+ vsetvli zero, zero, e8, m1, ta, ma
+ addiw t1, t1, 4
+ vle8.v v8, (a4)
+ add a4, a4, a2
+ vwmulu.vx v10, v8, a6
+ vle8.v v8, (a4)
+ add a4, a4, a2
+ add a5, a4, a2
+ vle8.v v9, (a4)
+ add a4, a5, a2
+ vle8.v v12, (a5)
+ vwmaccu.vx v10, t0, v8
+ vle8.v v13, (a4)
+ add a4, a0, t2
+ add t2, t2, a7
+ vnclipu.wi v14, v10, 6
+ vwmulu.vx v10, v8, a6
+ vle8.v v8, (a4)
+ vaaddu.vv v8, v14, v8
+ vwmaccu.vx v10, t0, v9
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v10, 6
+ vwmulu.vx v10, v9, a6
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ vwmaccu.vx v10, t0, v12
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v10, 6
+ vwmulu.vx v10, v12, a6
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ vwmaccu.vx v10, t0, v13
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v10, 6
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ vse8.v v8, (a4)
+ blt t1, a3, .LBB1_8
+ j .LBB1_17
+.LBB1_9:
+ beqz a4, .LBB1_14
+ bnez t2, .LBB1_14
+ blez a3, .LBB1_17
+ li a5, 0
+ li t2, 0
+ addi t0, t3, 1
+ slli t1, a2, 2
+.LBB1_13: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
+ add a4, a1, a5
+ vsetvli zero, t0, e8, m1, ta, ma
+ addiw t2, t2, 4
+ vle8.v v8, (a4)
+ add a4, a4, a2
+ vslidedown.vi v9, v8, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v10, v8, a6
+ vwmaccu.vx v10, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a4)
+ add a4, a4, a2
+ vslidedown.vi v9, v8, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v12, v8, a6
+ vwmaccu.vx v12, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a4)
+ add a4, a4, a2
+ vslidedown.vi v9, v8, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v14, v8, a6
+ vwmaccu.vx v14, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a4)
+ add a4, a0, a5
+ add a5, a5, t1
+ vslidedown.vi v9, v8, 1
+ vsetivli zero, 8, e8, m1, ta, ma
+ vnclipu.wi v16, v10, 6
+ vle8.v v10, (a4)
+ vaaddu.vv v10, v16, v10
+ vse8.v v10, (a4)
+ add a4, a4, a2
+ vnclipu.wi v10, v12, 6
+ vle8.v v11, (a4)
+ vwmulu.vx v12, v8, a6
+ vaaddu.vv v10, v10, v11
+ vwmaccu.vx v12, a7, v9
+ vse8.v v10, (a4)
+ add a4, a4, a2
+ vnclipu.wi v10, v14, 6
+ vle8.v v8, (a4)
+ vaaddu.vv v8, v10, v8
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v12, 6
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ vse8.v v8, (a4)
+ blt t2, a3, .LBB1_13
+ j .LBB1_17
+.LBB1_14:
+ blez a3, .LBB1_17
+ li a4, 0
+ li t0, 0
+ slli a7, a2, 2
+.LBB1_16: # the final else, none of the above conditions are met
+ add a5, a1, a4
+ vsetvli zero, zero, e8, m1, ta, ma
+ addiw t0, t0, 4
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ add t1, a5, a2
+ vwmulu.vx v10, v8, a6
+ vle8.v v8, (a5)
+ add a5, t1, a2
+ vle8.v v9, (t1)
+ vle8.v v12, (a5)
+ add a5, a0, a4
+ add a4, a4, a7
+ vnclipu.wi v13, v10, 6
+ vle8.v v10, (a5)
+ vwmulu.vx v14, v8, a6
+ vaaddu.vv v10, v13, v10
+ vse8.v v10, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v14, 6
+ vle8.v v10, (a5)
+ vaaddu.vv v8, v8, v10
+ vwmulu.vx v10, v9, a6
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vle8.v v9, (a5)
+ vwmulu.vx v10, v12, a6
+ vaaddu.vv v8, v8, v9
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vle8.v v9, (a5)
+ vaaddu.vv v8, v8, v9
+ vse8.v v8, (a5)
+ blt t0, a3, .LBB1_16
+.LBB1_17: # Exit h264_avg_chroma_mc8_rvv
+ ret
+.Lfunc_end1:
+ .size h264_avg_chroma_mc8_rvv, .Lfunc_end1-h264_avg_chroma_mc8_rvv
diff --git a/libavcodec/riscv/h264_mc_chroma.h b/libavcodec/riscv/h264_mc_chroma.h
new file mode 100644
index 0000000000..cb350d0e4a
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
+#define AVCODEC_RISCV_H264_MC_CHROMA_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_RVV
+void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+#endif
+#endif
\ No newline at end of file
--
2.17.1
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Lynne @ 2023-05-17 14:54 UTC
To: FFmpeg development discussions and patches
May 17, 2023, 09:13 by arnie.chang@sifive.com:
> Optimize the put and avg filtering for 8x8 chroma blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> libavcodec/h264chroma.c | 2 +
> libavcodec/h264chroma.h | 1 +
> libavcodec/riscv/Makefile | 3 +
> libavcodec/riscv/h264_chroma_init_riscv.c | 39 ++
> libavcodec/riscv/h264_mc_chroma.S | 492 ++++++++++++++++++++++
> libavcodec/riscv/h264_mc_chroma.h | 34 ++
> 6 files changed, 571 insertions(+)
> create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
> create mode 100644 libavcodec/riscv/h264_mc_chroma.S
> create mode 100644 libavcodec/riscv/h264_mc_chroma.h
>
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> + const int high_bit_depth = bit_depth > 8;
>
You don't need this constant.
> +
> + if (!high_bit_depth) {
> + c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> + c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> + }
> +#endif
>
You have to check if RVV is supported:
> int flags = av_get_cpu_flags();
>
> if (flags & AV_CPU_FLAG_RVV_F32) {
> if (bit_depth > 8) {
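An untested sketch of what I mean, assuming AV_CPU_FLAG_RVV_I32 is the right
flag to gate on (your routines only use integer vector instructions):

av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
{
#if HAVE_RVV
    int flags = av_get_cpu_flags();

    /* 8-bit only, and only when the CPU actually reports RVV support */
    if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
    }
#endif
}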
> + .text
> +
> + .globl h264_put_chroma_mc8_rvv
> + .p2align 1
> + .type h264_put_chroma_mc8_rvv,@function
> +h264_put_chroma_mc8_rvv:
>
You don't need any of this. We already have macros to
handle this - take a look at libavcodec/riscv/opusdsp_rvv.S:
> func ff_opus_postfilter_rvv_256, zve32f
> lvtypei a5, e32, m1, ta, ma // function instructions start here
Make sure to change zve32f to whatever instruction set extension you actually
use, so that the assembler is initialized to handle it.
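For this file that would presumably be zve32x, since you only use integer
vector instructions; a minimal sketch:

func h264_put_chroma_mc8_rvv, zve32x
        # function body goes here; the macro emits .globl/.type/.size for you
endfunc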
> + slliw t2, a5, 3
> + mulw t1, a5, a4
> + sh3add a5, a4, t2
> + slliw a4, a4, 3
> + subw a5, t1, a5
> + subw a7, a4, t1
> + addiw a6, a5, 64
> + subw t0, t2, t1
>
Coding style issue - we style our RISC-V assembly the same way
we style our AArch64 assembly:
<8 spaces><instruction><spaces until the 24th character on the line><arguments, registers, etc.>
For example:
> vsetvl zero, a4, a5
> lw t2, 20(a1)
> vfmul.vv v8, v24, v16
> addi a0, a0, 4
> vslide1down.vx v16, v16, t2
> MACRO arg1, arg2
> +.LBB0_8: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> + add a5, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t1, t1, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + add t2, a5, a2
> + vwmulu.vx v10, v8, a6
>
This branch looks very similar to
> .LBB1_16: # the final else, none of the above conditions are met
> add a5, a1, a4
> vsetvli zero, zero, e8, m1, ta, ma
> addiw t0, t0, 4
> vle8.v v8, (a5)
> add a5, a5, a2
> add t1, a5, a2
> vwmulu.vx v10, v8, a6
Consider using a macro.
In fact, a lot of the branches look similar to each other. Looking at other
implementations, they only consider 3 possible variants, the same ones
that the C function has.
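Something along these lines would cut the duplication (untested; the
coefficient registers follow your current code):

.macro chroma_mc_row vdst, vsrc, vnext
        vwmulu.vx       \vdst, \vsrc, a6        # main-tap coefficient
        vwmaccu.vx      \vdst, a7, \vnext       # neighbour-tap coefficient
.endm

so each unrolled row becomes one macro invocation instead of a copy of the
same instruction pair.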
> + .size h264_avg_chroma_mc8_rvv, .Lfunc_end1-h264_avg_chroma_mc8_rvv
> diff --git a/libavcodec/riscv/h264_mc_chroma.h b/libavcodec/riscv/h264_mc_chroma.h
> new file mode 100644
> index 0000000000..cb350d0e4a
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.h
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
> +#define AVCODEC_RISCV_H264_MC_CHROMA_H
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <string.h>
> +#include <stddef.h>
> +#include "config.h"
>
You don't need all of these includes. Just config.h, stddef.h (for ptrdiff_t)
and stdint.h would be enough.
> +#if HAVE_RVV
> +void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +#endif
> +#endif
> \ No newline at end of file
>
You need your file to end in a newline. Git already warns you if you don't.
Finally, run:
make checkasm && ./tests/checkasm/checkasm --bench
and report on the timings for both the C and assembly versions.
If you've made a mistake somewhere (forgot to restore the stack or a
callee-saved register, or your function produces an incorrect result),
checkasm will fail.
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Arnie Chang @ 2023-05-18 8:20 UTC
To: FFmpeg development discussions and patches
On Wed, May 17, 2023 at 10:54 PM Lynne <dev@lynne.ee> wrote:
>
> Finally, run:
> make checkasm && ./tests/checkasm/checkasm --bench
> and report on the timings for both the C and assembly versions.
> If you've made a mistake somewhere, (forgot to restore stack, or a
> callee-saved register,
> or your function produces an incorrect result), checkasm will fail.
>
checkasm does not currently cover motion compensation of chroma samples in
the H.264 decoder, so my modification cannot be tested with it.
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Ronald S. Bultje @ 2023-05-18 12:53 UTC
To: FFmpeg development discussions and patches
Hi,
On Thu, May 18, 2023 at 4:21 AM Arnie Chang <arnie.chang@sifive.com> wrote:
> On Wed, May 17, 2023 at 10:54 PM Lynne <dev@lynne.ee> wrote:
>
> >
> > Finally, run:
> > make checkasm && ./tests/checkasm/checkasm --bench
> > and report on the timings for both the C and assembly versions.
> > If you've made a mistake somewhere, (forgot to restore stack, or a
> > callee-saved register,
> > or your function produces an incorrect result), checkasm will fail.
> >
>
> The checkasm does not cover motion compensation of chroma samples in the
> H.264 decoder,
> so my modification can not be tested.
>
Maybe you could fix that?
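Most of a checkasm test is boilerplate; a rough, untested skeleton (names are
illustrative) would look like:

#include <stddef.h>
#include <string.h>
#include "checkasm.h"
#include "libavcodec/h264chroma.h"
#include "libavutil/mem_internal.h"

void checkasm_check_h264chroma(void)
{
    H264ChromaContext c;
    LOCAL_ALIGNED_16(uint8_t, src,  [16 * 18]);
    LOCAL_ALIGNED_16(uint8_t, dst0, [16 * 18]);
    LOCAL_ALIGNED_16(uint8_t, dst1, [16 * 18]);

    declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                 int h, int x, int y);

    ff_h264chroma_init(&c, 8);
    if (check_func(c.put_h264_chroma_pixels_tab[0], "put_h264_chroma_mc8")) {
        for (int i = 0; i < 16 * 18; i++)
            src[i] = rnd();                  /* random source pixels */
        memset(dst0, 0, 16 * 18);
        memset(dst1, 0, 16 * 18);
        for (int x = 0; x < 8; x++)          /* sweep all subpel positions */
            for (int y = 0; y < 8; y++) {
                call_ref(dst0, src, 16, 8, x, y);
                call_new(dst1, src, 16, 8, x, y);
                if (memcmp(dst0, dst1, 16 * 18))
                    fail();
            }
        bench_new(dst1, src, 16, 8, 1, 1);
    }
    report("chroma_mc8");
}

plus the usual registration in tests/checkasm/checkasm.c and the checkasm
Makefile, and an equivalent block for the avg function.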
Ronald
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Lynne @ 2023-05-18 16:56 UTC
To: FFmpeg development discussions and patches
May 18, 2023, 10:21 by arnie.chang@sifive.com:
> On Wed, May 17, 2023 at 10:54 PM Lynne <dev@lynne.ee> wrote:
>
>>
>> Finally, run:
>> make checkasm && ./tests/checkasm/checkasm --bench
>> and report on the timings for both the C and assembly versions.
>> If you've made a mistake somewhere, (forgot to restore stack, or a
>> callee-saved register,
>> or your function produces an incorrect result), checkasm will fail.
>>
>
> The checkasm does not cover motion compensation of chroma samples in the
> H.264 decoder,
> so my modification can not be tested.
>
I added one here: http://ffmpeg.org/pipermail/ffmpeg-devel/2023-May/309812.html
Follow the example and add a test next time.
I wouldn't accept RISC-V asm without checkasm: the platform is too new (and
doesn't even have hardware yet), and it is probably prone to silicon issues.
We've found a few silicon bugs through checkasm over the years.
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Rémi Denis-Courmont @ 2023-05-19 17:12 UTC
To: ffmpeg-devel; +Cc: Arnie Chang
On Wednesday 17 May 2023 at 10:13:01 EEST, Arnie Chang wrote:
> Optimize the put and avg filtering for 8x8 chroma blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> libavcodec/h264chroma.c | 2 +
> libavcodec/h264chroma.h | 1 +
> libavcodec/riscv/Makefile | 3 +
> libavcodec/riscv/h264_chroma_init_riscv.c | 39 ++
> libavcodec/riscv/h264_mc_chroma.S | 492 ++++++++++++++++++++++
> libavcodec/riscv/h264_mc_chroma.h | 34 ++
> 6 files changed, 571 insertions(+)
> create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
> create mode 100644 libavcodec/riscv/h264_mc_chroma.S
> create mode 100644 libavcodec/riscv/h264_mc_chroma.h
>
> diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
> index 60b86b6fba..1eeab7bc40 100644
> --- a/libavcodec/h264chroma.c
> +++ b/libavcodec/h264chroma.c
> @@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
>      ff_h264chroma_init_mips(c, bit_depth);
> #elif ARCH_LOONGARCH64
> ff_h264chroma_init_loongarch(c, bit_depth);
> +#elif ARCH_RISCV
> + ff_h264chroma_init_riscv(c, bit_depth);
> #endif
> }
> diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
> index b8f9c8f4fc..9c81c18a76 100644
> --- a/libavcodec/h264chroma.h
> +++ b/libavcodec/h264chroma.h
> @@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
>  void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
>  void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
>  void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
> +void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth);
>
> #endif /* AVCODEC_H264CHROMA_H */
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 965942f4df..08b76c93cb 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -19,3 +19,6 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
> RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
> OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +
> +OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> +RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
Please maintain the existing ordering, which is to say, alphabetical.
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
> new file mode 100644
> index 0000000000..b6f98ba693
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> + const int high_bit_depth = bit_depth > 8;
> +
> + if (!high_bit_depth) {
> + c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> + c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> + }
> +#endif
> +}
> \ No newline at end of file
> diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
> new file mode 100644
> index 0000000000..a02866f633
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -0,0 +1,492 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> + .text
> +
> + .globl h264_put_chroma_mc8_rvv
> + .p2align 1
> + .type h264_put_chroma_mc8_rvv,@function
> +h264_put_chroma_mc8_rvv:
> + slliw t2, a5, 3
> + mulw t1, a5, a4
> + sh3add a5, a4, t2
> + slliw a4, a4, 3
> + subw a5, t1, a5
> + subw a7, a4, t1
> + addiw a6, a5, 64
> + subw t0, t2, t1
I would avoid narrow instructions unless strictly necessary, so that we don't
unnecessarily restrict target platforms.
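E.g.

        slli    t2, a5, 3
        mul     t1, a5, a4

instead of slliw/mulw: the values fit in 32 bits either way, and the non-W
forms also exist on RV32.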
> + vsetivli t3, 8, e8, m1, ta, mu
> + beqz t1, .LBB0_4
> + blez a3, .LBB0_17
If you're going to use numeric labels (which is totally fine, I do that too),
then you really can just use the usual numeric label assembler syntax. There
is no need to create local branch symbols.
> + li t4, 0
> + li t2, 0
> + addi a5, t3, 1
> + slli t3, a2, 2
> +.LBB0_3: # if (xy != 0)
> + add a4, a1, t4
> + vsetvli zero, a5, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v10, (a4)
> + add a4, a4, a2
> + vslidedown.vi v11, v10, 1
Isn't vslide1down.vx zero potentially faster than vslidedown.vi 1?
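That is, something like:

        vslide1down.vx  v11, v10, zero

Only the first 8 elements of the result are used afterwards, so the zero slid
in at the top should not matter.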
> + vsetivli zero, 8, e8, m1, ta, ma
Do we really need to reconfigure the active vector length so many times? I
suspect that is not going to go down too well with some implementations.
> + vwmulu.vx v8, v10, a6
> + vwmaccu.vx v8, a7, v11
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v12, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v8, t0, v12
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v13, v12, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v12, a6
> + vwmaccu.vx v8, t1, v13
> + vwmaccu.vx v10, a7, v13
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v10, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v15, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v14, a6
> + vwmaccu.vx v10, t1, v15
> + vwmaccu.vx v12, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v12, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v15, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v16, v14, a6
> + vwmaccu.vx v12, t1, v15
> + vwmaccu.vx v16, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a0, t4
> + add t4, t4, t3
I could be totally wrong since I have no hardware to verify with, but I would
assume that it is preferable to interleave independent scalar and vector
instructions where possible. For out-of-order processors, it shouldn't
matter, but I suppose that it would on in-order multi-issue processors.
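E.g. something along the lines of (illustration only, not benchmarked):

        vle8.v          v14, (a4)
        add             a4, a0, t4      # scalar work in the shadow of the load
        vwmaccu.vx      v16, t0, v14
        add             t4, t4, t3

rather than issuing the scalar adds back to back.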
> + vwmaccu.vx v16, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v14, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v15, v8, 6
> + vwmaccu.vx v16, t1, v14
> + vse8.v v15, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v12, 6
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v16, 6
> + vse8.v v8, (a4)
> + blt t2, a3, .LBB0_3
> + j .LBB0_17
> +.LBB0_4:
> + bnez a4, .LBB0_9
> + beqz t2, .LBB0_9
> + blez a3, .LBB0_17
> + li a4, 0
> + li t1, 0
> + slli a7, a2, 2
> +.LBB0_8: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> + add a5, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t1, t1, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + add t2, a5, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a5)
> + vwmulu.vx v12, v8, a6
> + vle8.v v9, (t2)
> + add t2, t2, a2
> + add a5, t2, a2
> + vwmaccu.vx v10, t0, v8
> + vle8.v v8, (t2)
> + vle8.v v14, (a5)
> + add a5, a0, a4
> + add a4, a4, a7
> + vwmaccu.vx v12, t0, v9
> + vnclipu.wi v15, v10, 6
> + vwmulu.vx v10, v9, a6
> + vse8.v v15, (a5)
> + add a5, a5, a2
> + vnclipu.wi v9, v12, 6
> + vwmaccu.vx v10, t0, v8
> + vwmulu.vx v12, v8, a6
> + vse8.v v9, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmaccu.vx v12, t0, v14
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v12, 6
> + vse8.v v8, (a5)
> + blt t1, a3, .LBB0_8
> + j .LBB0_17
> +.LBB0_9:
> + beqz a4, .LBB0_14
> + bnez t2, .LBB0_14
> + blez a3, .LBB0_17
> + li a4, 0
> + li t2, 0
> + addi t0, t3, 1
> + slli t1, a2, 2
> +.LBB0_13: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> + add a5, a1, a4
> + vsetvli zero, t0, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v8, a6
> + vwmaccu.vx v10, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v8, a6
> + vwmaccu.vx v12, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v14, v8, a6
> + vwmaccu.vx v14, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a0, a4
> + add a4, a4, t1
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v16, v10, 6
> + vse8.v v16, (a5)
> + add a5, a5, a2
> + vnclipu.wi v10, v12, 6
> + vwmulu.vx v12, v8, a6
> + vse8.v v10, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v14, 6
> + vwmaccu.vx v12, a7, v9
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v12, 6
> + vse8.v v8, (a5)
> + blt t2, a3, .LBB0_13
> + j .LBB0_17
> +.LBB0_14:
> + blez a3, .LBB0_17
> + li a4, 0
> + li t2, 0
> + slli a7, a2, 2
> +.LBB0_16: # the final else, none of the above conditions are met
> + add t0, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + add a5, a0, a4
> + add a4, a4, a7
> + addiw t2, t2, 4
> + vle8.v v8, (t0)
> + add t0, t0, a2
> + add t1, t0, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (t0)
> + add t0, t1, a2
> + vle8.v v9, (t1)
> + vle8.v v12, (t0)
> + vnclipu.wi v13, v10, 6
> + vwmulu.vx v10, v8, a6
> + vse8.v v13, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v9, a6
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v12, a6
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vse8.v v8, (a5)
> + blt t2, a3, .LBB0_16
> +.LBB0_17: # Exit h264_put_chroma_mc8_rvv
> + ret
> +.Lfunc_end0:
> + .size h264_put_chroma_mc8_rvv, .Lfunc_end0-h264_put_chroma_mc8_rvv
> +
> + .globl h264_avg_chroma_mc8_rvv
> + .p2align 1
> + .type h264_avg_chroma_mc8_rvv,@function
> +h264_avg_chroma_mc8_rvv:
> + slliw t2, a5, 3
> + mulw t1, a5, a4
> + sh3add a5, a4, t2
> + slliw a4, a4, 3
> + subw a5, t1, a5
> + subw a7, a4, t1
> + addiw a6, a5, 64
> + subw t0, t2, t1
> + vsetivli t3, 8, e8, m1, ta, mu
> + beqz t1, .LBB1_4
> + blez a3, .LBB1_17
> + li t4, 0
> + li t2, 0
> + addi a5, t3, 1
> + slli t3, a2, 2
> +.LBB1_3: # if (xy != 0)
> + add a4, a1, t4
> + vsetvli zero, a5, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v10, (a4)
> + add a4, a4, a2
> + vslidedown.vi v11, v10, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v8, v10, a6
> + vwmaccu.vx v8, a7, v11
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v12, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v8, t0, v12
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v13, v12, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v12, a6
> + vwmaccu.vx v8, t1, v13
> + vwmaccu.vx v10, a7, v13
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v10, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v15, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v14, a6
> + vwmaccu.vx v10, t1, v15
> + vwmaccu.vx v12, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v12, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v15, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v16, v14, a6
> + vwmaccu.vx v12, t1, v15
> + vwmaccu.vx v16, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a0, t4
> + add t4, t4, t3
> + vwmaccu.vx v16, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslidedown.vi v14, v14, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v15, v8, 6
> + vle8.v v8, (a4)
> + vwmaccu.vx v16, t1, v14
> + vaaddu.vv v8, v15, v8
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v12, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v16, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + blt t2, a3, .LBB1_3
> + j .LBB1_17
> +.LBB1_4:
> + bnez a4, .LBB1_9
> + beqz t2, .LBB1_9
> + blez a3, .LBB1_17
> + li t2, 0
> + li t1, 0
> + slli a7, a2, 2
> +.LBB1_8: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> + add a4, a1, t2
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t1, t1, 4
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + add a5, a4, a2
> + vle8.v v9, (a4)
> + add a4, a5, a2
> + vle8.v v12, (a5)
> + vwmaccu.vx v10, t0, v8
> + vle8.v v13, (a4)
> + add a4, a0, t2
> + add t2, t2, a7
> + vnclipu.wi v14, v10, 6
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a4)
> + vaaddu.vv v8, v14, v8
> + vwmaccu.vx v10, t0, v9
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v9, a6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vwmaccu.vx v10, t0, v12
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v12, a6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vwmaccu.vx v10, t0, v13
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + blt t1, a3, .LBB1_8
> + j .LBB1_17
> +.LBB1_9:
> + beqz a4, .LBB1_14
> + bnez t2, .LBB1_14
> + blez a3, .LBB1_17
> + li a5, 0
> + li t2, 0
> + addi t0, t3, 1
> + slli t1, a2, 2
> +.LBB1_13: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> + add a4, a1, a5
> + vsetvli zero, t0, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v8, a6
> + vwmaccu.vx v10, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v8, a6
> + vwmaccu.vx v12, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a4)
> + add a4, a4, a2
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v14, v8, a6
> + vwmaccu.vx v14, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a4)
> + add a4, a0, a5
> + add a5, a5, t1
> + vslidedown.vi v9, v8, 1
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v16, v10, 6
> + vle8.v v10, (a4)
> + vaaddu.vv v10, v16, v10
> + vse8.v v10, (a4)
> + add a4, a4, a2
> + vnclipu.wi v10, v12, 6
> + vle8.v v11, (a4)
> + vwmulu.vx v12, v8, a6
> + vaaddu.vv v10, v10, v11
> + vwmaccu.vx v12, a7, v9
> + vse8.v v10, (a4)
> + add a4, a4, a2
> + vnclipu.wi v10, v14, 6
> + vle8.v v8, (a4)
> + vaaddu.vv v8, v10, v8
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v12, 6
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a4)
> + blt t2, a3, .LBB1_13
> + j .LBB1_17
> +.LBB1_14:
> + blez a3, .LBB1_17
> + li a4, 0
> + li t0, 0
> + slli a7, a2, 2
> +.LBB1_16: # the final else, none of the above conditions are met
> + add a5, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t0, t0, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + add t1, a5, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a5)
> + add a5, t1, a2
> + vle8.v v9, (t1)
> + vle8.v v12, (a5)
> + add a5, a0, a4
> + add a4, a4, a7
> + vnclipu.wi v13, v10, 6
> + vle8.v v10, (a5)
> + vwmulu.vx v14, v8, a6
> + vaaddu.vv v10, v13, v10
> + vse8.v v10, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v14, 6
> + vle8.v v10, (a5)
> + vaaddu.vv v8, v8, v10
> + vwmulu.vx v10, v9, a6
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vle8.v v9, (a5)
> + vwmulu.vx v10, v12, a6
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vle8.v v9, (a5)
> + vaaddu.vv v8, v8, v9
> + vse8.v v8, (a5)
> + blt t0, a3, .LBB1_16
> +.LBB1_17: # Exit h264_avg_chroma_mc8_rvv
> + ret
> +.Lfunc_end1:
> + .size h264_avg_chroma_mc8_rvv, .Lfunc_end1-h264_avg_chroma_mc8_rvv
--
Реми Дёни-Курмон
http://www.remlab.net/
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Rémi Denis-Courmont @ 2023-05-19 17:15 UTC
To: FFmpeg development discussions and patches
On Wednesday 17 May 2023 at 17:54:22 EEST, Lynne wrote:
> Finally, run:
> make checkasm && ./tests/checkasm/checkasm --bench
> and report on the timings for both the C and assembly versions.
> If you've made a mistake somewhere, (forgot to restore stack, or a
> callee-saved register, or your function produces an incorrect result),
> checkasm will fail.
To be fair, in this particular case, the stack pointer and saved registers are
never used, so the risk of messing those up is zero.
checkasm would of course verify that the function does what it is supposed to
do, and personally, I have kept away from untested functions. But I am not sure
if it is fair to require adding test cases whilst other architectures weren't
required to have them.
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Lynne @ 2023-05-19 18:52 UTC
To: FFmpeg development discussions and patches
May 19, 2023, 19:16 by remi@remlab.net:
> On Wednesday 17 May 2023 at 17:54:22 EEST, Lynne wrote:
>
>> Finally, run:
>> make checkasm && ./tests/checkasm/checkasm --bench
>> and report on the timings for both the C and assembly versions.
>> If you've made a mistake somewhere, (forgot to restore stack, or a
>> callee-saved register, or your function produces an incorrect result),
>> checkasm will fail.
>>
>
> To be fair, in this particular case, the stack pointer and saved registers are
> never used, so the risk of messing those are zero.
>
> checkasm would of course verify that the function does that it is supposed to
> do, and personally, I have kept off untested functions. But I am not sure if it
> is fair to require adding test cases whilst other architectures weren't
> required to have them.
>
Other hardware exists, and even without checkasm, bugs are found through
FATE. Right now, the only things that can run this code are FPGAs and
emulators, which are both soft implementations, something to which bugs are
old friends.
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Rémi Denis-Courmont @ 2023-05-20 9:59 UTC
To: FFmpeg development discussions and patches
On Friday 19 May 2023 at 21:52:57 EEST, Lynne wrote:
> May 19, 2023, 19:16 by remi@remlab.net:
> > On Wednesday 17 May 2023 at 17:54:22 EEST, Lynne wrote:
> >> Finally, run:
> >> make checkasm && ./tests/checkasm/checkasm --bench
> >> and report on the timings for both the C and assembly versions.
> >> If you've made a mistake somewhere, (forgot to restore stack, or a
> >> callee-saved register, or your function produces an incorrect result),
> >> checkasm will fail.
> >
> > To be fair, in this particular case, the stack pointer and saved registers
> > are never used, so the risk of messing those are zero.
> >
> > checkasm would of course verify that the function does that it is supposed
> > to do, and personally, I have kept off untested functions. But I am not
> > sure if it is fair to require adding test cases whilst other
> > architectures weren't required to have them.
>
> Other hardware exists, and even without checkasm, bugs are found
> through fate.
There are exactly two FATE RISC-V instances, of which exactly zero support
vectors to date, so the chance of FATE finding bugs there is nil. I do
appreciate that checkasm would make re-vectoring of the patchset easier and
safer, though.
That said, the argument is moot, seeing as you seem to have posted a fix.
--
Реми Дёни-Курмон
http://www.remlab.net/
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Rémi Denis-Courmont @ 2023-05-20 10:03 UTC
To: FFmpeg development discussions and patches
On Wednesday 17 May 2023 at 17:54:22 EEST, Lynne wrote:
> Finally, run:
> make checkasm && ./tests/checkasm/checkasm --bench
> and report on the timings for both the C and assembly versions.
> If you've made a mistake somewhere, (forgot to restore stack, or a
> callee-saved register, or your function produces an incorrect result),
> checkasm will fail.
I don't specifically know SiFive's policies. In my experience, however, silicon
design companies will ABSOLUTELY NOT publish benchmark results from unreleased
products at any stage of development (including FPGA simulation). Your say-so is
unlikely to change those policies, so I don't think we can require RVV
benchmarks this year.
--
雷米‧德尼-库尔蒙
http://www.remlab.net/
* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
From: Arnie Chang @ 2023-05-22 14:15 UTC
To: Rémi Denis-Courmont; +Cc: ffmpeg-devel
On Sat, May 20, 2023 at 1:12 AM Rémi Denis-Courmont <remi@remlab.net> wrote:
> > + li t4, 0
> > + li t2, 0
> > + addi a5, t3, 1
> > + slli t3, a2, 2
> > +.LBB0_3: # if (xy != 0)
> > + add a4, a1, t4
> > + vsetvli zero, a5, e8, m1, ta, ma
> > + addiw t2, t2, 4
> > + vle8.v v10, (a4)
> > + add a4, a4, a2
> > + vslidedown.vi v11, v10, 1
>
> Isn't vslide1down.vx zero potentially faster than vslidedown.vi 1?
>
It depends on the hardware design, but in general vslide1down.vx is typically
not slower than vslidedown.vi. Using vslide1down.vx would be better here; I
will fix it.
> > + vsetivli zero, 8, e8, m1, ta, ma
>
> Do we really need to reconfigure the active vector length so many times? I
> suspect that is not going to go down too well with some implementations.
>
We need to reconfigure it because the VL changes: the VL for the loads and
slides (8 + 1 elements) differs from that of the arithmetic (8 elements).
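For example, in the xy != 0 loop:

    vsetvli       zero, a5, e8, m1, ta, ma   # vl = 9: load 8+1 source bytes
    vle8.v        v10, (a4)
    vslidedown.vi v11, v10, 1                # the x+1 neighbours
    vsetivli      zero, 8, e8, m1, ta, ma    # vl = 8: back to the block width
    vwmulu.vx     v8, v10, a6

The loads and slides need 8 + 1 elements, while the widening arithmetic only
operates on the 8 output pixels.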
> > + vwmaccu.vx v12, t1, v15
> > + vwmaccu.vx v16, a7, v15
> > + vsetvli zero, a5, e8, m1, ta, ma
> > + vle8.v v14, (a4)
> > + vsetivli zero, 8, e8, m1, ta, ma
> > + add a4, a0, t4
> > + add t4, t4, t3
>
> I could be totally wrong since I have no hardware to verify with, but I
> would assume that it is preferable to interleave independent scalar and
> vector instructions where possible. For out-of-order processors, it
> shouldn't matter, but I suppose that it would on in-order multi-issue
> processors.
>
Interleaving those instructions can indeed improve overall performance.