Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
@ 2023-05-17  7:13 Arnie Chang
  2023-05-17 14:54 ` Lynne
  2023-05-19 17:12 ` Rémi Denis-Courmont
  0 siblings, 2 replies; 11+ messages in thread
From: Arnie Chang @ 2023-05-17  7:13 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Arnie Chang

Optimize the put and avg filtering for 8x8 chroma blocks

Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
 libavcodec/h264chroma.c                   |   2 +
 libavcodec/h264chroma.h                   |   1 +
 libavcodec/riscv/Makefile                 |   3 +
 libavcodec/riscv/h264_chroma_init_riscv.c |  39 ++
 libavcodec/riscv/h264_mc_chroma.S         | 492 ++++++++++++++++++++++
 libavcodec/riscv/h264_mc_chroma.h         |  34 ++
 6 files changed, 571 insertions(+)
 create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
 create mode 100644 libavcodec/riscv/h264_mc_chroma.S
 create mode 100644 libavcodec/riscv/h264_mc_chroma.h

diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index 60b86b6fba..1eeab7bc40 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
     ff_h264chroma_init_mips(c, bit_depth);
 #elif ARCH_LOONGARCH64
     ff_h264chroma_init_loongarch(c, bit_depth);
+#elif ARCH_RISCV
+    ff_h264chroma_init_riscv(c, bit_depth);
 #endif
 }
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index b8f9c8f4fc..9c81c18a76 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 965942f4df..08b76c93cb 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -19,3 +19,6 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
+
+OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
+RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
new file mode 100644
index 0000000000..b6f98ba693
--- /dev/null
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/h264chroma.h"
+#include "config.h"
+#include "h264_mc_chroma.h"
+
+av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
+{
+#if HAVE_RVV
+    const int high_bit_depth = bit_depth > 8;
+
+    if (!high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
+        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+    }
+#endif
+}
\ No newline at end of file
diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
new file mode 100644
index 0000000000..a02866f633
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.S
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+    .text
+
+    .globl    h264_put_chroma_mc8_rvv
+    .p2align    1
+    .type    h264_put_chroma_mc8_rvv,@function
+h264_put_chroma_mc8_rvv:
+    slliw    t2, a5, 3
+    mulw    t1, a5, a4
+    sh3add    a5, a4, t2
+    slliw    a4, a4, 3
+    subw    a5, t1, a5
+    subw    a7, a4, t1
+    addiw    a6, a5, 64
+    subw    t0, t2, t1
+    vsetivli    t3, 8, e8, m1, ta, mu
+    beqz    t1, .LBB0_4
+    blez    a3, .LBB0_17
+    li    t4, 0
+    li    t2, 0
+    addi    a5, t3, 1
+    slli    t3, a2, 2
+.LBB0_3:                                # if (xy != 0)
+    add    a4, a1, t4
+    vsetvli    zero, a5, e8, m1, ta, ma
+    addiw    t2, t2, 4
+    vle8.v    v10, (a4)
+    add    a4, a4, a2
+    vslidedown.vi    v11, v10, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v8, v10, a6
+    vwmaccu.vx    v8, a7, v11
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vle8.v    v12, (a4)
+    vsetivli    zero, 8, e8, m1, ta, ma
+    add    a4, a4, a2
+    vwmaccu.vx    v8, t0, v12
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vslidedown.vi    v13, v12, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v10, v12, a6
+    vwmaccu.vx    v8, t1, v13
+    vwmaccu.vx    v10, a7, v13
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vle8.v    v14, (a4)
+    vsetivli    zero, 8, e8, m1, ta, ma
+    add    a4, a4, a2
+    vwmaccu.vx    v10, t0, v14
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vslidedown.vi    v15, v14, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v12, v14, a6
+    vwmaccu.vx    v10, t1, v15
+    vwmaccu.vx    v12, a7, v15
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vle8.v    v14, (a4)
+    vsetivli    zero, 8, e8, m1, ta, ma
+    add    a4, a4, a2
+    vwmaccu.vx    v12, t0, v14
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vslidedown.vi    v15, v14, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v16, v14, a6
+    vwmaccu.vx    v12, t1, v15
+    vwmaccu.vx    v16, a7, v15
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vle8.v    v14, (a4)
+    vsetivli    zero, 8, e8, m1, ta, ma
+    add    a4, a0, t4
+    add    t4, t4, t3
+    vwmaccu.vx    v16, t0, v14
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vslidedown.vi    v14, v14, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vnclipu.wi    v15, v8, 6
+    vwmaccu.vx    v16, t1, v14
+    vse8.v    v15, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v10, 6
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v12, 6
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v16, 6
+    vse8.v    v8, (a4)
+    blt    t2, a3, .LBB0_3
+    j    .LBB0_17
+.LBB0_4:
+    bnez    a4, .LBB0_9
+    beqz    t2, .LBB0_9
+    blez    a3, .LBB0_17
+    li    a4, 0
+    li    t1, 0
+    slli    a7, a2, 2
+.LBB0_8:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
+    add    a5, a1, a4
+    vsetvli    zero, zero, e8, m1, ta, ma
+    addiw    t1, t1, 4
+    vle8.v    v8, (a5)
+    add    a5, a5, a2
+    add    t2, a5, a2
+    vwmulu.vx    v10, v8, a6
+    vle8.v    v8, (a5)
+    vwmulu.vx    v12, v8, a6
+    vle8.v    v9, (t2)
+    add    t2, t2, a2
+    add    a5, t2, a2
+    vwmaccu.vx    v10, t0, v8
+    vle8.v    v8, (t2)
+    vle8.v    v14, (a5)
+    add    a5, a0, a4
+    add    a4, a4, a7
+    vwmaccu.vx    v12, t0, v9
+    vnclipu.wi    v15, v10, 6
+    vwmulu.vx    v10, v9, a6
+    vse8.v    v15, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v9, v12, 6
+    vwmaccu.vx    v10, t0, v8
+    vwmulu.vx    v12, v8, a6
+    vse8.v    v9, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v10, 6
+    vwmaccu.vx    v12, t0, v14
+    vse8.v    v8, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v12, 6
+    vse8.v    v8, (a5)
+    blt    t1, a3, .LBB0_8
+    j    .LBB0_17
+.LBB0_9:
+    beqz    a4, .LBB0_14
+    bnez    t2, .LBB0_14
+    blez    a3, .LBB0_17
+    li    a4, 0
+    li    t2, 0
+    addi    t0, t3, 1
+    slli    t1, a2, 2
+.LBB0_13:                               # if ((x8 - xy) != 0 && (y8 -xy) == 0)
+    add    a5, a1, a4
+    vsetvli    zero, t0, e8, m1, ta, ma
+    addiw    t2, t2, 4
+    vle8.v    v8, (a5)
+    add    a5, a5, a2
+    vslidedown.vi    v9, v8, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v10, v8, a6
+    vwmaccu.vx    v10, a7, v9
+    vsetvli    zero, t0, e8, m1, ta, ma
+    vle8.v    v8, (a5)
+    add    a5, a5, a2
+    vslidedown.vi    v9, v8, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v12, v8, a6
+    vwmaccu.vx    v12, a7, v9
+    vsetvli    zero, t0, e8, m1, ta, ma
+    vle8.v    v8, (a5)
+    add    a5, a5, a2
+    vslidedown.vi    v9, v8, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v14, v8, a6
+    vwmaccu.vx    v14, a7, v9
+    vsetvli    zero, t0, e8, m1, ta, ma
+    vle8.v    v8, (a5)
+    add    a5, a0, a4
+    add    a4, a4, t1
+    vslidedown.vi    v9, v8, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vnclipu.wi    v16, v10, 6
+    vse8.v    v16, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v10, v12, 6
+    vwmulu.vx    v12, v8, a6
+    vse8.v    v10, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v14, 6
+    vwmaccu.vx    v12, a7, v9
+    vse8.v    v8, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v12, 6
+    vse8.v    v8, (a5)
+    blt    t2, a3, .LBB0_13
+    j    .LBB0_17
+.LBB0_14:
+    blez    a3, .LBB0_17
+    li    a4, 0
+    li    t2, 0
+    slli    a7, a2, 2
+.LBB0_16:                               # the final else, none of the above conditions are met
+    add    t0, a1, a4
+    vsetvli    zero, zero, e8, m1, ta, ma
+    add    a5, a0, a4
+    add    a4, a4, a7
+    addiw    t2, t2, 4
+    vle8.v    v8, (t0)
+    add    t0, t0, a2
+    add    t1, t0, a2
+    vwmulu.vx    v10, v8, a6
+    vle8.v    v8, (t0)
+    add    t0, t1, a2
+    vle8.v    v9, (t1)
+    vle8.v    v12, (t0)
+    vnclipu.wi    v13, v10, 6
+    vwmulu.vx    v10, v8, a6
+    vse8.v    v13, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v10, 6
+    vwmulu.vx    v10, v9, a6
+    vse8.v    v8, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v10, 6
+    vwmulu.vx    v10, v12, a6
+    vse8.v    v8, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v10, 6
+    vse8.v    v8, (a5)
+    blt    t2, a3, .LBB0_16
+.LBB0_17:                               # Exit h264_put_chroma_mc8_rvv
+    ret
+.Lfunc_end0:
+    .size    h264_put_chroma_mc8_rvv, .Lfunc_end0-h264_put_chroma_mc8_rvv
+
+    .globl    h264_avg_chroma_mc8_rvv
+    .p2align    1
+    .type    h264_avg_chroma_mc8_rvv,@function
+h264_avg_chroma_mc8_rvv:
+    slliw    t2, a5, 3
+    mulw    t1, a5, a4
+    sh3add    a5, a4, t2
+    slliw    a4, a4, 3
+    subw    a5, t1, a5
+    subw    a7, a4, t1
+    addiw    a6, a5, 64
+    subw    t0, t2, t1
+    vsetivli    t3, 8, e8, m1, ta, mu
+    beqz    t1, .LBB1_4
+    blez    a3, .LBB1_17
+    li    t4, 0
+    li    t2, 0
+    addi    a5, t3, 1
+    slli    t3, a2, 2
+.LBB1_3:                                # if (xy != 0)
+    add    a4, a1, t4
+    vsetvli    zero, a5, e8, m1, ta, ma
+    addiw    t2, t2, 4
+    vle8.v    v10, (a4)
+    add    a4, a4, a2
+    vslidedown.vi    v11, v10, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v8, v10, a6
+    vwmaccu.vx    v8, a7, v11
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vle8.v    v12, (a4)
+    vsetivli    zero, 8, e8, m1, ta, ma
+    add    a4, a4, a2
+    vwmaccu.vx    v8, t0, v12
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vslidedown.vi    v13, v12, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v10, v12, a6
+    vwmaccu.vx    v8, t1, v13
+    vwmaccu.vx    v10, a7, v13
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vle8.v    v14, (a4)
+    vsetivli    zero, 8, e8, m1, ta, ma
+    add    a4, a4, a2
+    vwmaccu.vx    v10, t0, v14
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vslidedown.vi    v15, v14, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v12, v14, a6
+    vwmaccu.vx    v10, t1, v15
+    vwmaccu.vx    v12, a7, v15
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vle8.v    v14, (a4)
+    vsetivli    zero, 8, e8, m1, ta, ma
+    add    a4, a4, a2
+    vwmaccu.vx    v12, t0, v14
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vslidedown.vi    v15, v14, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v16, v14, a6
+    vwmaccu.vx    v12, t1, v15
+    vwmaccu.vx    v16, a7, v15
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vle8.v    v14, (a4)
+    vsetivli    zero, 8, e8, m1, ta, ma
+    add    a4, a0, t4
+    add    t4, t4, t3
+    vwmaccu.vx    v16, t0, v14
+    vsetvli    zero, a5, e8, m1, ta, ma
+    vslidedown.vi    v14, v14, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vnclipu.wi    v15, v8, 6
+    vle8.v    v8, (a4)
+    vwmaccu.vx    v16, t1, v14
+    vaaddu.vv    v8, v15, v8
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v10, 6
+    vle8.v    v9, (a4)
+    vaaddu.vv    v8, v8, v9
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v12, 6
+    vle8.v    v9, (a4)
+    vaaddu.vv    v8, v8, v9
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v16, 6
+    vle8.v    v9, (a4)
+    vaaddu.vv    v8, v8, v9
+    vse8.v    v8, (a4)
+    blt    t2, a3, .LBB1_3
+    j    .LBB1_17
+.LBB1_4:
+    bnez    a4, .LBB1_9
+    beqz    t2, .LBB1_9
+    blez    a3, .LBB1_17
+    li    t2, 0
+    li    t1, 0
+    slli    a7, a2, 2
+.LBB1_8:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
+    add    a4, a1, t2
+    vsetvli    zero, zero, e8, m1, ta, ma
+    addiw    t1, t1, 4
+    vle8.v    v8, (a4)
+    add    a4, a4, a2
+    vwmulu.vx    v10, v8, a6
+    vle8.v    v8, (a4)
+    add    a4, a4, a2
+    add    a5, a4, a2
+    vle8.v    v9, (a4)
+    add    a4, a5, a2
+    vle8.v    v12, (a5)
+    vwmaccu.vx    v10, t0, v8
+    vle8.v    v13, (a4)
+    add    a4, a0, t2
+    add    t2, t2, a7
+    vnclipu.wi    v14, v10, 6
+    vwmulu.vx    v10, v8, a6
+    vle8.v    v8, (a4)
+    vaaddu.vv    v8, v14, v8
+    vwmaccu.vx    v10, t0, v9
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v10, 6
+    vwmulu.vx    v10, v9, a6
+    vle8.v    v9, (a4)
+    vaaddu.vv    v8, v8, v9
+    vwmaccu.vx    v10, t0, v12
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v10, 6
+    vwmulu.vx    v10, v12, a6
+    vle8.v    v9, (a4)
+    vaaddu.vv    v8, v8, v9
+    vwmaccu.vx    v10, t0, v13
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v10, 6
+    vle8.v    v9, (a4)
+    vaaddu.vv    v8, v8, v9
+    vse8.v    v8, (a4)
+    blt    t1, a3, .LBB1_8
+    j    .LBB1_17
+.LBB1_9:
+    beqz    a4, .LBB1_14
+    bnez    t2, .LBB1_14
+    blez    a3, .LBB1_17
+    li    a5, 0
+    li    t2, 0
+    addi    t0, t3, 1
+    slli    t1, a2, 2
+.LBB1_13:                               # if ((x8 - xy) != 0 && (y8 -xy) == 0)
+    add    a4, a1, a5
+    vsetvli    zero, t0, e8, m1, ta, ma
+    addiw    t2, t2, 4
+    vle8.v    v8, (a4)
+    add    a4, a4, a2
+    vslidedown.vi    v9, v8, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v10, v8, a6
+    vwmaccu.vx    v10, a7, v9
+    vsetvli    zero, t0, e8, m1, ta, ma
+    vle8.v    v8, (a4)
+    add    a4, a4, a2
+    vslidedown.vi    v9, v8, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v12, v8, a6
+    vwmaccu.vx    v12, a7, v9
+    vsetvli    zero, t0, e8, m1, ta, ma
+    vle8.v    v8, (a4)
+    add    a4, a4, a2
+    vslidedown.vi    v9, v8, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vwmulu.vx    v14, v8, a6
+    vwmaccu.vx    v14, a7, v9
+    vsetvli    zero, t0, e8, m1, ta, ma
+    vle8.v    v8, (a4)
+    add    a4, a0, a5
+    add    a5, a5, t1
+    vslidedown.vi    v9, v8, 1
+    vsetivli    zero, 8, e8, m1, ta, ma
+    vnclipu.wi    v16, v10, 6
+    vle8.v    v10, (a4)
+    vaaddu.vv    v10, v16, v10
+    vse8.v    v10, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v10, v12, 6
+    vle8.v    v11, (a4)
+    vwmulu.vx    v12, v8, a6
+    vaaddu.vv    v10, v10, v11
+    vwmaccu.vx    v12, a7, v9
+    vse8.v    v10, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v10, v14, 6
+    vle8.v    v8, (a4)
+    vaaddu.vv    v8, v10, v8
+    vse8.v    v8, (a4)
+    add    a4, a4, a2
+    vnclipu.wi    v8, v12, 6
+    vle8.v    v9, (a4)
+    vaaddu.vv    v8, v8, v9
+    vse8.v    v8, (a4)
+    blt    t2, a3, .LBB1_13
+    j    .LBB1_17
+.LBB1_14:
+    blez    a3, .LBB1_17
+    li    a4, 0
+    li    t0, 0
+    slli    a7, a2, 2
+.LBB1_16:                               # the final else, none of the above conditions are met
+    add    a5, a1, a4
+    vsetvli    zero, zero, e8, m1, ta, ma
+    addiw    t0, t0, 4
+    vle8.v    v8, (a5)
+    add    a5, a5, a2
+    add    t1, a5, a2
+    vwmulu.vx    v10, v8, a6
+    vle8.v    v8, (a5)
+    add    a5, t1, a2
+    vle8.v    v9, (t1)
+    vle8.v    v12, (a5)
+    add    a5, a0, a4
+    add    a4, a4, a7
+    vnclipu.wi    v13, v10, 6
+    vle8.v    v10, (a5)
+    vwmulu.vx    v14, v8, a6
+    vaaddu.vv    v10, v13, v10
+    vse8.v    v10, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v14, 6
+    vle8.v    v10, (a5)
+    vaaddu.vv    v8, v8, v10
+    vwmulu.vx    v10, v9, a6
+    vse8.v    v8, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v10, 6
+    vle8.v    v9, (a5)
+    vwmulu.vx    v10, v12, a6
+    vaaddu.vv    v8, v8, v9
+    vse8.v    v8, (a5)
+    add    a5, a5, a2
+    vnclipu.wi    v8, v10, 6
+    vle8.v    v9, (a5)
+    vaaddu.vv    v8, v8, v9
+    vse8.v    v8, (a5)
+    blt    t0, a3, .LBB1_16
+.LBB1_17:                               # Exit h264_avg_chroma_mc8_rvv
+    ret
+.Lfunc_end1:
+    .size    h264_avg_chroma_mc8_rvv, .Lfunc_end1-h264_avg_chroma_mc8_rvv
diff --git a/libavcodec/riscv/h264_mc_chroma.h b/libavcodec/riscv/h264_mc_chroma.h
new file mode 100644
index 0000000000..cb350d0e4a
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
+#define AVCODEC_RISCV_H264_MC_CHROMA_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_RVV
+void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+#endif
+#endif
\ No newline at end of file
-- 
2.17.1


* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-17  7:13 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks Arnie Chang
@ 2023-05-17 14:54 ` Lynne
  2023-05-18  8:20   ` Arnie Chang
                     ` (2 more replies)
  2023-05-19 17:12 ` Rémi Denis-Courmont
  1 sibling, 3 replies; 11+ messages in thread
From: Lynne @ 2023-05-17 14:54 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

May 17, 2023, 09:13 by arnie.chang@sifive.com:

> Optimize the put and avg filtering for 8x8 chroma blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> libavcodec/h264chroma.c                   |   2 +
> libavcodec/h264chroma.h                   |   1 +
> libavcodec/riscv/Makefile                 |   3 +
> libavcodec/riscv/h264_chroma_init_riscv.c |  39 ++
> libavcodec/riscv/h264_mc_chroma.S         | 492 ++++++++++++++++++++++
> libavcodec/riscv/h264_mc_chroma.h         |  34 ++
> 6 files changed, 571 insertions(+)
> create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
> create mode 100644 libavcodec/riscv/h264_mc_chroma.S
> create mode 100644 libavcodec/riscv/h264_mc_chroma.h
>
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> +    const int high_bit_depth = bit_depth > 8;
>

You don't need this constant.


> +
> +    if (!high_bit_depth) {
> +        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> +        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> +    }
> +#endif
>

You have to check if RVV is supported:
> int flags = av_get_cpu_flags();
>
> if (flags & AV_CPU_FLAG_RVV_F32) {
>     if (bit_depth > 8) {
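
Something along these lines should do (untested, and whether AV_CPU_FLAG_RVV_I32
or another of the RVV flags is the right gate for 8-bit integer code is an
assumption on my part - check libavutil/cpu.h):

> av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> {
> #if HAVE_RVV
>     int flags = av_get_cpu_flags();
>
>     /* only 8-bit functions are provided, and only when the CPU actually
>      * reports vector support at run time */
>     if ((flags & AV_CPU_FLAG_RVV_I32) && bit_depth <= 8) {
>         c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
>         c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
>     }
> #endif
> }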


> +    .text
> +
> +    .globl    h264_put_chroma_mc8_rvv
> +    .p2align    1
> +    .type    h264_put_chroma_mc8_rvv,@function
> +h264_put_chroma_mc8_rvv:
>

You don't need any of this. We already have macros to
handle this - take a look at libavcodec/riscv/opusdsp_rvv.S:

> func ff_opus_postfilter_rvv_256, zve32f
>         lvtypei a5, e32, m1, ta, ma // function instructions start here

Make sure to change zve32f to whichever instruction set extension you actually use,
so that the assembler is set up to handle it.
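
Applied here, the prologue would look roughly like this (zve32x as the minimal
extension for 8-bit integer vector code is my guess - use whatever the asm.S
macros actually expect):

> func h264_put_chroma_mc8_rvv, zve32x
>         /* ...function body as before... */
> endfunc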


> +    slliw    t2, a5, 3
> +    mulw    t1, a5, a4
> +    sh3add    a5, a4, t2
> +    slliw    a4, a4, 3
> +    subw    a5, t1, a5
> +    subw    a7, a4, t1
> +    addiw    a6, a5, 64
> +    subw    t0, t2, t1
>

Coding style issue - we style our RISC-V assembly the same way
we style our AArch64 assembly:

<8 spaces><instruction><spaces until the 24th character on the line><arguments, registers, etc.>

For example:
>         vsetvl          zero, a4, a5
>         lw              t2, 20(a1)
>         vfmul.vv        v8, v24, v16
>         addi            a0, a0, 4
>         vslide1down.vx  v16, v16, t2
>         MACRO           arg1, arg2


> +.LBB0_8:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> +    add    a5, a1, a4
> +    vsetvli    zero, zero, e8, m1, ta, ma
> +    addiw    t1, t1, 4
> +    vle8.v    v8, (a5)
> +    add    a5, a5, a2
> +    add    t2, a5, a2
> +    vwmulu.vx    v10, v8, a6
>

This branch looks very similar to
> .LBB1_16:                               # the final else, none of the above conditions are met
>     add    a5, a1, a4
>     vsetvli    zero, zero, e8, m1, ta, ma
>     addiw    t0, t0, 4
>     vle8.v    v8, (a5)
>     add    a5, a5, a2
>     add    t1, a5, a2
>     vwmulu.vx    v10, v8, a6

Consider using a macro.

In fact, a lot of the branches look similar to each other. Looking at other
implementations, they only consider 3 possible variants, the same ones
that the C function has.
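
As a rough illustration (the macro name and register numbers are placeholders,
not a drop-in patch), the load/filter/round/store step those branches keep
repeating could be factored out along these lines:

> .macro  chroma_mc8_row  dst, src, coef
>         vle8.v          v8, (\src)
>         vwmulu.vx       v10, v8, \coef
>         vnclipu.wi      v8, v10, 6
>         vse8.v          v8, (\dst)
> .endm

One such macro per variant (with the extra vwmaccu.vx terms where needed) would
shrink the file considerably.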


> +    .size    h264_avg_chroma_mc8_rvv, .Lfunc_end1-h264_avg_chroma_mc8_rvv
> diff --git a/libavcodec/riscv/h264_mc_chroma.h b/libavcodec/riscv/h264_mc_chroma.h
> new file mode 100644
> index 0000000000..cb350d0e4a
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.h
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
> +#define AVCODEC_RISCV_H264_MC_CHROMA_H
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <string.h>
> +#include <stddef.h>
> +#include "config.h"
>

You don't need all of these includes. Just config.h and stdint.h would be enough.


> +#if HAVE_RVV
> +void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +#endif
> +#endif
> \ No newline at end of file
>

You need your file to end in a newline. Git already warns you if you don't.

Finally, run:
make checkasm && ./tests/checkasm/checkasm --bench
and report on the timings for both the C and assembly versions.
If you've made a mistake somewhere (forgot to restore stack, or a callee-saved register,
or your function produces an incorrect result), checkasm will fail.

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-17 14:54 ` Lynne
@ 2023-05-18  8:20   ` Arnie Chang
  2023-05-18 12:53     ` Ronald S. Bultje
  2023-05-18 16:56     ` Lynne
  2023-05-19 17:15   ` Rémi Denis-Courmont
  2023-05-20 10:03   ` Rémi Denis-Courmont
  2 siblings, 2 replies; 11+ messages in thread
From: Arnie Chang @ 2023-05-18  8:20 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wed, May 17, 2023 at 10:54 PM Lynne <dev@lynne.ee> wrote:

>
> Finally, run:
> make checkasm && ./tests/checkasm/checkasm --bench
> and report on the timings for both the C and assembly versions.
> If you've made a mistake somewhere, (forgot to restore stack, or a
> callee-saved register,
> or your function produces an incorrect result), checkasm will fail.
>

The checkasm tests do not cover motion compensation of chroma samples in the
H.264 decoder,
so my modification cannot be tested there.

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-18  8:20   ` Arnie Chang
@ 2023-05-18 12:53     ` Ronald S. Bultje
  2023-05-18 16:56     ` Lynne
  1 sibling, 0 replies; 11+ messages in thread
From: Ronald S. Bultje @ 2023-05-18 12:53 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Hi,

On Thu, May 18, 2023 at 4:21 AM Arnie Chang <arnie.chang@sifive.com> wrote:

> On Wed, May 17, 2023 at 10:54 PM Lynne <dev@lynne.ee> wrote:
>
> >
> > Finally, run:
> > make checkasm && ./tests/checkasm/checkasm --bench
> > and report on the timings for both the C and assembly versions.
> > If you've made a mistake somewhere, (forgot to restore stack, or a
> > callee-saved register,
> > or your function produces an incorrect result), checkasm will fail.
> >
>
> The checkasm does not cover motion compensation of chroma samples in the
> H.264 decoder,
> so my modification can not be tested.
>

Maybe you could fix that?

Ronald

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-18  8:20   ` Arnie Chang
  2023-05-18 12:53     ` Ronald S. Bultje
@ 2023-05-18 16:56     ` Lynne
  1 sibling, 0 replies; 11+ messages in thread
From: Lynne @ 2023-05-18 16:56 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

May 18, 2023, 10:21 by arnie.chang@sifive.com:

> On Wed, May 17, 2023 at 10:54 PM Lynne <dev@lynne.ee> wrote:
>
>>
>> Finally, run:
>> make checkasm && ./tests/checkasm/checkasm --bench
>> and report on the timings for both the C and assembly versions.
>> If you've made a mistake somewhere, (forgot to restore stack, or a
>> callee-saved register,
>> or your function produces an incorrect result), checkasm will fail.
>>
>
> The checkasm does not cover motion compensation of chroma samples in the
> H.264 decoder,
> so my modification can not be tested.
>

I added one here: http://ffmpeg.org/pipermail/ffmpeg-devel/2023-May/309812.html
Follow the example and add a test next time.
I wouldn't accept RISC-V asm without checkasm: the platform is too new (it doesn't
even have hardware yet) and is probably prone to silicon issues.
We've found a few silicon bugs through checkasm over the years.
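
For reference, the core of such a test is small - roughly this shape (written
from memory, so take the buffer sizes and names as a sketch rather than the
actual patch):

    #include <string.h>
    #include <stdint.h>
    #include "checkasm.h"
    #include "libavcodec/h264chroma.h"

    static void check_chroma_mc8(void)
    {
        uint8_t dst0[16 * 16] = { 0 }, dst1[16 * 16] = { 0 }, src[16 * 16];
        H264ChromaContext c;
        declare_func(void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                     int h, int x, int y);

        for (size_t i = 0; i < sizeof(src); i++)
            src[i] = rnd();
        ff_h264chroma_init(&c, 8);

        if (check_func(c.put_h264_chroma_pixels_tab[0], "put_h264_chroma_mc8")) {
            call_ref(dst0, src, 16, 8, 1, 1);
            call_new(dst1, src, 16, 8, 1, 1);
            if (memcmp(dst0, dst1, sizeof(dst0)))
                fail();
            bench_new(dst1, src, 16, 8, 1, 1);
        }
        report("chroma_mc8");
    }

The avg function gets the same treatment, and the whole thing has to be hooked
up in tests/checkasm/checkasm.c and the checkasm Makefile.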


* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-17  7:13 [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks Arnie Chang
  2023-05-17 14:54 ` Lynne
@ 2023-05-19 17:12 ` Rémi Denis-Courmont
  2023-05-22 14:15   ` Arnie Chang
  1 sibling, 1 reply; 11+ messages in thread
From: Rémi Denis-Courmont @ 2023-05-19 17:12 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Arnie Chang

On Wednesday, 17 May 2023 at 10:13:01 EEST, Arnie Chang wrote:
> Optimize the put and avg filtering for 8x8 chroma blocks
> 
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
>  libavcodec/h264chroma.c                   |   2 +
>  libavcodec/h264chroma.h                   |   1 +
>  libavcodec/riscv/Makefile                 |   3 +
>  libavcodec/riscv/h264_chroma_init_riscv.c |  39 ++
>  libavcodec/riscv/h264_mc_chroma.S         | 492 ++++++++++++++++++++++
>  libavcodec/riscv/h264_mc_chroma.h         |  34 ++
>  6 files changed, 571 insertions(+)
>  create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
>  create mode 100644 libavcodec/riscv/h264_mc_chroma.S
>  create mode 100644 libavcodec/riscv/h264_mc_chroma.h
> 
> diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
> index 60b86b6fba..1eeab7bc40 100644
> --- a/libavcodec/h264chroma.c
> +++ b/libavcodec/h264chroma.c
> @@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int
> bit_depth) ff_h264chroma_init_mips(c, bit_depth);
>  #elif ARCH_LOONGARCH64
>      ff_h264chroma_init_loongarch(c, bit_depth);
> +#elif ARCH_RISCV
> +    ff_h264chroma_init_riscv(c, bit_depth);
>  #endif
>  }
> diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
> index b8f9c8f4fc..9c81c18a76 100644
> --- a/libavcodec/h264chroma.h
> +++ b/libavcodec/h264chroma.h
> @@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int
> bit_depth); void ff_h264chroma_init_x86(H264ChromaContext *c, int
> bit_depth); void ff_h264chroma_init_mips(H264ChromaContext *c, int
> bit_depth); void ff_h264chroma_init_loongarch(H264ChromaContext *c, int
> bit_depth); +void ff_h264chroma_init_riscv(H264ChromaContext *c, int
> bit_depth);
> 
>  #endif /* AVCODEC_H264CHROMA_H */
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 965942f4df..08b76c93cb 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -19,3 +19,6 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
>  RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +
> +OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> +RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o

Please maintain the existing ordering, which is to say, alphabetical.

> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c
> b/libavcodec/riscv/h264_chroma_init_riscv.c new file mode 100644
> index 0000000000..b6f98ba693
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> +    const int high_bit_depth = bit_depth > 8;
> +
> +    if (!high_bit_depth) {
> +        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> +        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> +    }
> +#endif
> +}
> \ No newline at end of file
> diff --git a/libavcodec/riscv/h264_mc_chroma.S
> b/libavcodec/riscv/h264_mc_chroma.S new file mode 100644
> index 0000000000..a02866f633
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -0,0 +1,492 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA + */
> +    .text
> +
> +    .globl    h264_put_chroma_mc8_rvv
> +    .p2align    1
> +    .type    h264_put_chroma_mc8_rvv,@function
> +h264_put_chroma_mc8_rvv:
> +    slliw    t2, a5, 3
> +    mulw    t1, a5, a4
> +    sh3add    a5, a4, t2
> +    slliw    a4, a4, 3
> +    subw    a5, t1, a5
> +    subw    a7, a4, t1
> +    addiw    a6, a5, 64
> +    subw    t0, t2, t1

I would avoid narrow instructions unless strictly necessary, so that we don't 
unnecessarily restrict target platforms.

> +    vsetivli    t3, 8, e8, m1, ta, mu
> +    beqz    t1, .LBB0_4
> +    blez    a3, .LBB0_17

If you're going to use numeric labels (which is totally fine, I do that too), 
then you really can just use the usual numeric label assembler syntax. There 
is no need to create local branch symbols.
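
i.e. simply something like (illustration only):

1:
        addiw   t2, t2, 4
        /* rest of the loop body */
        blt     t2, a3, 1b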

> +    li    t4, 0
> +    li    t2, 0
> +    addi    a5, t3, 1
> +    slli    t3, a2, 2
> +.LBB0_3:                                # if (xy != 0)
> +    add    a4, a1, t4
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    addiw    t2, t2, 4
> +    vle8.v    v10, (a4)
> +    add    a4, a4, a2
> +    vslidedown.vi    v11, v10, 1

Isn't vslide1down.vx zero potentially faster than vslidedown.vi 1?

> +    vsetivli    zero, 8, e8, m1, ta, ma

Do we really need to reconfigure the active vector length so many times? I 
suspect that is not going to go down too well with some implementations.

> +    vwmulu.vx    v8, v10, a6
> +    vwmaccu.vx    v8, a7, v11
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vle8.v    v12, (a4)
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    add    a4, a4, a2
> +    vwmaccu.vx    v8, t0, v12
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vslidedown.vi    v13, v12, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v10, v12, a6
> +    vwmaccu.vx    v8, t1, v13
> +    vwmaccu.vx    v10, a7, v13
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vle8.v    v14, (a4)
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    add    a4, a4, a2
> +    vwmaccu.vx    v10, t0, v14
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vslidedown.vi    v15, v14, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v12, v14, a6
> +    vwmaccu.vx    v10, t1, v15
> +    vwmaccu.vx    v12, a7, v15
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vle8.v    v14, (a4)
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    add    a4, a4, a2
> +    vwmaccu.vx    v12, t0, v14
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vslidedown.vi    v15, v14, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v16, v14, a6
> +    vwmaccu.vx    v12, t1, v15
> +    vwmaccu.vx    v16, a7, v15
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vle8.v    v14, (a4)
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    add    a4, a0, t4
> +    add    t4, t4, t3

I could be totally wrong since I have no hardware to verify with, but I would 
assume that it is preferable to interleave independent scalar and vector 
instructions wherever possible. For out-of-order processors, it shouldn't 
matter, but I suppose that it would on in-order multi-issue processors.
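
For instance, the two scalar adds quoted above could be spread in between the
vector operations rather than issued back to back, e.g. (illustration only,
the vsetvli juggling omitted):

        vle8.v          v14, (a4)
        add             a4, a0, t4
        vwmaccu.vx      v16, t0, v14
        add             t4, t4, t3
        vslidedown.vi   v14, v14, 1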

> +    vwmaccu.vx    v16, t0, v14
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vslidedown.vi    v14, v14, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vnclipu.wi    v15, v8, 6
> +    vwmaccu.vx    v16, t1, v14
> +    vse8.v    v15, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v10, 6
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v12, 6
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v16, 6
> +    vse8.v    v8, (a4)
> +    blt    t2, a3, .LBB0_3
> +    j    .LBB0_17
> +.LBB0_4:
> +    bnez    a4, .LBB0_9
> +    beqz    t2, .LBB0_9
> +    blez    a3, .LBB0_17
> +    li    a4, 0
> +    li    t1, 0
> +    slli    a7, a2, 2
> +.LBB0_8:                                # if ((x8 - xy) == 0 && (y8 -xy) !=
> 0) +    add    a5, a1, a4
> +    vsetvli    zero, zero, e8, m1, ta, ma
> +    addiw    t1, t1, 4
> +    vle8.v    v8, (a5)
> +    add    a5, a5, a2
> +    add    t2, a5, a2
> +    vwmulu.vx    v10, v8, a6
> +    vle8.v    v8, (a5)
> +    vwmulu.vx    v12, v8, a6
> +    vle8.v    v9, (t2)
> +    add    t2, t2, a2
> +    add    a5, t2, a2
> +    vwmaccu.vx    v10, t0, v8
> +    vle8.v    v8, (t2)
> +    vle8.v    v14, (a5)
> +    add    a5, a0, a4
> +    add    a4, a4, a7
> +    vwmaccu.vx    v12, t0, v9
> +    vnclipu.wi    v15, v10, 6
> +    vwmulu.vx    v10, v9, a6
> +    vse8.v    v15, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v9, v12, 6
> +    vwmaccu.vx    v10, t0, v8
> +    vwmulu.vx    v12, v8, a6
> +    vse8.v    v9, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v10, 6
> +    vwmaccu.vx    v12, t0, v14
> +    vse8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v12, 6
> +    vse8.v    v8, (a5)
> +    blt    t1, a3, .LBB0_8
> +    j    .LBB0_17
> +.LBB0_9:
> +    beqz    a4, .LBB0_14
> +    bnez    t2, .LBB0_14
> +    blez    a3, .LBB0_17
> +    li    a4, 0
> +    li    t2, 0
> +    addi    t0, t3, 1
> +    slli    t1, a2, 2
> +.LBB0_13:                               # if ((x8 - xy) != 0 && (y8 -xy) ==
> 0) +    add    a5, a1, a4
> +    vsetvli    zero, t0, e8, m1, ta, ma
> +    addiw    t2, t2, 4
> +    vle8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vslidedown.vi    v9, v8, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v10, v8, a6
> +    vwmaccu.vx    v10, a7, v9
> +    vsetvli    zero, t0, e8, m1, ta, ma
> +    vle8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vslidedown.vi    v9, v8, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v12, v8, a6
> +    vwmaccu.vx    v12, a7, v9
> +    vsetvli    zero, t0, e8, m1, ta, ma
> +    vle8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vslidedown.vi    v9, v8, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v14, v8, a6
> +    vwmaccu.vx    v14, a7, v9
> +    vsetvli    zero, t0, e8, m1, ta, ma
> +    vle8.v    v8, (a5)
> +    add    a5, a0, a4
> +    add    a4, a4, t1
> +    vslidedown.vi    v9, v8, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vnclipu.wi    v16, v10, 6
> +    vse8.v    v16, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v10, v12, 6
> +    vwmulu.vx    v12, v8, a6
> +    vse8.v    v10, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v14, 6
> +    vwmaccu.vx    v12, a7, v9
> +    vse8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v12, 6
> +    vse8.v    v8, (a5)
> +    blt    t2, a3, .LBB0_13
> +    j    .LBB0_17
> +.LBB0_14:
> +    blez    a3, .LBB0_17
> +    li    a4, 0
> +    li    t2, 0
> +    slli    a7, a2, 2
> +.LBB0_16:                               # the final else, none of the above
> conditions are met +    add    t0, a1, a4
> +    vsetvli    zero, zero, e8, m1, ta, ma
> +    add    a5, a0, a4
> +    add    a4, a4, a7
> +    addiw    t2, t2, 4
> +    vle8.v    v8, (t0)
> +    add    t0, t0, a2
> +    add    t1, t0, a2
> +    vwmulu.vx    v10, v8, a6
> +    vle8.v    v8, (t0)
> +    add    t0, t1, a2
> +    vle8.v    v9, (t1)
> +    vle8.v    v12, (t0)
> +    vnclipu.wi    v13, v10, 6
> +    vwmulu.vx    v10, v8, a6
> +    vse8.v    v13, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v10, 6
> +    vwmulu.vx    v10, v9, a6
> +    vse8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v10, 6
> +    vwmulu.vx    v10, v12, a6
> +    vse8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v10, 6
> +    vse8.v    v8, (a5)
> +    blt    t2, a3, .LBB0_16
> +.LBB0_17:                               # Exit h264_put_chroma_mc8_rvv
> +    ret
> +.Lfunc_end0:
> +    .size    h264_put_chroma_mc8_rvv, .Lfunc_end0-h264_put_chroma_mc8_rvv
> +
> +    .globl    h264_avg_chroma_mc8_rvv
> +    .p2align    1
> +    .type    h264_avg_chroma_mc8_rvv,@function
> +h264_avg_chroma_mc8_rvv:
> +    slliw    t2, a5, 3
> +    mulw    t1, a5, a4
> +    sh3add    a5, a4, t2
> +    slliw    a4, a4, 3
> +    subw    a5, t1, a5
> +    subw    a7, a4, t1
> +    addiw    a6, a5, 64
> +    subw    t0, t2, t1
> +    vsetivli    t3, 8, e8, m1, ta, mu
> +    beqz    t1, .LBB1_4
> +    blez    a3, .LBB1_17
> +    li    t4, 0
> +    li    t2, 0
> +    addi    a5, t3, 1
> +    slli    t3, a2, 2
> +.LBB1_3:                                # if (xy != 0)
> +    add    a4, a1, t4
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    addiw    t2, t2, 4
> +    vle8.v    v10, (a4)
> +    add    a4, a4, a2
> +    vslidedown.vi    v11, v10, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v8, v10, a6
> +    vwmaccu.vx    v8, a7, v11
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vle8.v    v12, (a4)
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    add    a4, a4, a2
> +    vwmaccu.vx    v8, t0, v12
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vslidedown.vi    v13, v12, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v10, v12, a6
> +    vwmaccu.vx    v8, t1, v13
> +    vwmaccu.vx    v10, a7, v13
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vle8.v    v14, (a4)
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    add    a4, a4, a2
> +    vwmaccu.vx    v10, t0, v14
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vslidedown.vi    v15, v14, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v12, v14, a6
> +    vwmaccu.vx    v10, t1, v15
> +    vwmaccu.vx    v12, a7, v15
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vle8.v    v14, (a4)
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    add    a4, a4, a2
> +    vwmaccu.vx    v12, t0, v14
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vslidedown.vi    v15, v14, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v16, v14, a6
> +    vwmaccu.vx    v12, t1, v15
> +    vwmaccu.vx    v16, a7, v15
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vle8.v    v14, (a4)
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    add    a4, a0, t4
> +    add    t4, t4, t3
> +    vwmaccu.vx    v16, t0, v14
> +    vsetvli    zero, a5, e8, m1, ta, ma
> +    vslidedown.vi    v14, v14, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vnclipu.wi    v15, v8, 6
> +    vle8.v    v8, (a4)
> +    vwmaccu.vx    v16, t1, v14
> +    vaaddu.vv    v8, v15, v8
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v10, 6
> +    vle8.v    v9, (a4)
> +    vaaddu.vv    v8, v8, v9
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v12, 6
> +    vle8.v    v9, (a4)
> +    vaaddu.vv    v8, v8, v9
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v16, 6
> +    vle8.v    v9, (a4)
> +    vaaddu.vv    v8, v8, v9
> +    vse8.v    v8, (a4)
> +    blt    t2, a3, .LBB1_3
> +    j    .LBB1_17
> +.LBB1_4:
> +    bnez    a4, .LBB1_9
> +    beqz    t2, .LBB1_9
> +    blez    a3, .LBB1_17
> +    li    t2, 0
> +    li    t1, 0
> +    slli    a7, a2, 2
> +.LBB1_8:                                # if ((x8 - xy) == 0 && (y8 -xy) !=
> 0) +    add    a4, a1, t2
> +    vsetvli    zero, zero, e8, m1, ta, ma
> +    addiw    t1, t1, 4
> +    vle8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vwmulu.vx    v10, v8, a6
> +    vle8.v    v8, (a4)
> +    add    a4, a4, a2
> +    add    a5, a4, a2
> +    vle8.v    v9, (a4)
> +    add    a4, a5, a2
> +    vle8.v    v12, (a5)
> +    vwmaccu.vx    v10, t0, v8
> +    vle8.v    v13, (a4)
> +    add    a4, a0, t2
> +    add    t2, t2, a7
> +    vnclipu.wi    v14, v10, 6
> +    vwmulu.vx    v10, v8, a6
> +    vle8.v    v8, (a4)
> +    vaaddu.vv    v8, v14, v8
> +    vwmaccu.vx    v10, t0, v9
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v10, 6
> +    vwmulu.vx    v10, v9, a6
> +    vle8.v    v9, (a4)
> +    vaaddu.vv    v8, v8, v9
> +    vwmaccu.vx    v10, t0, v12
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v10, 6
> +    vwmulu.vx    v10, v12, a6
> +    vle8.v    v9, (a4)
> +    vaaddu.vv    v8, v8, v9
> +    vwmaccu.vx    v10, t0, v13
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v10, 6
> +    vle8.v    v9, (a4)
> +    vaaddu.vv    v8, v8, v9
> +    vse8.v    v8, (a4)
> +    blt    t1, a3, .LBB1_8
> +    j    .LBB1_17
> +.LBB1_9:
> +    beqz    a4, .LBB1_14
> +    bnez    t2, .LBB1_14
> +    blez    a3, .LBB1_17
> +    li    a5, 0
> +    li    t2, 0
> +    addi    t0, t3, 1
> +    slli    t1, a2, 2
> +.LBB1_13:                               # if ((x8 - xy) != 0 && (y8 -xy) ==
> 0) +    add    a4, a1, a5
> +    vsetvli    zero, t0, e8, m1, ta, ma
> +    addiw    t2, t2, 4
> +    vle8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vslidedown.vi    v9, v8, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v10, v8, a6
> +    vwmaccu.vx    v10, a7, v9
> +    vsetvli    zero, t0, e8, m1, ta, ma
> +    vle8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vslidedown.vi    v9, v8, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v12, v8, a6
> +    vwmaccu.vx    v12, a7, v9
> +    vsetvli    zero, t0, e8, m1, ta, ma
> +    vle8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vslidedown.vi    v9, v8, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vwmulu.vx    v14, v8, a6
> +    vwmaccu.vx    v14, a7, v9
> +    vsetvli    zero, t0, e8, m1, ta, ma
> +    vle8.v    v8, (a4)
> +    add    a4, a0, a5
> +    add    a5, a5, t1
> +    vslidedown.vi    v9, v8, 1
> +    vsetivli    zero, 8, e8, m1, ta, ma
> +    vnclipu.wi    v16, v10, 6
> +    vle8.v    v10, (a4)
> +    vaaddu.vv    v10, v16, v10
> +    vse8.v    v10, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v10, v12, 6
> +    vle8.v    v11, (a4)
> +    vwmulu.vx    v12, v8, a6
> +    vaaddu.vv    v10, v10, v11
> +    vwmaccu.vx    v12, a7, v9
> +    vse8.v    v10, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v10, v14, 6
> +    vle8.v    v8, (a4)
> +    vaaddu.vv    v8, v10, v8
> +    vse8.v    v8, (a4)
> +    add    a4, a4, a2
> +    vnclipu.wi    v8, v12, 6
> +    vle8.v    v9, (a4)
> +    vaaddu.vv    v8, v8, v9
> +    vse8.v    v8, (a4)
> +    blt    t2, a3, .LBB1_13
> +    j    .LBB1_17
> +.LBB1_14:
> +    blez    a3, .LBB1_17
> +    li    a4, 0
> +    li    t0, 0
> +    slli    a7, a2, 2
> +.LBB1_16:                               # the final else, none of the above
> conditions are met +    add    a5, a1, a4
> +    vsetvli    zero, zero, e8, m1, ta, ma
> +    addiw    t0, t0, 4
> +    vle8.v    v8, (a5)
> +    add    a5, a5, a2
> +    add    t1, a5, a2
> +    vwmulu.vx    v10, v8, a6
> +    vle8.v    v8, (a5)
> +    add    a5, t1, a2
> +    vle8.v    v9, (t1)
> +    vle8.v    v12, (a5)
> +    add    a5, a0, a4
> +    add    a4, a4, a7
> +    vnclipu.wi    v13, v10, 6
> +    vle8.v    v10, (a5)
> +    vwmulu.vx    v14, v8, a6
> +    vaaddu.vv    v10, v13, v10
> +    vse8.v    v10, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v14, 6
> +    vle8.v    v10, (a5)
> +    vaaddu.vv    v8, v8, v10
> +    vwmulu.vx    v10, v9, a6
> +    vse8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v10, 6
> +    vle8.v    v9, (a5)
> +    vwmulu.vx    v10, v12, a6
> +    vaaddu.vv    v8, v8, v9
> +    vse8.v    v8, (a5)
> +    add    a5, a5, a2
> +    vnclipu.wi    v8, v10, 6
> +    vle8.v    v9, (a5)
> +    vaaddu.vv    v8, v8, v9
> +    vse8.v    v8, (a5)
> +    blt    t0, a3, .LBB1_16
> +.LBB1_17:                               # Exit h264_avg_chroma_mc8_rvv
> +    ret
> +.Lfunc_end1:
> +    .size    h264_avg_chroma_mc8_rvv, .Lfunc_end1-h264_avg_chroma_mc8_rvv

-- 
Реми Дёни-Курмон
http://www.remlab.net/




* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-17 14:54 ` Lynne
  2023-05-18  8:20   ` Arnie Chang
@ 2023-05-19 17:15   ` Rémi Denis-Courmont
  2023-05-19 18:52     ` Lynne
  2023-05-20 10:03   ` Rémi Denis-Courmont
  2 siblings, 1 reply; 11+ messages in thread
From: Rémi Denis-Courmont @ 2023-05-19 17:15 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wednesday, 17 May 2023 at 17:54:22 EEST, Lynne wrote:
> Finally, run:
> make checkasm && ./tests/checkasm/checkasm --bench
> and report on the timings for both the C and assembly versions.
> If you've made a mistake somewhere, (forgot to restore stack, or a
> callee-saved register, or your function produces an incorrect result),
> checkasm will fail.

To be fair, in this particular case, the stack pointer and saved registers are 
never used, so the risk of messing those up is zero.

checkasm would of course verify that the function does what it is supposed to 
do, and personally I have kept away from untested functions. But I am not sure it 
is fair to require adding test cases when other architectures weren't 
required to have them.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/




* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-19 17:15   ` Rémi Denis-Courmont
@ 2023-05-19 18:52     ` Lynne
  2023-05-20  9:59       ` Rémi Denis-Courmont
  0 siblings, 1 reply; 11+ messages in thread
From: Lynne @ 2023-05-19 18:52 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

May 19, 2023, 19:16 by remi@remlab.net:

> On Wednesday, 17 May 2023 at 17:54:22 EEST, Lynne wrote:
>
>> Finally, run:
>> make checkasm && ./tests/checkasm/checkasm --bench
>> and report on the timings for both the C and assembly versions.
>> If you've made a mistake somewhere, (forgot to restore stack, or a
>> callee-saved register, or your function produces an incorrect result),
>> checkasm will fail.
>>
>
> To be fair, in this particular case, the stack pointer and saved registers are 
> never used, so the risk of messing those are zero.
>
> checkasm would of course verify that the function does that it is supposed to 
> do, and personally, I have kept off untested functions. But I am not sure if it 
> is fair to require adding test cases whilst other architectures weren't 
> required to have them.
>

Other hardware exists, and even without checkasm, bugs are found
through FATE. Right now, the only things that can run this code are FPGAs
and emulators, which are both software, something to which bugs are old friends.

* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-19 18:52     ` Lynne
@ 2023-05-20  9:59       ` Rémi Denis-Courmont
  0 siblings, 0 replies; 11+ messages in thread
From: Rémi Denis-Courmont @ 2023-05-20  9:59 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Friday, 19 May 2023 at 21:52:57 EEST, Lynne wrote:
> May 19, 2023, 19:16 by remi@remlab.net:
> > On Wednesday, 17 May 2023 at 17:54:22 EEST, Lynne wrote:
> >> Finally, run:
> >> make checkasm && ./tests/checkasm/checkasm --bench
> >> and report on the timings for both the C and assembly versions.
> >> If you've made a mistake somewhere, (forgot to restore stack, or a
> >> callee-saved register, or your function produces an incorrect result),
> >> checkasm will fail.
> > 
> > To be fair, in this particular case, the stack pointer and saved registers
> > are never used, so the risk of messing those are zero.
> > 
> > checkasm would of course verify that the function does that it is supposed
> > to do, and personally, I have kept off untested functions. But I am not
> > sure if it is fair to require adding test cases whilst other
> > architectures weren't required to have them.
> 
> Other hardware exists, and even without checkasm, bugs are found
> through fate.

There are exactly two FATE RISC-V instances, of which exactly zero support 
vectors to date, so the chance of FATE finding bugs there is nil. I do 
appreciate that checkasm would make revectoring the patchset easier and 
safer, though.

That said, the argument is moot seen as you seem to have posted a fix.

-- 
Реми Дёни-Курмон
http://www.remlab.net/




* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-17 14:54 ` Lynne
  2023-05-18  8:20   ` Arnie Chang
  2023-05-19 17:15   ` Rémi Denis-Courmont
@ 2023-05-20 10:03   ` Rémi Denis-Courmont
  2 siblings, 0 replies; 11+ messages in thread
From: Rémi Denis-Courmont @ 2023-05-20 10:03 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Wednesday, 17 May 2023 at 17:54:22 EEST, Lynne wrote:
> Finally, run:
> make checkasm && ./tests/checkasm/checkasm --bench
> and report on the timings for both the C and assembly versions.
> If you've made a mistake somewhere, (forgot to restore stack, or a
> callee-saved register, or your function produces an incorrect result),
> checkasm will fail.

I don't specifically know SiFive's policies. In my experience, however, silicon 
design companies will ABSOLUTELY NOT publish benchmark results for unreleased 
products at any stage of development (including FPGA simulation). Your say-so is 
unlikely to change those policies, so I don't think we can require RVV 
benchmarks this year.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/




* Re: [FFmpeg-devel] [PATCH] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-19 17:12 ` Rémi Denis-Courmont
@ 2023-05-22 14:15   ` Arnie Chang
  0 siblings, 0 replies; 11+ messages in thread
From: Arnie Chang @ 2023-05-22 14:15 UTC (permalink / raw)
  To: Rémi Denis-Courmont; +Cc: ffmpeg-devel

On Sat, May 20, 2023 at 1:12 AM Rémi Denis-Courmont <remi@remlab.net> wrote:

> > +    li    t4, 0
> > +    li    t2, 0
> > +    addi    a5, t3, 1
> > +    slli    t3, a2, 2
> > +.LBB0_3:                                # if (xy != 0)
> > +    add    a4, a1, t4
> > +    vsetvli    zero, a5, e8, m1, ta, ma
> > +    addiw    t2, t2, 4
> > +    vle8.v    v10, (a4)
> > +    add    a4, a4, a2
> > +    vslidedown.vi    v11, v10, 1
>
> Isn't vslide1down.vx zero potentially faster than vslidedown.vi 1?
>

It depends on the hardware design,
but in general, vslide1down.vx is typically not slower than vslidedown.vi.
Using vslide1down.vx would be better here; I will fix it.



> > +    vsetivli    zero, 8, e8, m1, ta, ma
>
> Do we really need to reconfigure the active vector length so many times? I
> suspect that is not going to go down to well with some implementations.
>

We need to reconfigure it because the VL is changed.
The VL for vslidedown differs from that of the other instructions.

> +    vwmaccu.vx    v12, t1, v15
> > +    vwmaccu.vx    v16, a7, v15
> > +    vsetvli    zero, a5, e8, m1, ta, ma
> > +    vle8.v    v14, (a4)
> > +    vsetivli    zero, 8, e8, m1, ta, ma
> > +    add    a4, a0, t4
> > +    add    t4, t4, t3
>
> I could be totally wrong since I have no hardware to verify with, but I
> would
> assume that it is preferable to interleave independent scalar and vector
> instructions whence possible. For out-of-order processors, it shouldn't
> matter, but I suppose that it would on in-order multi-issue processors.
>

Interleaving those instructions can improve overall performance.

