Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
@ 2023-05-22 14:48 Arnie Chang
  2023-05-22 16:42 ` Rémi Denis-Courmont
  2023-05-22 16:44 ` Lynne
  0 siblings, 2 replies; 3+ messages in thread
From: Arnie Chang @ 2023-05-22 14:48 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Arnie Chang

Optimize the put and avg filtering for 8x8 chroma blocks

Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
V3:
1. Use a macro to extract repetitive segments
2. Fix coding style issues
3. Use macros in riscv/asm.S to handle function declarations
4. Replace vslidedown with vslide1down
checkasm: using random seed 2379273251
RVVi32:
 - h264dsp.chroma_mc         [OK]
checkasm: all 2 tests passed
avg_h264_chroma_mc1_8_c: 1821.5
avg_h264_chroma_mc1_8_rvv_i32: 482.5
put_h264_chroma_mc1_8_c: 1436.5
put_h264_chroma_mc1_8_rvv_i32: 390.5

 libavcodec/h264chroma.c                   |   2 +
 libavcodec/h264chroma.h                   |   1 +
 libavcodec/riscv/Makefile                 |   2 +
 libavcodec/riscv/h264_chroma_init_riscv.c |  40 +++
 libavcodec/riscv/h264_mc_chroma.S         | 306 ++++++++++++++++++++++
 libavcodec/riscv/h264_mc_chroma.h         |  30 +++
 6 files changed, 381 insertions(+)
 create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
 create mode 100644 libavcodec/riscv/h264_mc_chroma.S
 create mode 100644 libavcodec/riscv/h264_mc_chroma.h

diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index 60b86b6fba..1eeab7bc40 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
     ff_h264chroma_init_mips(c, bit_depth);
 #elif ARCH_LOONGARCH64
     ff_h264chroma_init_loongarch(c, bit_depth);
+#elif ARCH_RISCV
+    ff_h264chroma_init_riscv(c, bit_depth);
 #endif
 }
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index b8f9c8f4fc..9c81c18a76 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 965942f4df..ee17a521fd 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -10,6 +10,8 @@ OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_init.o \
 RVV-OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_rvv.o
 OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_init.o
 RVV-OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_rvv.o
+OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
+RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
 OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
 RVV-OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_rvv.o
 OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
new file mode 100644
index 0000000000..2e47f1365e
--- /dev/null
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/h264chroma.h"
+#include "config.h"
+#include "h264_mc_chroma.h"
+
+av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
+{
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
+        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+    }
+#endif
+}
+
diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
new file mode 100644
index 0000000000..1c373c8cc7
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.S
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/riscv/asm.S"
+
+.macro  h264_chroma_mc8 type
+func h264_\type\()_chroma_mc8_rvv, zvl128b
+        slliw           t2, a5, 3
+        mulw            t1, a5, a4
+        sh3add          a5, a4, t2
+        slliw           a4, a4, 3
+        subw            a5, t1, a5
+        subw            a7, a4, t1
+        addiw           a6, a5, 64
+        subw            t0, t2, t1
+        vsetivli        t3, 8, e8, m1, ta, mu
+        beqz            t1, 2f
+        blez            a3, 8f
+        li              t4, 0
+        li              t2, 0
+        li              t5, 1
+        addi            a5, t3, 1
+        slli            t3, a2, 2
+1:                                # if (xy != 0)
+        add             a4, a1, t4
+        vsetvli         zero, a5, e8, m1, ta, ma
+        addiw           t2, t2, 4
+        vle8.v          v10, (a4)
+        add             a4, a4, a2
+        vslide1down.vx  v11, v10, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vwmulu.vx       v8, v10, a6
+        vwmaccu.vx      v8, a7, v11
+        vsetvli         zero, a5, e8, m1, ta, ma
+        vle8.v          v12, (a4)
+        vsetivli        zero, 8, e8, m1, ta, ma
+        add             a4, a4, a2
+        vwmaccu.vx      v8, t0, v12
+        vsetvli         zero, a5, e8, m1, ta, ma
+        vslide1down.vx  v13, v12, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vwmulu.vx       v10, v12, a6
+        vwmaccu.vx      v8, t1, v13
+        vwmaccu.vx      v10, a7, v13
+        vsetvli         zero, a5, e8, m1, ta, ma
+        vle8.v          v14, (a4)
+        vsetivli        zero, 8, e8, m1, ta, ma
+        add             a4, a4, a2
+        vwmaccu.vx      v10, t0, v14
+        vsetvli         zero, a5, e8, m1, ta, ma
+        vslide1down.vx  v15, v14, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vwmulu.vx       v12, v14, a6
+        vwmaccu.vx      v10, t1, v15
+        vwmaccu.vx      v12, a7, v15
+        vsetvli         zero, a5, e8, m1, ta, ma
+        vle8.v          v14, (a4)
+        vsetivli        zero, 8, e8, m1, ta, ma
+        add             a4, a4, a2
+        vwmaccu.vx      v12, t0, v14
+        vsetvli         zero, a5, e8, m1, ta, ma
+        vslide1down.vx  v15, v14, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vwmulu.vx       v16, v14, a6
+        vwmaccu.vx      v12, t1, v15
+        vwmaccu.vx      v16, a7, v15
+        vsetvli         zero, a5, e8, m1, ta, ma
+        vle8.v          v14, (a4)
+        vsetivli        zero, 8, e8, m1, ta, ma
+        add             a4, a0, t4
+        add             t4, t4, t3
+        vwmaccu.vx      v16, t0, v14
+        vsetvli         zero, a5, e8, m1, ta, ma
+        vslide1down.vx  v14, v14, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vnclipu.wi      v15, v8, 6
+        vwmaccu.vx      v16, t1, v14
+  .ifc \type,avg
+        vle8.v          v9, (a4)
+        vaaddu.vv       v15, v15, v9
+  .endif
+        vse8.v          v15, (a4)
+        add             a4, a4, a2
+        vnclipu.wi      v8, v10, 6
+  .ifc \type,avg
+        vle8.v          v9, (a4)
+        vaaddu.vv       v8, v8, v9
+  .endif
+        vse8.v          v8, (a4)
+        add             a4, a4, a2
+        vnclipu.wi      v8, v12, 6
+  .ifc \type,avg
+        vle8.v          v9, (a4)
+        vaaddu.vv       v8, v8, v9
+  .endif
+        vse8.v          v8, (a4)
+        add             a4, a4, a2
+        vnclipu.wi      v8, v16, 6
+  .ifc \type,avg
+        vle8.v          v9, (a4)
+        vaaddu.vv       v8, v8, v9
+  .endif
+        vse8.v          v8, (a4)
+        blt             t2, a3, 1b
+        j               8f
+2:
+        bnez            a4, 4f
+        beqz            t2, 4f
+        blez            a3, 8f
+        li              a4, 0
+        li              t1, 0
+        slli            a7, a2, 2
+3:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
+        add             a5, a1, a4
+        vsetvli         zero, zero, e8, m1, ta, ma
+        addiw           t1, t1, 4
+        vle8.v          v8, (a5)
+        add             a5, a5, a2
+        add             t2, a5, a2
+        vwmulu.vx       v10, v8, a6
+        vle8.v          v8, (a5)
+        vwmulu.vx       v12, v8, a6
+        vle8.v          v9, (t2)
+        add             t2, t2, a2
+        add             a5, t2, a2
+        vwmaccu.vx      v10, t0, v8
+        vle8.v          v8, (t2)
+        vle8.v          v14, (a5)
+        add             a5, a0, a4
+        add             a4, a4, a7
+        vwmaccu.vx      v12, t0, v9
+        vnclipu.wi      v15, v10, 6
+        vwmulu.vx       v10, v9, a6
+  .ifc \type,avg
+        vle8.v          v16, (a5)
+        vaaddu.vv       v15, v15, v16
+  .endif
+        vse8.v          v15, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v9, v12, 6
+        vwmaccu.vx      v10, t0, v8
+        vwmulu.vx       v12, v8, a6
+  .ifc \type,avg
+        vle8.v          v16, (a5)
+        vaaddu.vv       v9, v9, v16
+  .endif
+        vse8.v          v9, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v8, v10, 6
+        vwmaccu.vx      v12, t0, v14
+  .ifc \type,avg
+        vle8.v          v16, (a5)
+        vaaddu.vv       v8, v8, v16
+  .endif
+        vse8.v          v8, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v8, v12, 6
+  .ifc \type,avg
+        vle8.v          v16, (a5)
+        vaaddu.vv       v8, v8, v16
+  .endif
+        vse8.v          v8, (a5)
+        blt             t1, a3, 3b
+        j               8f
+4:
+        beqz            a4, 6f
+        bnez            t2, 6f
+        blez            a3, 8f
+        li              a4, 0
+        li              t2, 0
+        addi            t0, t3, 1
+        slli            t1, a2, 2
+5:                               # if ((x8 - xy) != 0 && (y8 -xy) == 0)
+        add             a5, a1, a4
+        vsetvli         zero, t0, e8, m1, ta, ma
+        addiw           t2, t2, 4
+        vle8.v          v8, (a5)
+        add             a5, a5, a2
+        vslide1down.vx  v9, v8, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vwmulu.vx       v10, v8, a6
+        vwmaccu.vx      v10, a7, v9
+        vsetvli         zero, t0, e8, m1, ta, ma
+        vle8.v          v8, (a5)
+        add             a5, a5, a2
+        vslide1down.vx  v9, v8, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vwmulu.vx       v12, v8, a6
+        vwmaccu.vx      v12, a7, v9
+        vsetvli         zero, t0, e8, m1, ta, ma
+        vle8.v          v8, (a5)
+        add             a5, a5, a2
+        vslide1down.vx  v9, v8, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vwmulu.vx       v14, v8, a6
+        vwmaccu.vx      v14, a7, v9
+        vsetvli         zero, t0, e8, m1, ta, ma
+        vle8.v          v8, (a5)
+        add             a5, a0, a4
+        add             a4, a4, t1
+        vslide1down.vx  v9, v8, t5
+        vsetivli        zero, 8, e8, m1, ta, ma
+        vnclipu.wi      v16, v10, 6
+  .ifc \type,avg
+        vle8.v          v18, (a5)
+        vaaddu.vv       v16, v16, v18
+  .endif
+        vse8.v          v16, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v10, v12, 6
+        vwmulu.vx       v12, v8, a6
+  .ifc \type,avg
+        vle8.v          v18, (a5)
+        vaaddu.vv       v10, v10, v18
+  .endif
+        vse8.v          v10, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v8, v14, 6
+        vwmaccu.vx      v12, a7, v9
+  .ifc \type,avg
+        vle8.v          v18, (a5)
+        vaaddu.vv       v8, v8, v18
+  .endif
+        vse8.v          v8, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v8, v12, 6
+  .ifc \type,avg
+        vle8.v          v18, (a5)
+        vaaddu.vv       v8, v8, v18
+  .endif
+        vse8.v          v8, (a5)
+        blt             t2, a3, 5b
+        j               8f
+6:
+        blez            a3, 8f
+        li              a4, 0
+        li              t2, 0
+        slli            a7, a2, 2
+7:                               # the final else, none of the above conditions are met
+        add             t0, a1, a4
+        vsetvli         zero, zero, e8, m1, ta, ma
+        add             a5, a0, a4
+        add             a4, a4, a7
+        addiw           t2, t2, 4
+        vle8.v          v8, (t0)
+        add             t0, t0, a2
+        add             t1, t0, a2
+        vwmulu.vx       v10, v8, a6
+        vle8.v          v8, (t0)
+        add             t0, t1, a2
+        vle8.v          v9, (t1)
+        vle8.v          v12, (t0)
+        vnclipu.wi      v13, v10, 6
+        vwmulu.vx       v10, v8, a6
+  .ifc \type,avg
+        vle8.v          v18, (a5)
+        vaaddu.vv       v13, v13, v18
+  .endif
+        vse8.v          v13, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v8, v10, 6
+        vwmulu.vx       v10, v9, a6
+  .ifc \type,avg
+        vle8.v          v18, (a5)
+        vaaddu.vv       v8, v8, v18
+  .endif
+        vse8.v          v8, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v8, v10, 6
+        vwmulu.vx       v10, v12, a6
+  .ifc \type,avg
+        vle8.v          v18, (a5)
+        vaaddu.vv       v8, v8, v18
+  .endif
+        vse8.v          v8, (a5)
+        add             a5, a5, a2
+        vnclipu.wi      v8, v10, 6
+  .ifc \type,avg
+        vle8.v          v18, (a5)
+        vaaddu.vv       v8, v8, v18
+  .endif
+        vse8.v          v8, (a5)
+        blt             t2, a3, 7b
+8:
+        ret
+endfunc
+.endm
+
+h264_chroma_mc8 put
+h264_chroma_mc8 avg
diff --git a/libavcodec/riscv/h264_mc_chroma.h b/libavcodec/riscv/h264_mc_chroma.h
new file mode 100644
index 0000000000..027f2ee053
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
+#define AVCODEC_RISCV_H264_MC_CHROMA_H
+#include "config.h"
+
+#if HAVE_RVV
+void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+#endif
+#endif
+
-- 
2.17.1


* Re: [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-22 14:48 [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks Arnie Chang
@ 2023-05-22 16:42 ` Rémi Denis-Courmont
  2023-05-22 16:44 ` Lynne
  1 sibling, 0 replies; 3+ messages in thread
From: Rémi Denis-Courmont @ 2023-05-22 16:42 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Arnie Chang

Le maanantaina 22. toukokuuta 2023, 17.48.40 EEST Arnie Chang a écrit :
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,40 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> +    int flags = av_get_cpu_flags();
> +
> +    if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> +        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> +    }

You should check that the vector length is large enough:
`ff_get_rv_vlenb() >= 16`
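
A minimal sketch of how the init function could fold that check in (a sketch
only; it assumes ff_get_rv_vlenb() is declared in libavutil/riscv/cpu.h and
returns the vector register length in bytes):

    /* Hypothetical variant of ff_h264chroma_init_riscv(): only install the
     * RVV functions when the vector registers are at least 16 bytes wide,
     * since the kernels read 9 source bytes and widen into 16-bit
     * accumulators. */
    #include "libavutil/riscv/cpu.h"  /* assumed location of ff_get_rv_vlenb() */

    av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
    {
    #if HAVE_RVV
        int flags = av_get_cpu_flags();

        if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
            ff_get_rv_vlenb() >= 16) {
            c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
            c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
        }
    #endif
    }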

> +#endif
> +}
> +
> diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
> new file mode 100644
> index 0000000000..1c373c8cc7
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -0,0 +1,306 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include "libavutil/riscv/asm.S"
> +
> +.macro  h264_chroma_mc8 type
> +func h264_\type\()_chroma_mc8_rvv, zvl128b

That works, but `zve32x` would be more idiomatic and consistent with existing code. Selecting a vector length in the assembler doesn't really do anything other than enable `zve32x` implicitly anyway.

> +        slliw           t2, a5, 3

Don't use narrow (32-bit) arithmetic instructions unless it is really necessary. AFAICT, the C compiler will sign-extend `a5` to XLEN bits, so you should not need to care.

But if you do really need 32- rather than XLEN-bit instructions, then you should gate the code: `#if (__riscv_xlen >= 64)`

> +        mulw            t1, a5, a4
> +        sh3add          a5, a4, t2
> +        slliw           a4, a4, 3
> +        subw            a5, t1, a5
> +        subw            a7, a4, t1
> +        addiw           a6, a5, 64
> +        subw            t0, t2, t1
> +        vsetivli        t3, 8, e8, m1, ta, mu
> +        beqz            t1, 2f
> +        blez            a3, 8f
> +        li              t4, 0
> +        li              t2, 0
> +        li              t5, 1
> +        addi            a5, t3, 1
> +        slli            t3, a2, 2
> +1:                                # if (xy != 0)
> +        add             a4, a1, t4
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        addiw           t2, t2, 4
> +        vle8.v          v10, (a4)
> +        add             a4, a4, a2
> +        vslide1down.vx  v11, v10, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v8, v10, a6
> +        vwmaccu.vx      v8, a7, v11

From a quick glance, the code seems to be using between a quarter and half of the vector bank, so it would be preferable to use exclusively even-numbered registers. Then we can double LMUL easily later if that turns out faster.

> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vle8.v          v12, (a4)
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        add             a4, a4, a2
> +        vwmaccu.vx      v8, t0, v12
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vslide1down.vx  v13, v12, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v10, v12, a6
> +        vwmaccu.vx      v8, t1, v13
> +        vwmaccu.vx      v10, a7, v13
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vle8.v          v14, (a4)
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        add             a4, a4, a2
> +        vwmaccu.vx      v10, t0, v14
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vslide1down.vx  v15, v14, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v12, v14, a6
> +        vwmaccu.vx      v10, t1, v15
> +        vwmaccu.vx      v12, a7, v15
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vle8.v          v14, (a4)
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        add             a4, a4, a2
> +        vwmaccu.vx      v12, t0, v14
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vslide1down.vx  v15, v14, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v16, v14, a6
> +        vwmaccu.vx      v12, t1, v15
> +        vwmaccu.vx      v16, a7, v15
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vle8.v          v14, (a4)
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        add             a4, a0, t4
> +        add             t4, t4, t3
> +        vwmaccu.vx      v16, t0, v14
> +        vsetvli         zero, a5, e8, m1, ta, ma
> +        vslide1down.vx  v14, v14, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vnclipu.wi      v15, v8, 6
> +        vwmaccu.vx      v16, t1, v14
> +  .ifc \type,avg
> +        vle8.v          v9, (a4)
> +        vaaddu.vv       v15, v15, v9
> +  .endif
> +        vse8.v          v15, (a4)
> +        add             a4, a4, a2
> +        vnclipu.wi      v8, v10, 6
> +  .ifc \type,avg
> +        vle8.v          v9, (a4)
> +        vaaddu.vv       v8, v8, v9
> +  .endif
> +        vse8.v          v8, (a4)
> +        add             a4, a4, a2
> +        vnclipu.wi      v8, v12, 6
> +  .ifc \type,avg
> +        vle8.v          v9, (a4)
> +        vaaddu.vv       v8, v8, v9
> +  .endif
> +        vse8.v          v8, (a4)
> +        add             a4, a4, a2
> +        vnclipu.wi      v8, v16, 6
> +  .ifc \type,avg
> +        vle8.v          v9, (a4)
> +        vaaddu.vv       v8, v8, v9
> +  .endif
> +        vse8.v          v8, (a4)
> +        blt             t2, a3, 1b
> +        j               8f
> +2:
> +        bnez            a4, 4f
> +        beqz            t2, 4f
> +        blez            a3, 8f
> +        li              a4, 0
> +        li              t1, 0
> +        slli            a7, a2, 2
> +3:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> +        add             a5, a1, a4
> +        vsetvli         zero, zero, e8, m1, ta, ma
> +        addiw           t1, t1, 4
> +        vle8.v          v8, (a5)
> +        add             a5, a5, a2
> +        add             t2, a5, a2
> +        vwmulu.vx       v10, v8, a6
> +        vle8.v          v8, (a5)
> +        vwmulu.vx       v12, v8, a6
> +        vle8.v          v9, (t2)
> +        add             t2, t2, a2
> +        add             a5, t2, a2
> +        vwmaccu.vx      v10, t0, v8
> +        vle8.v          v8, (t2)
> +        vle8.v          v14, (a5)
> +        add             a5, a0, a4
> +        add             a4, a4, a7
> +        vwmaccu.vx      v12, t0, v9
> +        vnclipu.wi      v15, v10, 6
> +        vwmulu.vx       v10, v9, a6
> +  .ifc \type,avg
> +        vle8.v          v16, (a5)
> +        vaaddu.vv       v15, v15, v16
> +  .endif
> +        vse8.v          v15, (a5)

The store depends on the previous add, which depends on the previous load. That may cause some pipeline delay, depending on the IP. You may want to reorder independent vector instructions a little.

> +        add             a5, a5, a2
> +        vnclipu.wi      v9, v12, 6
> +        vwmaccu.vx      v10, t0, v8
> +        vwmulu.vx       v12, v8, a6
> +  .ifc \type,avg
> +        vle8.v          v16, (a5)
> +        vaaddu.vv       v9, v9, v16
> +  .endif
> +        vse8.v          v9, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v10, 6
> +        vwmaccu.vx      v12, t0, v14
> +  .ifc \type,avg
> +        vle8.v          v16, (a5)
> +        vaaddu.vv       v8, v8, v16
> +  .endif
> +        vse8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v12, 6
> +  .ifc \type,avg
> +        vle8.v          v16, (a5)
> +        vaaddu.vv       v8, v8, v16
> +  .endif
> +        vse8.v          v8, (a5)
> +        blt             t1, a3, 3b
> +        j               8f
> +4:
> +        beqz            a4, 6f
> +        bnez            t2, 6f
> +        blez            a3, 8f
> +        li              a4, 0
> +        li              t2, 0
> +        addi            t0, t3, 1
> +        slli            t1, a2, 2
> +5:                               # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> +        add             a5, a1, a4
> +        vsetvli         zero, t0, e8, m1, ta, ma
> +        addiw           t2, t2, 4
> +        vle8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vslide1down.vx  v9, v8, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v10, v8, a6
> +        vwmaccu.vx      v10, a7, v9
> +        vsetvli         zero, t0, e8, m1, ta, ma
> +        vle8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vslide1down.vx  v9, v8, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v12, v8, a6
> +        vwmaccu.vx      v12, a7, v9
> +        vsetvli         zero, t0, e8, m1, ta, ma
> +        vle8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vslide1down.vx  v9, v8, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vwmulu.vx       v14, v8, a6
> +        vwmaccu.vx      v14, a7, v9
> +        vsetvli         zero, t0, e8, m1, ta, ma
> +        vle8.v          v8, (a5)
> +        add             a5, a0, a4
> +        add             a4, a4, t1
> +        vslide1down.vx  v9, v8, t5
> +        vsetivli        zero, 8, e8, m1, ta, ma
> +        vnclipu.wi      v16, v10, 6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v16, v16, v18
> +  .endif
> +        vse8.v          v16, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v10, v12, 6
> +        vwmulu.vx       v12, v8, a6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v10, v10, v18
> +  .endif
> +        vse8.v          v10, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v14, 6
> +        vwmaccu.vx      v12, a7, v9
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v12, 6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        blt             t2, a3, 5b
> +        j               8f
> +6:
> +        blez            a3, 8f
> +        li              a4, 0
> +        li              t2, 0
> +        slli            a7, a2, 2
> +7:                               # the final else, none of the above conditions are met
> +        add             t0, a1, a4
> +        vsetvli         zero, zero, e8, m1, ta, ma
> +        add             a5, a0, a4
> +        add             a4, a4, a7
> +        addiw           t2, t2, 4
> +        vle8.v          v8, (t0)
> +        add             t0, t0, a2
> +        add             t1, t0, a2
> +        vwmulu.vx       v10, v8, a6
> +        vle8.v          v8, (t0)
> +        add             t0, t1, a2
> +        vle8.v          v9, (t1)
> +        vle8.v          v12, (t0)
> +        vnclipu.wi      v13, v10, 6
> +        vwmulu.vx       v10, v8, a6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v13, v13, v18
> +  .endif
> +        vse8.v          v13, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v10, 6
> +        vwmulu.vx       v10, v9, a6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v10, 6
> +        vwmulu.vx       v10, v12, a6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        add             a5, a5, a2
> +        vnclipu.wi      v8, v10, 6
> +  .ifc \type,avg
> +        vle8.v          v18, (a5)
> +        vaaddu.vv       v8, v8, v18
> +  .endif
> +        vse8.v          v8, (a5)
> +        blt             t2, a3, 7b
> +8:
> +        ret
> +endfunc
> +.endm
> +
> +h264_chroma_mc8 put
> +h264_chroma_mc8 avg

-- 
Rémi Denis-Courmont
http://www.remlab.net/




* Re: [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
  2023-05-22 14:48 [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks Arnie Chang
  2023-05-22 16:42 ` Rémi Denis-Courmont
@ 2023-05-22 16:44 ` Lynne
  1 sibling, 0 replies; 3+ messages in thread
From: Lynne @ 2023-05-22 16:44 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

May 22, 2023, 16:48 by arnie.chang@sifive.com:

> Optimize the put and avg filtering for 8x8 chroma blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> V3:
> 1. Use a macro to extract repetitive segments
> 2. Fix coding style issues
> 3. Use macros in riscv/asm.S to handle function declarations
> 4. Replace vslidedown with vslide1down
> checkasm: using random seed 2379273251
> RVVi32:
>  - h264dsp.chroma_mc         [OK]
> checkasm: all 2 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 482.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 390.5
>

Pretty nice. You'd struggle to get this speedup with NEON,
though it's still only an FPGA.
The checkasm patch was merged in a better and more extensive form,
so you should check again that it passes, just in case.


>  libavcodec/h264chroma.c                   |   2 +
>  libavcodec/h264chroma.h                   |   1 +
>  libavcodec/riscv/Makefile                 |   2 +
>  libavcodec/riscv/h264_chroma_init_riscv.c |  40 +++
>  libavcodec/riscv/h264_mc_chroma.S         | 306 ++++++++++++++++++++++
>  libavcodec/riscv/h264_mc_chroma.h         |  30 +++
>  6 files changed, 381 insertions(+)
>  create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
>  create mode 100644 libavcodec/riscv/h264_mc_chroma.S
>  create mode 100644 libavcodec/riscv/h264_mc_chroma.h
>
> diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
> index 60b86b6fba..1eeab7bc40 100644
> --- a/libavcodec/h264chroma.c
> +++ b/libavcodec/h264chroma.c
> @@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
>  ff_h264chroma_init_mips(c, bit_depth);
>  #elif ARCH_LOONGARCH64
>  ff_h264chroma_init_loongarch(c, bit_depth);
> +#elif ARCH_RISCV
> +    ff_h264chroma_init_riscv(c, bit_depth);
>  #endif
>  }
> diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
> index b8f9c8f4fc..9c81c18a76 100644
> --- a/libavcodec/h264chroma.h
> +++ b/libavcodec/h264chroma.h
> @@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
>  void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
>  void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
>  void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
> +void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth);
>  
>  #endif /* AVCODEC_H264CHROMA_H */
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 965942f4df..ee17a521fd 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -10,6 +10,8 @@ OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_init.o \
>  RVV-OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_rvv.o
>  OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_init.o
>  RVV-OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_rvv.o
> +OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> +RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
>  OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
>  RVV-OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_rvv.o
>  OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
> new file mode 100644
> index 0000000000..2e47f1365e
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,40 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> +    int flags = av_get_cpu_flags();
> +
> +    if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +        c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> +        c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> +    }
> +#endif
> +}
> index 0000000000..027f2ee053
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
> +#define AVCODEC_RISCV_H264_MC_CHROMA_H
> +#include "config.h"
> +
> +#if HAVE_RVV
> +void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
>

You should remove the entire h264_mc_chroma.h file and instead just
put the function declarations at the top of libavcodec/riscv/h264_chroma_init_riscv.c;
that is how everything else does it.
With that change, the non-asm portion of the patch looks good.
No comment on the assembly.
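
A rough sketch of what the top of h264_chroma_init_riscv.c could look like with
the prototypes declared locally (signatures copied from the h264_mc_chroma.h
being dropped; the body of the init function stays as in the patch):

    #include <stddef.h>   /* ptrdiff_t */
    #include <stdint.h>

    #include "libavutil/attributes.h"
    #include "libavutil/cpu.h"
    #include "libavcodec/h264chroma.h"
    #include "config.h"

    void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src,
                                 ptrdiff_t stride, int h, int x, int y);
    void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src,
                                 ptrdiff_t stride, int h, int x, int y);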
