* [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
@ 2023-05-22 14:48 Arnie Chang
2023-05-22 16:42 ` Rémi Denis-Courmont
2023-05-22 16:44 ` Lynne
0 siblings, 2 replies; 3+ messages in thread
From: Arnie Chang @ 2023-05-22 14:48 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Arnie Chang
Optimize the put and avg filtering for 8x8 chroma blocks
Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
V3:
1. Use a macro to extract repetitive segments
2. Fix coding style issues
3. Use macros in riscv/asm.S to handle function declarations
4. Replace vslidedown with vslide1down
checkasm: using random seed 2379273251
RVVi32:
- h264dsp.chroma_mc [OK]
checkasm: all 2 tests passed
avg_h264_chroma_mc1_8_c: 1821.5
avg_h264_chroma_mc1_8_rvv_i32: 482.5
put_h264_chroma_mc1_8_c: 1436.5
put_h264_chroma_mc1_8_rvv_i32: 390.5
libavcodec/h264chroma.c | 2 +
libavcodec/h264chroma.h | 1 +
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/h264_chroma_init_riscv.c | 40 +++
libavcodec/riscv/h264_mc_chroma.S | 306 ++++++++++++++++++++++
libavcodec/riscv/h264_mc_chroma.h | 30 +++
6 files changed, 381 insertions(+)
create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
create mode 100644 libavcodec/riscv/h264_mc_chroma.S
create mode 100644 libavcodec/riscv/h264_mc_chroma.h
diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index 60b86b6fba..1eeab7bc40 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
ff_h264chroma_init_mips(c, bit_depth);
#elif ARCH_LOONGARCH64
ff_h264chroma_init_loongarch(c, bit_depth);
+#elif ARCH_RISCV
+ ff_h264chroma_init_riscv(c, bit_depth);
#endif
}
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index b8f9c8f4fc..9c81c18a76 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth);
#endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 965942f4df..ee17a521fd 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -10,6 +10,8 @@ OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_init.o \
RVV-OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_rvv.o
OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_init.o
RVV-OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_rvv.o
+OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
+RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
RVV-OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_rvv.o
OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
new file mode 100644
index 0000000000..2e47f1365e
--- /dev/null
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/h264chroma.h"
+#include "h264_mc_chroma.h"
+
+/**
+ * Install the RISC-V Vector chroma motion-compensation routines.
+ *
+ * Only entry 0 of the put/avg tables (the 8-pixel-wide variant) is
+ * overridden, and only for 8-bit content; every other entry keeps whatever
+ * the caller already stored in the context.
+ *
+ * @param c         context whose function tables are updated in place
+ * @param bit_depth sample bit depth; the vector routines handle 8 only
+ */
+av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ /*
+  * The assembly loads 9 bytes into a single LMUL=1 register and keeps
+  * 8 16-bit sums per register pair, so vectors must be at least
+  * 128 bits (16 bytes) long; shorter implementations would silently
+  * truncate the requested vector length.
+  */
+ if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
+ ff_get_rv_vlenb() >= 16) {
+ c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
+ c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+ }
+#endif
+}
+
diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
new file mode 100644
index 0000000000..1c373c8cc7
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.S
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavutil/riscv/asm.S"
+
+/*
+ * 8-pixel-wide H.264 chroma motion compensation, instantiated once as "put"
+ * (store the result) and once as "avg" (rounded average of the result and
+ * the bytes already present at the destination).
+ *
+ * Arguments, following the C prototypes:
+ *   a0 = dst, a1 = src, a2 = stride, a3 = h, a4 = x, a5 = y
+ *
+ * Scalar weights set up by the prologue:
+ *   a6 = 64 - 8x - 8y + xy   applied to src[i]
+ *   a7 = 8x - xy             applied to src[i + 1]
+ *   t0 = 8y - xy             applied to src[i + stride]
+ *   t1 = xy                  applied to src[i + stride + 1]
+ * Each output byte is the 16-bit weighted sum narrowed by a rounding right
+ * shift of 6.  Four specialised loops skip the terms whose weight is zero.
+ *
+ * Every loop iteration emits four rows and repeats while the row counter is
+ * below h, so h is presumably always a multiple of 4 -- confirm with callers.
+ *
+ * Rows that need the "+ 1" neighbour are loaded with vl = 9 in one LMUL=1
+ * register, hence the 128-bit minimum vector length.
+ */
+.macro h264_chroma_mc8 type
+func h264_\type\()_chroma_mc8_rvv, zvl128b
+ /*
+  * vnclipu and vaaddu round according to vxrm, whose value is unspecified
+  * on function entry.  Select round-to-nearest-up so the results are
+  * (sum + 32) >> 6 and (a + b + 1) >> 1.
+  */
+ csrwi vxrm, 0
+ /*
+  * XLEN-wide arithmetic: the int arguments arrive sign-extended, so no
+  * RV64-only "w" instructions are needed, and a plain shift/add pair
+  * avoids depending on the Zba extension.
+  */
+ slli t2, a5, 3
+ mul t1, a5, a4
+ slli a4, a4, 3
+ add a5, a4, t2
+ sub a5, t1, a5
+ sub a7, a4, t1
+ addi a6, a5, 64
+ sub t0, t2, t1
+ vsetivli t3, 8, e8, m1, ta, mu
+ beqz t1, 2f
+ blez a3, 8f
+ li t4, 0
+ li t2, 0
+ li t5, 1
+ addi a5, t3, 1
+ slli t3, a2, 2
+ /*
+  * General case: all four weights are used.  a5 = 9 is the load length,
+  * t3 = 4 * stride is the per-iteration offset step kept in t4.
+  * vslide1down shifts each 9-byte row down by one element to form the
+  * "+ 1" neighbours; the scalar in t5 only lands in element 8, which the
+  * vl = 8 arithmetic never reads.
+  */
+1: # if (xy != 0)
+ add a4, a1, t4
+ vsetvli zero, a5, e8, m1, ta, ma
+ addi t2, t2, 4
+ vle8.v v10, (a4)
+ add a4, a4, a2
+ vslide1down.vx v11, v10, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v8, v10, a6
+ vwmaccu.vx v8, a7, v11
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v12, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v8, t0, v12
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslide1down.vx v13, v12, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v10, v12, a6
+ vwmaccu.vx v8, t1, v13
+ vwmaccu.vx v10, a7, v13
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v10, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslide1down.vx v15, v14, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v12, v14, a6
+ vwmaccu.vx v10, t1, v15
+ vwmaccu.vx v12, a7, v15
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a4, a2
+ vwmaccu.vx v12, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslide1down.vx v15, v14, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v16, v14, a6
+ vwmaccu.vx v12, t1, v15
+ vwmaccu.vx v16, a7, v15
+ vsetvli zero, a5, e8, m1, ta, ma
+ vle8.v v14, (a4)
+ vsetivli zero, 8, e8, m1, ta, ma
+ add a4, a0, t4
+ add t4, t4, t3
+ vwmaccu.vx v16, t0, v14
+ vsetvli zero, a5, e8, m1, ta, ma
+ vslide1down.vx v14, v14, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vnclipu.wi v15, v8, 6
+ vwmaccu.vx v16, t1, v14
+ .ifc \type,avg
+ vle8.v v9, (a4)
+ vaaddu.vv v15, v15, v9
+ .endif
+ vse8.v v15, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v10, 6
+ .ifc \type,avg
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ .endif
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v12, 6
+ .ifc \type,avg
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ .endif
+ vse8.v v8, (a4)
+ add a4, a4, a2
+ vnclipu.wi v8, v16, 6
+ .ifc \type,avg
+ vle8.v v9, (a4)
+ vaaddu.vv v8, v8, v9
+ .endif
+ vse8.v v8, (a4)
+ blt t2, a3, 1b
+ j 8f
+ /* xy == 0 from here on: a4 holds 8x and t2 holds 8y. */
+2:
+ bnez a4, 4f
+ beqz t2, 4f
+ blez a3, 8f
+ li a4, 0
+ li t1, 0
+ slli a7, a2, 2
+ /*
+  * Vertical-only filter: each output row blends a row (weight a6) with
+  * the row below it (weight t0).  Five source rows feed four output rows.
+  * a4 is reused as the running byte offset, a7 as 4 * stride.
+  */
+3: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
+ add a5, a1, a4
+ vsetvli zero, zero, e8, m1, ta, ma
+ addi t1, t1, 4
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ add t2, a5, a2
+ vwmulu.vx v10, v8, a6
+ vle8.v v8, (a5)
+ vwmulu.vx v12, v8, a6
+ vle8.v v9, (t2)
+ add t2, t2, a2
+ add a5, t2, a2
+ vwmaccu.vx v10, t0, v8
+ vle8.v v8, (t2)
+ vle8.v v14, (a5)
+ add a5, a0, a4
+ add a4, a4, a7
+ vwmaccu.vx v12, t0, v9
+ vnclipu.wi v15, v10, 6
+ vwmulu.vx v10, v9, a6
+ .ifc \type,avg
+ vle8.v v16, (a5)
+ vaaddu.vv v15, v15, v16
+ .endif
+ vse8.v v15, (a5)
+ add a5, a5, a2
+ vnclipu.wi v9, v12, 6
+ vwmaccu.vx v10, t0, v8
+ vwmulu.vx v12, v8, a6
+ .ifc \type,avg
+ vle8.v v16, (a5)
+ vaaddu.vv v9, v9, v16
+ .endif
+ vse8.v v9, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vwmaccu.vx v12, t0, v14
+ .ifc \type,avg
+ vle8.v v16, (a5)
+ vaaddu.vv v8, v8, v16
+ .endif
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v12, 6
+ .ifc \type,avg
+ vle8.v v16, (a5)
+ vaaddu.vv v8, v8, v16
+ .endif
+ vse8.v v8, (a5)
+ blt t1, a3, 3b
+ j 8f
+4:
+ beqz a4, 6f
+ bnez t2, 6f
+ blez a3, 8f
+ li a4, 0
+ li t2, 0
+ addi t0, t3, 1
+ slli t1, a2, 2
+ /*
+  * Horizontal-only filter: each row blends a pixel (weight a6) with its
+  * right-hand neighbour (weight a7).  t0 = 9 is the load length and
+  * t1 = 4 * stride.  t5 is not initialised on this path; vslide1down
+  * only places it in element 8, which the vl = 8 arithmetic never reads.
+  */
+5: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
+ add a5, a1, a4
+ vsetvli zero, t0, e8, m1, ta, ma
+ addi t2, t2, 4
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ vslide1down.vx v9, v8, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v10, v8, a6
+ vwmaccu.vx v10, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ vslide1down.vx v9, v8, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v12, v8, a6
+ vwmaccu.vx v12, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a5)
+ add a5, a5, a2
+ vslide1down.vx v9, v8, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vwmulu.vx v14, v8, a6
+ vwmaccu.vx v14, a7, v9
+ vsetvli zero, t0, e8, m1, ta, ma
+ vle8.v v8, (a5)
+ add a5, a0, a4
+ add a4, a4, t1
+ vslide1down.vx v9, v8, t5
+ vsetivli zero, 8, e8, m1, ta, ma
+ vnclipu.wi v16, v10, 6
+ .ifc \type,avg
+ vle8.v v18, (a5)
+ vaaddu.vv v16, v16, v18
+ .endif
+ vse8.v v16, (a5)
+ add a5, a5, a2
+ vnclipu.wi v10, v12, 6
+ vwmulu.vx v12, v8, a6
+ .ifc \type,avg
+ vle8.v v18, (a5)
+ vaaddu.vv v10, v10, v18
+ .endif
+ vse8.v v10, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v14, 6
+ vwmaccu.vx v12, a7, v9
+ .ifc \type,avg
+ vle8.v v18, (a5)
+ vaaddu.vv v8, v8, v18
+ .endif
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v12, 6
+ .ifc \type,avg
+ vle8.v v18, (a5)
+ vaaddu.vv v8, v8, v18
+ .endif
+ vse8.v v8, (a5)
+ blt t2, a3, 5b
+ j 8f
+6:
+ blez a3, 8f
+ li a4, 0
+ li t2, 0
+ slli a7, a2, 2
+ /*
+  * Remaining case: only the a6 weight is applied, so each output row is
+  * its source row scaled by a6 and narrowed back by the shift of 6.
+  */
+7: # the final else, none of the above conditions are met
+ add t0, a1, a4
+ vsetvli zero, zero, e8, m1, ta, ma
+ add a5, a0, a4
+ add a4, a4, a7
+ addi t2, t2, 4
+ vle8.v v8, (t0)
+ add t0, t0, a2
+ add t1, t0, a2
+ vwmulu.vx v10, v8, a6
+ vle8.v v8, (t0)
+ add t0, t1, a2
+ vle8.v v9, (t1)
+ vle8.v v12, (t0)
+ vnclipu.wi v13, v10, 6
+ vwmulu.vx v10, v8, a6
+ .ifc \type,avg
+ vle8.v v18, (a5)
+ vaaddu.vv v13, v13, v18
+ .endif
+ vse8.v v13, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vwmulu.vx v10, v9, a6
+ .ifc \type,avg
+ vle8.v v18, (a5)
+ vaaddu.vv v8, v8, v18
+ .endif
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ vwmulu.vx v10, v12, a6
+ .ifc \type,avg
+ vle8.v v18, (a5)
+ vaaddu.vv v8, v8, v18
+ .endif
+ vse8.v v8, (a5)
+ add a5, a5, a2
+ vnclipu.wi v8, v10, 6
+ .ifc \type,avg
+ vle8.v v18, (a5)
+ vaaddu.vv v8, v8, v18
+ .endif
+ vse8.v v8, (a5)
+ blt t2, a3, 7b
+8:
+ ret
+endfunc
+.endm
+
+h264_chroma_mc8 put
+h264_chroma_mc8 avg
diff --git a/libavcodec/riscv/h264_mc_chroma.h b/libavcodec/riscv/h264_mc_chroma.h
new file mode 100644
index 0000000000..027f2ee053
--- /dev/null
+++ b/libavcodec/riscv/h264_mc_chroma.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
+#define AVCODEC_RISCV_H264_MC_CHROMA_H
+
+/* uint8_t and ptrdiff_t are used below; include them so the header is self-contained. */
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+
+#if HAVE_RVV
+/**
+ * 8-pixel-wide H.264 chroma motion compensation implemented in RISC-V
+ * Vector assembly.
+ *
+ * @param p_dst  destination rows, 8 bytes written per row
+ * @param p_src  source rows; up to 9 bytes per row and one row past the
+ *               last output row are read when x or y is non-zero
+ * @param stride byte distance between consecutive rows, used for both
+ *               source and destination
+ * @param h      number of rows; rows are produced four at a time
+ * @param x      horizontal interpolation weight (presumably 0..7 -- confirm
+ *               against the caller)
+ * @param y      vertical interpolation weight (presumably 0..7 -- confirm
+ *               against the caller)
+ *
+ * The "put" variant stores the interpolated bytes; the "avg" variant stores
+ * the rounded average of the interpolated bytes and the bytes already at
+ * p_dst.
+ */
+void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+#endif
+
+#endif /* AVCODEC_RISCV_H264_MC_CHROMA_H */
+
--
2.17.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
2023-05-22 14:48 [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks Arnie Chang
@ 2023-05-22 16:42 ` Rémi Denis-Courmont
2023-05-22 16:44 ` Lynne
1 sibling, 0 replies; 3+ messages in thread
From: Rémi Denis-Courmont @ 2023-05-22 16:42 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Arnie Chang
Le maanantaina 22. toukokuuta 2023, 17.48.40 EEST Arnie Chang a écrit :
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,40 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> + int flags = av_get_cpu_flags();
> +
> + if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> + c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> + c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> + }
You should check that the vector length is large enough:
`ff_get_rv_vlenb() >= 16`
> +#endif
> +}
> +
> diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
> new file mode 100644
> index 0000000000..1c373c8cc7
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -0,0 +1,306 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +#include "libavutil/riscv/asm.S"
> +
> +.macro h264_chroma_mc8 type
> +func h264_\type\()_chroma_mc8_rvv, zvl128b
That works, but `zve32x` would be more idiomatic and consistent with existing code. Selecting a vector length in the assembler doesn't really do anything other than enable `zve32x` implicitly anyway.
> + slliw t2, a5, 3
Don't use narrow (32-bit, `w`-suffixed) arithmetic/logic instructions unless it is really necessary. AFAICT, the C compiler will sign-extend `a5` to XLEN bits, so you should not need to care.
But if you do really need 32- rather than XLEN-bit instructions, then you should gate the code: `#if (__riscv_xlen >= 64)`
> + mulw t1, a5, a4
> + sh3add a5, a4, t2
> + slliw a4, a4, 3
> + subw a5, t1, a5
> + subw a7, a4, t1
> + addiw a6, a5, 64
> + subw t0, t2, t1
> + vsetivli t3, 8, e8, m1, ta, mu
> + beqz t1, 2f
> + blez a3, 8f
> + li t4, 0
> + li t2, 0
> + li t5, 1
> + addi a5, t3, 1
> + slli t3, a2, 2
> +1: # if (xy != 0)
> + add a4, a1, t4
> + vsetvli zero, a5, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v10, (a4)
> + add a4, a4, a2
> + vslide1down.vx v11, v10, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v8, v10, a6
> + vwmaccu.vx v8, a7, v11
From a quick glance, the code seems to be using between a quarter and half of the vector bank, so it would be preferable to use exclusively even-numbered registers. Then we can double LMUL easily later if that turns out faster.
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v12, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v8, t0, v12
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslide1down.vx v13, v12, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v12, a6
> + vwmaccu.vx v8, t1, v13
> + vwmaccu.vx v10, a7, v13
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v10, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslide1down.vx v15, v14, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v14, a6
> + vwmaccu.vx v10, t1, v15
> + vwmaccu.vx v12, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a4, a2
> + vwmaccu.vx v12, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslide1down.vx v15, v14, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v16, v14, a6
> + vwmaccu.vx v12, t1, v15
> + vwmaccu.vx v16, a7, v15
> + vsetvli zero, a5, e8, m1, ta, ma
> + vle8.v v14, (a4)
> + vsetivli zero, 8, e8, m1, ta, ma
> + add a4, a0, t4
> + add t4, t4, t3
> + vwmaccu.vx v16, t0, v14
> + vsetvli zero, a5, e8, m1, ta, ma
> + vslide1down.vx v14, v14, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v15, v8, 6
> + vwmaccu.vx v16, t1, v14
> + .ifc \type,avg
> + vle8.v v9, (a4)
> + vaaddu.vv v15, v15, v9
> + .endif
> + vse8.v v15, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v10, 6
> + .ifc \type,avg
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + .endif
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v12, 6
> + .ifc \type,avg
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + .endif
> + vse8.v v8, (a4)
> + add a4, a4, a2
> + vnclipu.wi v8, v16, 6
> + .ifc \type,avg
> + vle8.v v9, (a4)
> + vaaddu.vv v8, v8, v9
> + .endif
> + vse8.v v8, (a4)
> + blt t2, a3, 1b
> + j 8f
> +2:
> + bnez a4, 4f
> + beqz t2, 4f
> + blez a3, 8f
> + li a4, 0
> + li t1, 0
> + slli a7, a2, 2
> +3: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> + add a5, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + addiw t1, t1, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + add t2, a5, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (a5)
> + vwmulu.vx v12, v8, a6
> + vle8.v v9, (t2)
> + add t2, t2, a2
> + add a5, t2, a2
> + vwmaccu.vx v10, t0, v8
> + vle8.v v8, (t2)
> + vle8.v v14, (a5)
> + add a5, a0, a4
> + add a4, a4, a7
> + vwmaccu.vx v12, t0, v9
> + vnclipu.wi v15, v10, 6
> + vwmulu.vx v10, v9, a6
> + .ifc \type,avg
> + vle8.v v16, (a5)
> + vaaddu.vv v15, v15, v16
> + .endif
> + vse8.v v15, (a5)
The store depends on the previous add, which depends on the previous load. That might presumably cause some pipeline delay depending on the IP. You may want to reorder independent vector instructions a little bit.
> + add a5, a5, a2
> + vnclipu.wi v9, v12, 6
> + vwmaccu.vx v10, t0, v8
> + vwmulu.vx v12, v8, a6
> + .ifc \type,avg
> + vle8.v v16, (a5)
> + vaaddu.vv v9, v9, v16
> + .endif
> + vse8.v v9, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmaccu.vx v12, t0, v14
> + .ifc \type,avg
> + vle8.v v16, (a5)
> + vaaddu.vv v8, v8, v16
> + .endif
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v12, 6
> + .ifc \type,avg
> + vle8.v v16, (a5)
> + vaaddu.vv v8, v8, v16
> + .endif
> + vse8.v v8, (a5)
> + blt t1, a3, 3b
> + j 8f
> +4:
> + beqz a4, 6f
> + bnez t2, 6f
> + blez a3, 8f
> + li a4, 0
> + li t2, 0
> + addi t0, t3, 1
> + slli t1, a2, 2
> +5: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> + add a5, a1, a4
> + vsetvli zero, t0, e8, m1, ta, ma
> + addiw t2, t2, 4
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslide1down.vx v9, v8, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v10, v8, a6
> + vwmaccu.vx v10, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslide1down.vx v9, v8, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v12, v8, a6
> + vwmaccu.vx v12, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a5, a2
> + vslide1down.vx v9, v8, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vwmulu.vx v14, v8, a6
> + vwmaccu.vx v14, a7, v9
> + vsetvli zero, t0, e8, m1, ta, ma
> + vle8.v v8, (a5)
> + add a5, a0, a4
> + add a4, a4, t1
> + vslide1down.vx v9, v8, t5
> + vsetivli zero, 8, e8, m1, ta, ma
> + vnclipu.wi v16, v10, 6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v16, v16, v18
> + .endif
> + vse8.v v16, (a5)
> + add a5, a5, a2
> + vnclipu.wi v10, v12, 6
> + vwmulu.vx v12, v8, a6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v10, v10, v18
> + .endif
> + vse8.v v10, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v14, 6
> + vwmaccu.vx v12, a7, v9
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v12, 6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + blt t2, a3, 5b
> + j 8f
> +6:
> + blez a3, 8f
> + li a4, 0
> + li t2, 0
> + slli a7, a2, 2
> +7: # the final else, none of the above conditions are met
> + add t0, a1, a4
> + vsetvli zero, zero, e8, m1, ta, ma
> + add a5, a0, a4
> + add a4, a4, a7
> + addiw t2, t2, 4
> + vle8.v v8, (t0)
> + add t0, t0, a2
> + add t1, t0, a2
> + vwmulu.vx v10, v8, a6
> + vle8.v v8, (t0)
> + add t0, t1, a2
> + vle8.v v9, (t1)
> + vle8.v v12, (t0)
> + vnclipu.wi v13, v10, 6
> + vwmulu.vx v10, v8, a6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v13, v13, v18
> + .endif
> + vse8.v v13, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v9, a6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + vwmulu.vx v10, v12, a6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + add a5, a5, a2
> + vnclipu.wi v8, v10, 6
> + .ifc \type,avg
> + vle8.v v18, (a5)
> + vaaddu.vv v8, v8, v18
> + .endif
> + vse8.v v8, (a5)
> + blt t2, a3, 7b
> +8:
> + ret
> +endfunc
> +.endm
> +
> +h264_chroma_mc8 put
> +h264_chroma_mc8 avg
--
Rémi Denis-Courmont
http://www.remlab.net/
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks
2023-05-22 14:48 [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks Arnie Chang
2023-05-22 16:42 ` Rémi Denis-Courmont
@ 2023-05-22 16:44 ` Lynne
1 sibling, 0 replies; 3+ messages in thread
From: Lynne @ 2023-05-22 16:44 UTC (permalink / raw)
To: FFmpeg development discussions and patches
May 22, 2023, 16:48 by arnie.chang@sifive.com:
> Optimize the put and avg filtering for 8x8 chroma blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> V3:
> 1. Use a macro to extract repetitive segments
> 2. Fix coding style issues
> 3. Use macros in riscv/asm.S to handle function declarations
> 4. Replace vslidedown with vslide1down
> checkasm: using random seed 2379273251
> RVVi32:
> - h264dsp.chroma_mc [OK]
> checkasm: all 2 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 482.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 390.5
>
Pretty nice. You'd struggle to get this speedup with NEON.
Though, it's still only an FPGA.
The checkasm patch was merged in a better and more extensive form;
you should check again that it passes, just in case.
> libavcodec/h264chroma.c | 2 +
> libavcodec/h264chroma.h | 1 +
> libavcodec/riscv/Makefile | 2 +
> libavcodec/riscv/h264_chroma_init_riscv.c | 40 +++
> libavcodec/riscv/h264_mc_chroma.S | 306 ++++++++++++++++++++++
> libavcodec/riscv/h264_mc_chroma.h | 30 +++
> 6 files changed, 381 insertions(+)
> create mode 100644 libavcodec/riscv/h264_chroma_init_riscv.c
> create mode 100644 libavcodec/riscv/h264_mc_chroma.S
> create mode 100644 libavcodec/riscv/h264_mc_chroma.h
>
> diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
> index 60b86b6fba..1eeab7bc40 100644
> --- a/libavcodec/h264chroma.c
> +++ b/libavcodec/h264chroma.c
> @@ -58,5 +58,7 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
> ff_h264chroma_init_mips(c, bit_depth);
> #elif ARCH_LOONGARCH64
> ff_h264chroma_init_loongarch(c, bit_depth);
> +#elif ARCH_RISCV
> + ff_h264chroma_init_riscv(c, bit_depth);
> #endif
> }
> diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
> index b8f9c8f4fc..9c81c18a76 100644
> --- a/libavcodec/h264chroma.h
> +++ b/libavcodec/h264chroma.h
> @@ -37,5 +37,6 @@ void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
> void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
> void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
> void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
> +void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth);
>
> #endif /* AVCODEC_H264CHROMA_H */
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 965942f4df..ee17a521fd 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -10,6 +10,8 @@ OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_init.o \
> RVV-OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_rvv.o
> OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_init.o
> RVV-OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_rvv.o
> +OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
> +RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
> OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
> RVV-OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_rvv.o
> OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
> new file mode 100644
> index 0000000000..2e47f1365e
> --- /dev/null
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -0,0 +1,40 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavcodec/h264chroma.h"
> +#include "config.h"
> +#include "h264_mc_chroma.h"
> +
> +av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> +{
> +#if HAVE_RVV
> + int flags = av_get_cpu_flags();
> +
> + if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> + c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> + c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> + }
> +#endif
> +}
> index 0000000000..027f2ee053
> --- /dev/null
> +++ b/libavcodec/riscv/h264_mc_chroma.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright (c) 2023 SiFive, Inc. All rights reserved.
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_H264_MC_CHROMA_H
> +#define AVCODEC_RISCV_H264_MC_CHROMA_H
> +#include "config.h"
> +
> +#if HAVE_RVV
> +void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
> +void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
>
You should remove the entire h264_mc_chroma.h file, and instead just
put the function declarations at the top of libavcodec/riscv/h264_chroma_init_riscv.c
It's how everything else does this.
With that change, the non-asm portion of the patch looks good.
No comment on the assembly.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2023-05-22 16:44 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-22 14:48 [FFmpeg-devel] [PATCH v3] lavc/h264chroma: RISC-V V add motion compensation for 8x8 chroma blocks Arnie Chang
2023-05-22 16:42 ` Rémi Denis-Courmont
2023-05-22 16:44 ` Lynne
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git