* [FFmpeg-devel] [PATCH] rvv: add optimized h264 intra prediction functions (PR #20494)
From: tmatth via ffmpeg-devel @ 2025-09-11 17:27 UTC
To: ffmpeg-devel; +Cc: tmatth
PR #20494 opened by tmatth
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20494
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20494.patch
Benchmarked on SpaceMIT (VLEN = 256):
Reviewed-by: Nathan E. Egge <unlord@xiph.org>
C RVV256 (speedup)
pred8x8_horizontal_8: 36.4 32.6 (1.12x)
pred8x8_plane_8: 225.7 149.7 (1.51x)
pred16x16_dc_8: 70.2 58.3 (1.20x)
pred16x16_dc_128_8: 24.6 23.9 (1.03x)
pred16x16_horizontal_8: 90.3 56.4 (1.60x)
pred16x16_left_dc_8: 49.0 46.4 (1.06x)
pred16x16_plane_8: 746.9 232.2 (3.22x)
pred16x16_top_dc_8: 47.1 45.2 (1.04x)
pred16x16_vertical_8: 32.6 24.5 (1.33x)
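(For context: the two columns are the timings reported by checkasm's
benchmark mode for the C reference and the RVV implementation, and the value
in parentheses is the resulting C/RVV speedup. On a build with RVV enabled
they should be reproducible with something like
tests/checkasm/checkasm --bench --test=h264pred, though the exact invocation
may vary by checkasm version.)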
From c33acce5510cf27595e442b687a3433ae013ce18 Mon Sep 17 00:00:00 2001
From: Tristan Matthews <tmatth@videolan.org>
Date: Sun, 20 Jul 2025 12:13:00 +0000
Subject: [PATCH] rvv: add optimized h264 intra prediction functions
Benchmarked on SpaceMIT (VLEN = 256):
Reviewed-by: Nathan E. Egge <unlord@xiph.org>
C RVV256 (speedup)
pred8x8_horizontal_8: 36.4 32.6 (1.12x)
pred8x8_plane_8: 225.7 149.7 (1.51x)
pred16x16_dc_8: 70.2 58.3 (1.20x)
pred16x16_dc_128_8: 24.6 23.9 (1.03x)
pred16x16_horizontal_8: 90.3 56.4 (1.60x)
pred16x16_left_dc_8: 49.0 46.4 (1.06x)
pred16x16_plane_8: 746.9 232.2 (3.22x)
pred16x16_top_dc_8: 47.1 45.2 (1.04x)
pred16x16_vertical_8: 32.6 24.5 (1.33x)
---
libavcodec/h264pred.c | 2 +
libavcodec/h264pred.h | 3 +-
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/h264_intrapred_init.c | 82 ++++
libavcodec/riscv/h264_intrapred_rvv.S | 501 +++++++++++++++++++++++++
tests/checkasm/h264pred.c | 9 +-
6 files changed, 597 insertions(+), 2 deletions(-)
create mode 100644 libavcodec/riscv/h264_intrapred_init.c
create mode 100644 libavcodec/riscv/h264_intrapred_rvv.S
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index 25f9995a0b..f4ad02c326 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -598,5 +598,7 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
ff_h264_pred_init_mips(h, codec_id, bit_depth, chroma_format_idc);
#elif ARCH_LOONGARCH
ff_h264_pred_init_loongarch(h, codec_id, bit_depth, chroma_format_idc);
+#elif ARCH_RISCV
+ ff_h264_pred_init_riscv(h, codec_id, bit_depth, chroma_format_idc);
#endif
}
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index cb008548fc..8ac5088b34 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -126,5 +126,6 @@ void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
const int bit_depth, const int chroma_format_idc);
void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
const int bit_depth, const int chroma_format_idc);
-
+void ff_h264_pred_init_riscv(H264PredContext *h, int codec_id,
+ const int bit_depth, const int chroma_format_idc);
#endif /* AVCODEC_H264PRED_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 736f873fe8..096bae7f16 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -33,6 +33,8 @@ RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
riscv/h264idct_rvv.o
+OBJS-$(CONFIG_H264PRED) += riscv/h264_intrapred_init.o
+RVV-OBJS-$(CONFIG_H264PRED) += riscv/h264_intrapred_rvv.o
OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
diff --git a/libavcodec/riscv/h264_intrapred_init.c b/libavcodec/riscv/h264_intrapred_init.c
new file mode 100644
index 0000000000..e6af5cb375
--- /dev/null
+++ b/libavcodec/riscv/h264_intrapred_init.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2025 Tristan Matthews <tmatth@videolan.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/codec_id.h"
+#include "libavcodec/h264pred.h"
+
+#define PRED8x8(TYPE, DEPTH, SUFFIX) \
+void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## SUFFIX (uint8_t *src, \
+ ptrdiff_t stride);
+
+#define PRED16x16(TYPE, DEPTH, SUFFIX) \
+void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## SUFFIX (uint8_t *src, \
+ ptrdiff_t stride);
+
+/* 8-bit versions */
+PRED8x8(horizontal, 8, rvv)
+PRED8x8(horizontal, 8, rvv_vl256)
+PRED8x8(plane, 8, rvv)
+PRED16x16(horizontal, 8, rvv)
+PRED16x16(vertical, 8, rvv)
+PRED16x16(dc, 8, rvv)
+PRED16x16(128_dc, 8, rvv)
+PRED16x16(left_dc, 8, rvv)
+PRED16x16(top_dc, 8, rvv)
+PRED16x16(plane, 8, rvv)
+
+av_cold void ff_h264_pred_init_riscv(H264PredContext *h, int codec_id,
+ const int bit_depth,
+ const int chroma_format_idc)
+{
+#if HAVE_RVV
+ int cpu_flags = av_get_cpu_flags();
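+ /* ff_get_rv_vlenb() returns the vector register length in bytes, so
+ * multiplying by 8 gives VLEN in bits (used to gate the _vl256 variant). */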
+ const int vlen = 8 * ff_get_rv_vlenb();
+
+ if (!(cpu_flags & AV_CPU_FLAG_RVV_I32))
+ return;
+
+ if (bit_depth == 8) {
+ if (chroma_format_idc <= 1) {
+ h->pred8x8[HOR_PRED8x8] = ff_pred8x8_horizontal_8_rvv;
+ if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {
+ h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_8_rvv;
+ }
+ if (vlen >= 256) {
+ h->pred8x8[HOR_PRED8x8] = ff_pred8x8_horizontal_8_rvv_vl256;
+ }
+ }
+ h->pred16x16[HOR_PRED8x8] = ff_pred16x16_horizontal_8_rvv;
+ h->pred16x16[DC_PRED8x8] = ff_pred16x16_dc_8_rvv;
+ h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_8_rvv;
+ h->pred16x16[TOP_DC_PRED8x8] = ff_pred16x16_top_dc_8_rvv;
+ h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_rvv;
+ h->pred16x16[DC_128_PRED8x8] = ff_pred16x16_128_dc_8_rvv;
+ if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+ codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_8_rvv;
+ }
+ }
+#endif
+}
diff --git a/libavcodec/riscv/h264_intrapred_rvv.S b/libavcodec/riscv/h264_intrapred_rvv.S
new file mode 100644
index 0000000000..3999037833
--- /dev/null
+++ b/libavcodec/riscv/h264_intrapred_rvv.S
@@ -0,0 +1,501 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright © 2025 Tristan Matthews
+ * Partly based on h264_pred.c Copyright (c) 2023 SiFive, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/riscv/asm.S"
+
+func ff_pred8x8_horizontal_8_rvv_vl256, zve32x
+ lpad 0
+ # mf4 needs VLEN >= 256 to hold 8 elements; we only store 8 bytes per row, so the narrower mf4 grouping suffices
+ vsetivli zero, 8, e8, mf4, ta
+ j ff_pred8x8_horizontal_8_main
+endfunc
+
+func ff_pred8x8_horizontal_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 8, e8, mf2, ta
+ff_pred8x8_horizontal_8_main:
+ add t0, a1, a1
+ mv t1, a0
+ add t2, a0, a1
+
+ .rept 3
+ lb t3, -1(t1) # Load left pixel of row 0
+ lb t4, -1(t2) # Load left pixel of row 1 (doing 2 loads up front since they may stall)
+ vmv.v.x v0, t3
+ vmv.v.x v1, t4
+ vse8.v v0, (t1)
+ add t1, t1, t0
+ vse8.v v1, (t2)
+ add t2, t2, t0
+ .endr
+
+ lb t3, -1(t1)
+ lb t4, -1(t2)
+ vmv.v.x v0, t3
+ vmv.v.x v1, t4
+ vse8.v v0, (t1)
+ vse8.v v1, (t2)
+
+ ret
+endfunc
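+
+# Rough C equivalent of the two horizontal predictors above (sketch only):
+#     for (int y = 0; y < 8; y++)
+#         memset(src + y * stride, src[y * stride - 1], 8);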
+
+func ff_pred8x8_plane_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 4, e8, mf2, ta
+
+ vid.v v0
+ vmv.v.i v1, 3
+ vsub.vv v2, v1, v0 # v2 = {3, 2, 1, 0}, reversal index for vrgather
+
+ vsetivli zero, 4, e16, mf2, ta
+
+ vid.v v4
+ vadd.vi v6, v4, 1 # v6 = {1..4} = weight1
+
+ vsetivli zero, 4, e8, mf2, ta
+
+ sub t0, a0, a1
+ addi t1, t0, 4
+ vle8.v v8, (t1)
+
+ addi t2, t0, -1
+ vle8.v v9, (t2)
+
+ vrgather.vv v10, v9, v2
+
+ slli t3, a1, 2
+ addi t4, t3, -1
+ add t5, a0, t4
+
+ vlse8.v v11, (t5), a1
+ vlse8.v v12, (t2), a1
+ vrgather.vv v13, v12, v2
+
+ vsetivli zero, 4, e16, mf2, ta
+
+ vzext.vf2 v14, v8
+ vzext.vf2 v15, v10
+
+ vzext.vf2 v16, v11
+ vzext.vf2 v17, v13
+
+ vsub.vv v18, v14, v15
+ vmul.vv v19, v18, v6
+
+ vsub.vv v20, v16, v17
+ vmul.vv v21, v20, v6
+
+ vmv.v.x v22, zero
+ vwredsum.vs v22, v21, v22
+
+ vmv.v.x v23, zero
+ vwredsum.vs v23, v19, v23
+
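+ # The next four instructions compute t6 = b = (17*H + 16) >> 5, with H
+ # taken from the reduction result in v23 (t5 = c follows the same pattern from v22)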
+ vmv.x.s t6, v23
+ slli t1, t6, 4
+ add t6, t6, t1
+ addi t6, t6, 16
+ srai t6, t6, 5
+
+ vmv.x.s t5, v22
+ slli t1, t5, 4
+ add t5, t5, t1
+ addi t5, t5, 16
+ srai t5, t5, 5
+
+ add t2, t6, t5
+ slli t3, t2, 1
+ add t3, t3, t2
+
+ slli a2, a1, 3
+ sub a2, a2, a1
+ addi a2, a2, -1
+ add a2, a2, a0
+ lbu a3, (a2)
+
+ addi a2, a0, 7
+ sub a2, a2, a1
+ lbu a4, (a2)
+
+ add a5, a4, a3
+ addi a5, a5, 1
+ slli a5, a5, 4
+
+ sub a5, a5, t3 # a5 = 16*(left[7] + top[7] + 1) - 3*(b + c), the plane base value
+
+ vsetivli zero, 8, e16, mf2, ta
+ vid.v v7
+
+ vmv.v.x v18, t6
+ vmul.vv v18, v18, v7
+
+ .irp reg, 19, 20, 21, 22, 23, 24, 25, 26
+ vadd.vx v\reg, v18, a5
+ vmax.vx v\reg, v\reg, zero
+ add a5, a5, t5
+ .endr
+
+ li t2, 255
+
+ .irp reg, 19, 20, 21, 22, 23, 24, 25, 26
+ vsra.vi v\reg, v\reg, 5
+ vmin.vx v\reg, v\reg, t2 # clamp to unsigned 255
+ .endr
+
+ vsetivli zero, 8, e8, mf2, ta
+
+ .irp reg, 19, 20, 21, 22, 23, 24, 25, 26
+ vnclipu.wi v\reg, v\reg, 0
+ .endr
+
+ vse8.v v19, (a0)
+ add t1, a0, a1
+
+ .irp reg, 20, 21, 22, 23, 24, 25
+ vse8.v v\reg, (t1)
+ add t1, t1, a1
+ .endr
+
+ vse8.v v26, (t1)
+
+ ret
+endfunc
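+
+# The function above implements the standard H.264 chroma plane formula:
+#     b = (17*H + 16) >> 5, c = (17*V + 16) >> 5
+#     pred(x, y) = clip8((16*(left[7] + top[7] + 1) + (x-3)*b + (y-3)*c) >> 5)
+# where H and V are the weighted sums of differences across the top row and
+# left column computed with vwredsum above.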
+
+func ff_pred16x16_horizontal_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 16, e8, mf2, ta
+ add t0, a1, a1
+ mv t1, a0
+ add t2, a0, a1
+
+ .rept 7
+ lb t3, -1(t1) # Load left pixel of row 0
+ lb t4, -1(t2) # Load left pixel of row 1 (doing 2 loads up front since they may stall)
+ vmv.v.x v0, t3
+ vmv.v.x v1, t4
+ vse8.v v0, (t1)
+ add t1, t1, t0
+ vse8.v v1, (t2)
+ add t2, t2, t0
+ .endr
+
+ lb t3, -1(t1)
+ lb t4, -1(t2)
+ vmv.v.x v0, t3
+ vmv.v.x v1, t4
+ vse8.v v0, (t1)
+ vse8.v v1, (t2)
+
+ ret
+endfunc
+
+func ff_pred16x16_vertical_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 16, e8, mf2, ta
+
+ sub t0, a0, a1
+
+ vle8.v v0, (t0)
+
+ vse8.v v0, (a0)
+
+ add t1, a0, a1
+ vse8.v v0, (t1)
+
+ .rept 14
+ add t1, t1, a1
+ vse8.v v0, (t1)
+ .endr
+
+ ret
+endfunc
+
+func ff_pred16x16_128_dc_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 16, e8, mf2, ta
+ li t0, 128
+ vmv.v.x v0, t0
+
+ slli a2, a1, 1
+ vse8.v v0, (a0)
+
+ add t2, a0, a1
+ vse8.v v0, (t2)
+
+ add t1, a0, a2
+ add t2, t2, a2
+
+ .rept 6
+ vse8.v v0, (t1)
+ vse8.v v0, (t2)
+
+ add t1, t1, a2
+ add t2, t2, a2
+
+ .endr
+
+ vse8.v v0, (t1)
+ vse8.v v0, (t2)
+
+ ret
+endfunc
+
+func ff_pred16x16_dc_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 16, e8, mf2, ta, ma
+ csrw vxrm, 0
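+ # vxrm = 0 selects round-to-nearest-up, so the vssrl.vx below
+ # computes (sum(left) + sum(top) + 16) >> 5 in a single rounding shift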
+
+ addi t0, a0, -1
+ vlse8.v v0, (t0), a1
+
+ sub t1, a0, a1
+ vle8.v v1, (t1)
+
+ vsetivli zero, 8, e8, mf4, ta, ma
+ vmv.v.i v2, 0
+
+ vsetivli zero, 16, e8, mf2, ta, ma
+ vwredsumu.vs v2, v0, v2 # sum = sum(left)
+ vwredsumu.vs v2, v1, v2 # sum += sum(top)
+
+ li t2, 5
+ vsetivli zero, 8, e16, mf4, ta, ma
+ vssrl.vx v3, v2, t2
+
+ vsetivli zero, 16, e8, mf2, ta, ma
+ vrgather.vi v4, v3, 0
+
+ vse8.v v4, (a0)
+ add t1, a0, a1
+
+ .rept 15
+ vse8.v v4, (t1)
+ add t1, t1, a1
+ .endr
+
+ ret
+endfunc
+
+func ff_pred16x16_left_dc_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 16, e8, mf2, ta, ma
+
+ addi t2, a0, -1
+ vlse8.v v1, (t2), a1
+
+ vmv.v.i v2, 0
+ vwredsumu.vs v2, v1, v2
+
+ vsetivli zero, 8, e16, mf4, ta, ma
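+ # dc = (sum(left) + 8) >> 4, with the rounding added explicitly (no vxrm needed here)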
+ vadd.vi v2, v2, 8
+ vsrl.vi v3, v2, 4
+
+ vsetivli zero, 16, e8, mf2, ta, ma
+ vmv.x.s t3, v3
+ vmv.v.x v4, t3
+
+ slli a2, a1, 1
+ add t4, a0, a1
+
+ vse8.v v4, (a0)
+ vse8.v v4, (t4)
+ add t1, a0, a2
+ add t4, t4, a2
+
+ .rept 6
+ vse8.v v4, (t1)
+ vse8.v v4, (t4)
+ add t1, t1, a2
+ add t4, t4, a2
+ .endr
+
+ vse8.v v4, (t1)
+ vse8.v v4, (t4)
+
+ ret
+endfunc
+
+func ff_pred16x16_top_dc_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 16, e8, mf2, ta, ma
+ csrw vxrm, 0
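+ # vxrm = 0 selects round-to-nearest-up: the vssrl.vx below
+ # computes (sum(top) + 8) >> 4 in a single rounding shift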
+
+ sub t0, a0, a1
+ vle8.v v0, (t0)
+
+ vmv.v.i v1, 0
+
+ vsetivli zero, 16, e8, m1, ta, ma
+
+ vwredsumu.vs v1, v0, v1
+
+ li t1, 4
+ vsetivli zero, 8, e16, mf4, ta, ma
+ vssrl.vx v1, v1, t1
+
+ vsetivli zero, 16, e8, mf2, ta, ma
+ vrgather.vi v2, v1, 0
+
+ vse8.v v2, (a0)
+ slli a2, a1, 1
+ add t1, a0, a2
+ add t2, a0, a1
+ vse8.v v2, (t2)
+
+ add t2, t2, a2
+ vse8.v v2, (t1)
+ vse8.v v2, (t2)
+
+ .rept 6
+ add t1, t1, a2
+ add t2, t2, a2
+ vse8.v v2, (t1)
+ vse8.v v2, (t2)
+ .endr
+
+ ret
+endfunc
+
+func ff_pred16x16_plane_8_rvv, zve32x
+ lpad 0
+ vsetivli zero, 8, e8, mf2, ta
+
+ vid.v v0
+ vmv.v.i v1, 7
+ vsub.vv v2, v1, v0 # v2 = {7, 6, 5, 4, 3, 2, 1, 0}, reversal index for vrgather
+
+ sub t0, a0, a1
+ addi t1, t0, 8
+ addi t2, t0, -1
+ vle8.v v3, (t1)
+ vle8.v v4, (t2)
+ vrgather.vv v5, v4, v2
+
+ slli t3, a1, 3
+ add t3, a0, t3
+ addi t3, t3, -1
+
+ vlse8.v v6, (t3), a1
+ vlse8.v v7, (t2), a1
+ vrgather.vv v8, v7, v2
+
+ vsetivli zero, 8, e16, m1, ta
+
+ vzext.vf2 v9, v3
+ vzext.vf2 v10, v5
+
+ vzext.vf2 v11, v6
+ vzext.vf2 v12, v8
+
+ vsub.vv v13, v9, v10
+ vid.v v14
+ vadd.vi v14, v14, 1
+ vmul.vv v15, v13, v14
+
+ vsub.vv v16, v11, v12
+ vmul.vv v17, v16, v14
+
+ vmv.v.x v18, zero
+ vwredsum.vs v18, v17, v18
+
+ vmv.v.x v19, zero
+ vwredsum.vs v19, v15, v19
+
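+ # The next four instructions compute t1 = b = (5*H + 32) >> 6
+ # (t2 = c = (5*V + 32) >> 6 follows the same pattern below)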
+ vmv.x.s t4, v19
+ slli t1, t4, 2
+ add t1, t1, t4
+ addi t1, t1, 32
+ srai t1, t1, 6
+
+ vmv.x.s t5, v18
+ slli t2, t5, 2
+ add t2, t2, t5
+ addi t2, t2, 32
+ srai t2, t2, 6
+
+ add t3, t1, t2
+ slli t4, t3, 3
+ sub t4, t4, t3
+
+ slli t5, a1, 4
+ sub t5, t5, a1
+ addi t5, t5, -1
+
+ li t6, 15
+ sub t6, t6, a1
+ add t6, a0, t6
+ lbu a2, (t6)
+
+ add a3, a0, t5
+ lbu a4, (a3)
+
+ add a4, a4, a2
+ addi a4, a4, 1
+ slli a4, a4, 4
+
+ sub a5, a4, t4 # a5 = 16*(left[15] + top[15] + 1) - 7*(b + c), the plane base value
+
+ vsetivli zero, 16, e16, m1, ta
+ vid.v v20
+
+ vmv.v.x v21, t1
+ vmul.vv v22, v21, v20
+
+ mv t3, a0
+
+ .rept 2
+
+ vsetivli zero, 16, e16, m1, ta
+
+ .irp reg, 23, 24, 25, 26, 27, 28, 29, 30
+ vadd.vx v\reg, v22, a5
+ vmax.vx v\reg, v\reg, zero
+ add a5, a5, t2
+ .endr
+
+ li a6, 255
+
+ .irp reg, 23, 24, 25, 26, 27, 28, 29, 30
+ vsra.vi v\reg, v\reg, 5
+ vmin.vx v\reg, v\reg, a6
+ .endr
+
+ vsetivli zero, 16, e8, mf2, ta
+
+ .irp reg, 23, 24, 25, 26, 27, 28, 29, 30
+ vnclipu.wi v\reg, v\reg, 0
+ vse8.v v\reg, (t3)
+ add t3, t3, a1
+ .endr
+
+ .endr
+
+ ret
+endfunc
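+
+# As in the 8x8 case, this is the standard H.264 plane formula, with the
+# 16x16 luma scaling:
+#     b = (5*H + 32) >> 6, c = (5*V + 32) >> 6
+#     pred(x, y) = clip8((16*(left[15] + top[15] + 1) + (x-7)*b + (y-7)*c) >> 5)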
diff --git a/tests/checkasm/h264pred.c b/tests/checkasm/h264pred.c
index 53e1cdb219..7254854534 100644
--- a/tests/checkasm/h264pred.c
+++ b/tests/checkasm/h264pred.c
@@ -172,8 +172,15 @@ static void check_pred8x8(H264PredContext *h, uint8_t *buf0, uint8_t *buf1,
randomize_buffers();
call_ref(src0, 24*SIZEOF_PIXEL);
call_new(src1, 24*SIZEOF_PIXEL);
- if (memcmp(buf0, buf1, BUF_SIZE))
+ if (memcmp(buf0, buf1, BUF_SIZE)) {
+ fprintf(stderr, "REF:\n");
+ for (int i = 0; i < BUF_SIZE; i++)
+ fprintf(stderr, "%d\t", buf0[i]);
+ fprintf(stderr, "\nASM:\n");
+ for (int i = 0; i < BUF_SIZE; i++)
+ fprintf(stderr, "%d\t", buf1[i]);
fail();
+ }
bench_new(src1, 24*SIZEOF_PIXEL);
}
}
--
2.49.1
* [FFmpeg-devel] Re: [PATCH] rvv: add optimized h264 intra prediction functions (PR #20494)
From: Tristan Matthews via ffmpeg-devel @ 2025-09-11 18:17 UTC
To: FFmpeg development discussions and patches; +Cc: tmatth, Tristan Matthews
On Thu, Sep 11, 2025 at 1:29 PM tmatth via ffmpeg-devel
<ffmpeg-devel@ffmpeg.org> wrote:
>
> PR #20494 opened by tmatth
>
> +endfunc
> diff --git a/tests/checkasm/h264pred.c b/tests/checkasm/h264pred.c
> index 53e1cdb219..7254854534 100644
> --- a/tests/checkasm/h264pred.c
> +++ b/tests/checkasm/h264pred.c
> @@ -172,8 +172,15 @@ static void check_pred8x8(H264PredContext *h, uint8_t *buf0, uint8_t *buf1,
> randomize_buffers();
> call_ref(src0, 24*SIZEOF_PIXEL);
> call_new(src1, 24*SIZEOF_PIXEL);
> - if (memcmp(buf0, buf1, BUF_SIZE))
> + if (memcmp(buf0, buf1, BUF_SIZE)) {
> + fprintf(stderr, "REF:\n");
> + for (int i = 0; i < BUF_SIZE; i++)
> + fprintf(stderr, "%d\t", buf0[i]);
> + fprintf(stderr, "\nASM:\n");
> + for (int i = 0; i < BUF_SIZE; i++)
> + fprintf(stderr, "%d\t", buf1[i]);
> fail();
> + }
> bench_new(src1, 24*SIZEOF_PIXEL);
> }
> }
> --
> 2.49.1
>
This debug code wasn't supposed to be here (in fact, I didn't expect
this WIP/draft PR to get mailed out); please see the latest version at
https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20494
Best,
Tristan