Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
* [FFmpeg-devel] [PATCH] rvv: add optimized h264 intra prediction functions (PR #20494)
@ 2025-09-11 17:27 tmatth via ffmpeg-devel
  2025-09-11 18:17 ` [FFmpeg-devel] " Tristan Matthews via ffmpeg-devel
  0 siblings, 1 reply; 2+ messages in thread
From: tmatth via ffmpeg-devel @ 2025-09-11 17:27 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: tmatth

PR #20494 opened by tmatth
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20494
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20494.patch

Benchmarked on SpaceMIT hardware with VLEN 256

Reviewed-by: Nathan E. Egge <unlord@xiph.org>

                                C       RVV256
pred8x8_horizontal_8:           36.4    32.6    (1.12x)
pred8x8_plane_8:                225.7   149.7   (1.51x)
pred16x16_dc_8:                 70.2    58.3    (1.20x)
pred16x16_dc_128_8:             24.6    23.9    (1.03x)
pred16x16_horizontal_8:         90.3    56.4    (1.60x)
pred16x16_left_dc_8:            49.0    46.4    (1.06x)
pred16x16_plane_8:              746.9   232.2   (3.22x)
pred16x16_top_dc_8:             47.1    45.2    (1.04x)
pred16x16_vertical_8:           32.6    24.5    (1.33x)
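
For reference, below is a rough scalar sketch (not part of the patch) of the
16x16 plane prediction these routines vectorize. It follows the standard
H.264 plane formula (cf. pred16x16_plane() in libavcodec/h264pred_template.c);
the function name here is illustrative and av_clip_uint8() is the usual
libavutil 8-bit clamp:

#include <stddef.h>
#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

/* Illustrative sketch only, assuming the generic (non-SVQ3/RV40) variant. */
static void pred16x16_plane_sketch(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *top  = src - stride; /* 16 pixels above the block      */
    const uint8_t *left = src - 1;      /* 16 pixels to the left of it    */
    int H = 0, V = 0;

    /* Weighted sums of the horizontal/vertical gradients along the edges
     * (i = 8 reaches the top-left corner pixel in both sums). */
    for (int i = 1; i <= 8; i++) {
        H += i * (top[7 + i]             - top[7 - i]);
        V += i * (left[(7 + i) * stride] - left[(7 - i) * stride]);
    }

    int b = (5 * H + 32) >> 6;                  /* horizontal slope */
    int c = (5 * V + 32) >> 6;                  /* vertical slope   */
    int a = 16 * (top[15] + left[15 * stride]); /* plane offset     */

    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            src[y * stride + x] =
                av_clip_uint8((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}

The RVV version computes H and V with vrgather.vv reversals plus widening
reductions, then generates each row from a precomputed b*{0..15} ramp,
adding c per row before the shift and clamp.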


From c33acce5510cf27595e442b687a3433ae013ce18 Mon Sep 17 00:00:00 2001
From: Tristan Matthews <tmatth@videolan.org>
Date: Sun, 20 Jul 2025 12:13:00 +0000
Subject: [PATCH] rvv: add optimized h264 intra prediction functions

Benchmarked on SpaceMIT hardware with VLEN 256

Reviewed-by: Nathan E. Egge <unlord@xiph.org>

                                C       RVV256
pred8x8_horizontal_8:           36.4    32.6    (1.12x)
pred8x8_plane_8:                225.7   149.7   (1.51x)
pred16x16_dc_8:                 70.2    58.3    (1.20x)
pred16x16_dc_128_8:             24.6    23.9    (1.03x)
pred16x16_horizontal_8:         90.3    56.4    (1.60x)
pred16x16_left_dc_8:            49.0    46.4    (1.06x)
pred16x16_plane_8:              746.9   232.2   (3.22x)
pred16x16_top_dc_8:             47.1    45.2    (1.04x)
pred16x16_vertical_8:           32.6    24.5    (1.33x)
---
 libavcodec/h264pred.c                  |   2 +
 libavcodec/h264pred.h                  |   3 +-
 libavcodec/riscv/Makefile              |   2 +
 libavcodec/riscv/h264_intrapred_init.c |  82 ++++
 libavcodec/riscv/h264_intrapred_rvv.S  | 501 +++++++++++++++++++++++++
 tests/checkasm/h264pred.c              |   9 +-
 6 files changed, 597 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/riscv/h264_intrapred_init.c
 create mode 100644 libavcodec/riscv/h264_intrapred_rvv.S

diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index 25f9995a0b..f4ad02c326 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -598,5 +598,7 @@ av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id,
     ff_h264_pred_init_mips(h, codec_id, bit_depth, chroma_format_idc);
 #elif ARCH_LOONGARCH
     ff_h264_pred_init_loongarch(h, codec_id, bit_depth, chroma_format_idc);
+#elif ARCH_RISCV
+    ff_h264_pred_init_riscv(h, codec_id, bit_depth, chroma_format_idc);
 #endif
 }
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index cb008548fc..8ac5088b34 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -126,5 +126,6 @@ void ff_h264_pred_init_mips(H264PredContext *h, int codec_id,
                             const int bit_depth, const int chroma_format_idc);
 void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
                                  const int bit_depth, const int chroma_format_idc);
-
+void ff_h264_pred_init_riscv(H264PredContext *h, int codec_id,
+                             const int bit_depth, const int chroma_format_idc);
 #endif /* AVCODEC_H264PRED_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 736f873fe8..096bae7f16 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -33,6 +33,8 @@ RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
 OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
 RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
                               riscv/h264idct_rvv.o
+OBJS-$(CONFIG_H264PRED) += riscv/h264_intrapred_init.o
+RVV-OBJS-$(CONFIG_H264PRED)  += riscv/h264_intrapred_rvv.o
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
diff --git a/libavcodec/riscv/h264_intrapred_init.c b/libavcodec/riscv/h264_intrapred_init.c
new file mode 100644
index 0000000000..e6af5cb375
--- /dev/null
+++ b/libavcodec/riscv/h264_intrapred_init.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2025 Tristan Matthews <tmatth@videolan.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/codec_id.h"
+#include "libavcodec/h264pred.h"
+
+#define PRED8x8(TYPE, DEPTH, SUFFIX) \
+void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## SUFFIX (uint8_t *src, \
+                                                       ptrdiff_t stride);
+
+#define PRED16x16(TYPE, DEPTH, SUFFIX) \
+void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## SUFFIX (uint8_t *src, \
+                                                         ptrdiff_t stride);
+
+/* 8-bit versions */
+PRED8x8(horizontal, 8, rvv)
+PRED8x8(horizontal, 8, rvv_vl256)
+PRED8x8(plane, 8, rvv)
+PRED16x16(horizontal, 8, rvv)
+PRED16x16(vertical, 8, rvv)
+PRED16x16(dc, 8, rvv)
+PRED16x16(128_dc, 8, rvv)
+PRED16x16(left_dc, 8, rvv)
+PRED16x16(top_dc, 8, rvv)
+PRED16x16(plane, 8, rvv)
+
+av_cold void ff_h264_pred_init_riscv(H264PredContext *h, int codec_id,
+                                     const int bit_depth,
+                                     const int chroma_format_idc)
+{
+#if HAVE_RVV
+    int cpu_flags = av_get_cpu_flags();
+    const int vlen = 8 * ff_get_rv_vlenb();
+
+    if (!(cpu_flags & AV_CPU_FLAG_RVV_I32)) return;
+
+    if (bit_depth == 8) {
+        if (chroma_format_idc <= 1) {
+            h->pred8x8[HOR_PRED8x8] = ff_pred8x8_horizontal_8_rvv;
+            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {
+                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_8_rvv;
+            }
+            if (vlen >= 256) {
+                h->pred8x8[HOR_PRED8x8] = ff_pred8x8_horizontal_8_rvv_vl256;
+            }
+        }
+        h->pred16x16[HOR_PRED8x8] = ff_pred16x16_horizontal_8_rvv;
+        h->pred16x16[DC_PRED8x8] = ff_pred16x16_dc_8_rvv;
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_8_rvv;
+        h->pred16x16[TOP_DC_PRED8x8] = ff_pred16x16_top_dc_8_rvv;
+        h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_rvv;
+        h->pred16x16[DC_128_PRED8x8] = ff_pred16x16_128_dc_8_rvv;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {
+            h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_8_rvv;
+        }
+    }
+#endif
+}
diff --git a/libavcodec/riscv/h264_intrapred_rvv.S b/libavcodec/riscv/h264_intrapred_rvv.S
new file mode 100644
index 0000000000..3999037833
--- /dev/null
+++ b/libavcodec/riscv/h264_intrapred_rvv.S
@@ -0,0 +1,501 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright © 2025 Tristan Matthews
+ * Partly based on h264_pred.c Copyright (c) 2023 SiFive, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/riscv/asm.S"
+
+func ff_pred8x8_horizontal_8_rvv_vl256, zve32x
+        lpad    0
+        # mf4 only for VLEN >= 256, we only need to load 8 elements not all 32 elements so use mf4
+        vsetivli zero, 8, e8, mf4, ta
+        j ff_pred8x8_horizontal_8_main
+endfunc
+
+func ff_pred8x8_horizontal_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 8, e8, mf2, ta
+ff_pred8x8_horizontal_8_main:
+        add     t0, a1, a1
+        mv      t1, a0
+        add     t2, a0, a1
+
+        .rept 3
+        lb      t3, -1(t1)             # Load left pixel of row 0
+        lb      t4, -1(t2)             # Load left pixel of row 1 (doing 2 loads up front since they may stall)
+        vmv.v.x v0, t3
+        vmv.v.x v1, t4
+        vse8.v  v0, (t1)
+        add     t1, t1, t0
+        vse8.v  v1, (t2)
+        add     t2, t2, t0
+        .endr
+
+        lb      t3, -1(t1)
+        lb      t4, -1(t2)
+        vmv.v.x v0, t3
+        vmv.v.x v1, t4
+        vse8.v  v0, (t1)
+        vse8.v  v1, (t2)
+
+        ret
+endfunc
+
+func ff_pred8x8_plane_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 4, e8, mf2, ta
+
+        vid.v   v0
+        vmv.v.i v1, 3
+        vsub.vv v2, v1, v0                   # v2 = {3, 2, 1, 0} aka index
+
+        vsetivli zero, 4, e16, mf2, ta
+
+        vid.v   v4
+        vadd.vi v6, v4, 1                    # v6 = {1..4} = weight1
+
+        vsetivli zero, 4, e8, mf2, ta
+
+        sub     t0, a0, a1
+        addi    t1, t0, 4
+        vle8.v  v8, (t1)
+
+        addi    t2, t0, -1
+        vle8.v  v9, (t2)
+
+        vrgather.vv v10, v9, v2
+
+        slli    t3, a1, 2
+        addi    t4, t3, -1
+        add     t5, a0, t4
+
+        vlse8.v v11, (t5), a1
+        vlse8.v v12, (t2), a1
+        vrgather.vv v13, v12, v2
+
+        vsetivli zero, 4, e16, mf2, ta
+
+        vzext.vf2 v14, v8
+        vzext.vf2 v15, v10
+
+        vzext.vf2 v16, v11
+        vzext.vf2 v17, v13
+
+        vsub.vv v18, v14, v15
+        vmul.vv v19, v18, v6
+
+        vsub.vv v20, v16, v17
+        vmul.vv v21, v20, v6
+
+        vmv.v.x v22, zero
+        vwredsum.vs v22, v21, v22
+
+        vmv.v.x v23, zero
+        vwredsum.vs v23, v19, v23
+
+        vmv.x.s t6, v23
+        slli t1, t6, 4
+        add  t6, t6, t1
+        addi t6, t6, 16
+        srai t6, t6, 5
+
+        vmv.x.s t5, v22
+        slli t1, t5, 4
+        add  t5, t5, t1
+        addi t5, t5, 16
+        srai t5, t5, 5
+
+        add t2, t6, t5
+        slli t3, t2, 1
+        add t3, t3, t2
+
+        slli a2, a1, 3
+        sub a2, a2, a1
+        addi a2, a2, -1
+        add a2, a2, a0
+        lbu  a3, (a2)
+
+        addi a2, a0, 7
+        sub a2, a2, a1
+        lbu a4, (a2)
+
+        add a5, a4, a3
+        addi a5, a5, 1
+        slli a5, a5, 4
+
+        sub a5, a5, t3                       # linear combination of H, V and src
+
+        vsetivli zero, 8, e16, mf2, ta
+        vid.v   v7
+
+        vmv.v.x v18, t6
+        vmul.vv v18, v18, v7
+
+        .irp reg 19, 20, 21, 22, 23, 24, 25, 26
+        vadd.vx v\reg, v18, a5
+        vmax.vx v\reg, v\reg, zero
+        add a5, a5, t5
+        .endr
+
+        li         t2, 255
+
+        .irp reg 19, 20, 21, 22, 23, 24, 25, 26
+        vsra.vi    v\reg, v\reg, 5
+        vmin.vx    v\reg, v\reg, t2          # clamp to unsigned 255
+        .endr
+
+        vsetivli zero, 8, e8, mf2, ta
+
+        .irp reg 19, 20, 21, 22, 23, 24, 25, 26
+        vnclipu.wi v\reg, v\reg, 0
+        .endr
+
+        vse8.v  v19, (a0)
+        add t1, a0, a1
+
+        .irp reg 20, 21, 22, 23, 24, 25
+        vse8.v  v\reg, (t1)
+        add t1, t1, a1
+        .endr
+
+        vse8.v  v26, (t1)
+
+        ret
+endfunc
+
+func ff_pred16x16_horizontal_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 16, e8, mf2, ta
+        add     t0, a1, a1
+        mv      t1, a0
+        add     t2, a0, a1
+
+        .rept 7
+        lb      t3, -1(t1)             # Load left pixels of rows 0,1 (doing 2 loads up front since they may stall)
+        lb      t4, -1(t2)
+        vmv.v.x v0, t3
+        vmv.v.x v1, t4
+        vse8.v  v0, (t1)
+        add     t1, t1, t0
+        vse8.v  v1, (t2)
+        add     t2, t2, t0
+        .endr
+
+        lb      t3, -1(t1)
+        lb      t4, -1(t2)
+        vmv.v.x v0, t3
+        vmv.v.x v1, t4
+        vse8.v  v0, (t1)
+        vse8.v  v1, (t2)
+
+        ret
+endfunc
+
+func ff_pred16x16_vertical_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 16, e8, mf2, ta
+
+        sub     t0, a0, a1
+
+        vle8.v  v0, (t0)
+
+        vse8.v  v0, (a0)
+
+        add t1, a0, a1
+        vse8.v  v0, (t1)
+
+        .rept 14
+        add t1, t1, a1
+        vse8.v  v0, (t1)
+        .endr
+
+        ret
+endfunc
+
+func ff_pred16x16_128_dc_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 16, e8, mf2, ta
+        li t0, 128
+        vmv.v.x v0, t0
+
+        slli a2, a1, 1
+        vse8.v  v0, (a0)
+
+        add t2, a0, a1
+        vse8.v  v0, (t2)
+
+        add t1, a0, a2
+        add t2, t2, a2
+
+        .rept 6
+        vse8.v  v0, (t1)
+        vse8.v  v0, (t2)
+
+        add t1, t1, a2
+        add t2, t2, a2
+
+        .endr
+
+        vse8.v  v0, (t1)
+        vse8.v  v0, (t2)
+
+        ret
+endfunc
+
+func ff_pred16x16_dc_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 16, e8, mf2, ta, ma
+        csrw vxrm, 0
+
+        addi t0, a0, -1
+        vlse8.v v0, (t0), a1
+
+        sub     t1, a0, a1
+        vle8.v  v1, (t1)
+
+        vsetivli zero, 8, e8, mf4, ta, ma
+        vmv.v.i v2, 0
+
+        vsetivli zero, 16, e8, mf2, ta, ma
+        vwredsumu.vs v2, v0, v2 # sum = sum(left)
+        vwredsumu.vs v2, v1, v2 # sum += sum(top)
+
+        li t2, 5
+        vsetivli        zero, 8, e16, mf4, ta, ma
+        vssrl.vx        v3, v2, t2
+
+        vsetivli        zero, 16, e8, mf2, ta, ma
+        vrgather.vi     v4, v3, 0
+
+        vse8.v v4, (a0)
+        add t1, a0, a1
+
+        .rept 15
+        vse8.v v4, (t1)
+        add t1, t1, a1
+        .endr
+
+        ret
+endfunc
+
+
+func ff_pred16x16_left_dc_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 16, e8, mf2, ta, ma
+
+        addi t2, a0, -1
+        vlse8.v v1, (t2), a1
+
+        vmv.v.i v2, 0
+        vwredsumu.vs v2, v1, v2
+
+        vsetivli zero, 8, e16, mf4, ta, ma
+        vadd.vi v2, v2, 8
+        vsrl.vi v3, v2, 4
+
+        vsetivli zero, 16, e8, mf2, ta, ma
+        vmv.x.s t3, v3
+        vmv.v.x v4, t3
+
+        slli a2, a1, 1
+        add t4, a0, a1
+
+        vse8.v v4, (a0)
+        vse8.v v4, (t4)
+        add t1, a0, a2
+        add t4, t4, a2
+
+        .rept 6
+        vse8.v v4, (t1)
+        vse8.v v4, (t4)
+        add t1, t1, a2
+        add t4, t4, a2
+        .endr
+
+        vse8.v v4, (t1)
+        vse8.v v4, (t4)
+
+        ret
+endfunc
+
+func ff_pred16x16_top_dc_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 16, e8, mf2, ta, ma
+        csrw vxrm, 0
+
+        sub     t0, a0, a1
+        vle8.v  v0, (t0)
+
+        vmv.v.i v1, 0
+
+        vsetivli zero, 16, e8, m1, ta, ma
+
+        vwredsumu.vs v1, v0, v1
+
+        li t1, 4
+        vsetivli        zero, 8, e16, mf4, ta, ma
+        vssrl.vx        v1, v1, t1
+
+        vsetivli        zero, 16, e8, mf2, ta, ma
+        vrgather.vi     v2, v1, 0
+
+        vse8.v v2, (a0)
+        slli a2, a1, 1
+        add t1, a0, a2
+        add t2, a0, a1
+        vse8.v v2, (t2)
+
+        add t2, t2, a2
+        vse8.v v2, (t1)
+        vse8.v v2, (t2)
+
+        .rept 6
+        add t1, t1, a2
+        add t2, t2, a2
+        vse8.v v2, (t1)
+        vse8.v v2, (t2)
+        .endr
+
+        ret
+endfunc
+
+func ff_pred16x16_plane_8_rvv, zve32x
+        lpad    0
+        vsetivli zero, 8, e8, mf2, ta
+
+        vid.v   v0
+        vmv.v.i v1, 7
+        vsub.vv v2, v1, v0                   # v2 = {7, 6, 5, 4, 3, 2, 1, 0} aka index
+
+        sub     t0, a0, a1
+        addi    t1, t0, 8
+        addi    t2, t0, -1
+        vle8.v  v3, (t1)
+        vle8.v  v4, (t2)
+        vrgather.vv v5, v4, v2
+
+        slli    t3, a1, 3
+        add     t3, a0, t3
+        addi    t3, t3, -1
+
+        vlse8.v v6, (t3), a1
+        vlse8.v v7, (t2), a1
+        vrgather.vv v8, v7, v2
+
+        vsetivli zero, 8, e16, m1, ta
+
+        vzext.vf2 v9, v3
+        vzext.vf2 v10, v5
+
+        vzext.vf2 v11, v6
+        vzext.vf2 v12, v8
+
+        vsub.vv v13, v9, v10
+        vid.v   v14
+        vadd.vi v14, v14, 1
+        vmul.vv v15, v13, v14
+
+        vsub.vv v16, v11, v12
+        vmul.vv v17, v16, v14
+
+        vmv.v.x v18, zero
+        vwredsum.vs v18, v17, v18
+
+        vmv.v.x v19, zero
+        vwredsum.vs v19, v15, v19
+
+        vmv.x.s t4, v19
+        slli t1, t4, 2
+        add t1, t1, t4
+        addi t1, t1, 32
+        srai t1, t1, 6
+
+        vmv.x.s t5, v18
+        slli t2, t5, 2
+        add t2, t2, t5
+        addi t2, t2, 32
+        srai t2, t2, 6
+
+        add t3, t1, t2
+        slli t4, t3, 3
+        sub t4, t4, t3
+
+        slli t5, a1, 4
+        sub t5, t5, a1
+        addi t5, t5, -1
+
+        li t6, 15
+        sub t6, t6, a1
+        add t6, a0, t6
+        lbu a2, (t6)
+
+        add a3, a0, t5
+        lbu a4, (a3)
+
+        add a4, a4, a2
+        addi a4, a4, 1
+        slli a4, a4, 4
+
+        sub a5, a4, t4                       # a5 = linear combination of H, V and src
+
+        vsetivli zero, 16, e16, m1, ta
+        vid.v v20
+
+        vmv.v.x v21, t1
+        vmul.vv v22, v21, v20
+
+        mv t3, a0
+
+        .rept 2
+
+        vsetivli zero, 16, e16, m1, ta
+
+        .irp reg 23, 24, 25, 26, 27, 28, 29, 30
+        vadd.vx v\reg, v22, a5
+        vmax.vx v\reg, v\reg, zero
+        add a5, a5, t2
+        .endr
+
+        li a6, 255
+
+        .irp reg, 23, 24, 25, 26, 27, 28, 29, 30
+        vsra.vi v\reg, v\reg, 5
+        vmin.vx v\reg, v\reg, a6
+        .endr
+
+        vsetivli zero, 16, e8, mf2, ta
+
+        .irp reg, 23, 24, 25, 26, 27, 28, 29, 30
+        vnclipu.wi v\reg, v\reg, 0
+        vse8.v  v\reg, (t3)
+        add t3, t3, a1
+        .endr
+
+        .endr
+
+        ret
+endfunc
diff --git a/tests/checkasm/h264pred.c b/tests/checkasm/h264pred.c
index 53e1cdb219..7254854534 100644
--- a/tests/checkasm/h264pred.c
+++ b/tests/checkasm/h264pred.c
@@ -172,8 +172,15 @@ static void check_pred8x8(H264PredContext *h, uint8_t *buf0, uint8_t *buf1,
             randomize_buffers();
             call_ref(src0, 24*SIZEOF_PIXEL);
             call_new(src1, 24*SIZEOF_PIXEL);
-            if (memcmp(buf0, buf1, BUF_SIZE))
+            if (memcmp(buf0, buf1, BUF_SIZE)) {
+                fprintf(stderr, "REF:\n");
+                for (int i = 0; i < BUF_SIZE; i++)
+                    fprintf(stderr, "%d\t", buf0[i]);
+                fprintf(stderr, "\nASM:\n");
+                for (int i = 0; i < BUF_SIZE; i++)
+                    fprintf(stderr, "%d\t", buf1[i]);
                 fail();
+            }
             bench_new(src1, 24*SIZEOF_PIXEL);
         }
     }
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

* [FFmpeg-devel] Re: [PATCH] rvv: add optimized h264 intra prediction functions (PR #20494)
  2025-09-11 17:27 [FFmpeg-devel] [PATCH] rvv: add optimized h264 intra prediction functions (PR #20494) tmatth via ffmpeg-devel
@ 2025-09-11 18:17 ` Tristan Matthews via ffmpeg-devel
  0 siblings, 0 replies; 2+ messages in thread
From: Tristan Matthews via ffmpeg-devel @ 2025-09-11 18:17 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: tmatth, Tristan Matthews

On Thu, Sep 11, 2025 at 1:29 PM tmatth via ffmpeg-devel
<ffmpeg-devel@ffmpeg.org> wrote:
>
> PR #20494 opened by tmatth
>
> +endfunc
> diff --git a/tests/checkasm/h264pred.c b/tests/checkasm/h264pred.c
> index 53e1cdb219..7254854534 100644
> --- a/tests/checkasm/h264pred.c
> +++ b/tests/checkasm/h264pred.c
> @@ -172,8 +172,15 @@ static void check_pred8x8(H264PredContext *h, uint8_t *buf0, uint8_t *buf1,
>              randomize_buffers();
>              call_ref(src0, 24*SIZEOF_PIXEL);
>              call_new(src1, 24*SIZEOF_PIXEL);
> -            if (memcmp(buf0, buf1, BUF_SIZE))
> +            if (memcmp(buf0, buf1, BUF_SIZE)) {
> +                fprintf(stderr, "REF:\n");
> +                for (int i = 0; i < BUF_SIZE; i++)
> +                    fprintf(stderr, "%d\t", buf0[i]);
> +                fprintf(stderr, "\nASM:\n");
> +                for (int i = 0; i < BUF_SIZE; i++)
> +                    fprintf(stderr, "%d\t", buf1[i]);
>                  fail();
> +            }
>              bench_new(src1, 24*SIZEOF_PIXEL);
>          }
>      }
> --
> 2.49.1
>

This debug code wasn't supposed to be here (in fact, I didn't expect
this WIP/draft PR to get mailed out); please see the latest version at
https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20494

Best,
Tristan
_______________________________________________
ffmpeg-devel mailing list -- ffmpeg-devel@ffmpeg.org
To unsubscribe send an email to ffmpeg-devel-leave@ffmpeg.org

