Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
From: uk7b@foxmail.com
To: ffmpeg-devel@ffmpeg.org
Cc: sunyuechi <sunyuechi@iscas.ac.cn>
Subject: [FFmpeg-devel] [PATCH 06/10] lavc/vp9dsp: R-V V mc tap h
Date: Sat,  4 May 2024 23:03:09 +0800
Message-ID: <tencent_78E27848A89869EE0A0B396C924750F5CB05@qq.com> (raw)
In-Reply-To: <20240504150313.2472910-1-uk7b@foxmail.com>

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp9_avg_8tap_smooth_4h_8bpp_c: 12.7
vp9_avg_8tap_smooth_4h_8bpp_rvv_i64: 5.0
vp9_avg_8tap_smooth_8h_8bpp_c: 48.5
vp9_avg_8tap_smooth_8h_8bpp_rvv_i64: 9.2
vp9_avg_8tap_smooth_16h_8bpp_c: 191.7
vp9_avg_8tap_smooth_16h_8bpp_rvv_i64: 21.0
vp9_avg_8tap_smooth_32h_8bpp_c: 780.0
vp9_avg_8tap_smooth_32h_8bpp_rvv_i64: 66.5
vp9_avg_8tap_smooth_64h_8bpp_c: 3123.7
vp9_avg_8tap_smooth_64h_8bpp_rvv_i64: 264.2
vp9_put_8tap_smooth_4h_8bpp_c: 11.0
vp9_put_8tap_smooth_4h_8bpp_rvv_i64: 4.2
vp9_put_8tap_smooth_8h_8bpp_c: 42.0
vp9_put_8tap_smooth_8h_8bpp_rvv_i64: 8.2
vp9_put_8tap_smooth_16h_8bpp_c: 165.5
vp9_put_8tap_smooth_16h_8bpp_rvv_i64: 19.7
vp9_put_8tap_smooth_32h_8bpp_c: 659.0
vp9_put_8tap_smooth_32h_8bpp_rvv_i64: 64.0
vp9_put_8tap_smooth_64h_8bpp_c: 2682.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i64: 272.2
---
 libavcodec/riscv/vp9_mc_rvv.S  | 233 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c |   8 +-
 2 files changed, 240 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index a97807633e..289c377a42 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -123,6 +123,231 @@ func ff_copy\len\()_rvv, zve32x
 endfunc
 .endr
 
+const subpel_filters_regular
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte  0,  1,  -5, 126,   8,  -3,  1,  0
+        .byte -1,  3, -10, 122,  18,  -6,  2,  0
+        .byte -1,  4, -13, 118,  27,  -9,  3, -1
+        .byte -1,  4, -16, 112,  37, -11,  4, -1
+        .byte -1,  5, -18, 105,  48, -14,  4, -1
+        .byte -1,  5, -19,  97,  58, -16,  5, -1
+        .byte -1,  6, -19,  88,  68, -18,  5, -1
+        .byte -1,  6, -19,  78,  78, -19,  6, -1
+        .byte -1,  5, -18,  68,  88, -19,  6, -1
+        .byte -1,  5, -16,  58,  97, -19,  5, -1
+        .byte -1,  4, -14,  48, 105, -18,  5, -1
+        .byte -1,  4, -11,  37, 112, -16,  4, -1
+        .byte -1,  3,  -9,  27, 118, -13,  4, -1
+        .byte  0,  2,  -6,  18, 122, -10,  3, -1
+        .byte  0,  1,  -3,   8, 126,  -5,  1,  0
+subpel_filters_sharp:
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte -1,  3,  -7, 127,   8,  -3,  1,  0
+        .byte -2,  5, -13, 125,  17,  -6,  3, -1
+        .byte -3,  7, -17, 121,  27, -10,  5, -2
+        .byte -4,  9, -20, 115,  37, -13,  6, -2
+        .byte -4, 10, -23, 108,  48, -16,  8, -3
+        .byte -4, 10, -24, 100,  59, -19,  9, -3
+        .byte -4, 11, -24,  90,  70, -21, 10, -4
+        .byte -4, 11, -23,  80,  80, -23, 11, -4
+        .byte -4, 10, -21,  70,  90, -24, 11, -4
+        .byte -3,  9, -19,  59, 100, -24, 10, -4
+        .byte -3,  8, -16,  48, 108, -23, 10, -4
+        .byte -2,  6, -13,  37, 115, -20,  9, -4
+        .byte -2,  5, -10,  27, 121, -17,  7, -3
+        .byte -1,  3,  -6,  17, 125, -13,  5, -2
+        .byte  0,  1,  -3,   8, 127,  -7,  3, -1
+subpel_filters_smooth:
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte -3, -1,  32,  64,  38,   1, -3,  0
+        .byte -2, -2,  29,  63,  41,   2, -3,  0
+        .byte -2, -2,  26,  63,  43,   4, -4,  0
+        .byte -2, -3,  24,  62,  46,   5, -4,  0
+        .byte -2, -3,  21,  60,  49,   7, -4,  0
+        .byte -1, -4,  18,  59,  51,   9, -4,  0
+        .byte -1, -4,  16,  57,  53,  12, -4, -1
+        .byte -1, -4,  14,  55,  55,  14, -4, -1
+        .byte -1, -4,  12,  53,  57,  16, -4, -1
+        .byte  0, -4,   9,  51,  59,  18, -4, -1
+        .byte  0, -4,   7,  49,  60,  21, -3, -2
+        .byte  0, -4,   5,  46,  62,  24, -3, -2
+        .byte  0, -4,   4,  43,  63,  26, -2, -2
+        .byte  0, -3,   2,  41,  63,  29, -2, -2
+        .byte  0, -3,   1,  38,  64,  32, -1, -3
+endconst
+
+.macro epel_filter name type regtype
+        lla             \regtype\()2, subpel_filters_\name
+        li              \regtype\()1, 8
+        mul             \regtype\()0, a5, \regtype\()1
+        add             \regtype\()0, \regtype\()0, \regtype\()2
+        .irp n 1,2,3,4,5,6
+        lb              \regtype\n, \n(\regtype\()0)
+        .endr
+.ifc \regtype,t
+        lb              a7, 7(\regtype\()0)
+.elseif \regtype == s
+        lb              s7, 7(\regtype\()0)
+.endif
+        lb              \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst len do name type from_mem regtype
+        li              a5, 64
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)
+        addi            a2, a2, -1
+        vle8.v          v20, (a2)
+        addi            a2, a2, 2
+        vle8.v          v24, (a2)
+        addi            a2, a2, 1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 1
+        vle8.v          v28, (a2)
+        addi            a2, a2, 1
+        vle8.v          v30, (a2)
+
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.elseif \regtype == s
+        vwmaccsu.vx     v16, s7, v30
+.endif
+
+        addi            a2, a2, -6
+        vle8.v          v28, (a2)
+        addi            a2, a2, -1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 3
+
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.elseif \regtype == s
+        vwmaccsu.vx     v16, s7, v14
+.endif
+
+.endif
+        vwadd.wx        v16, v16, a5
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+.ifc \len,4
+        vsetvli         zero, zero, e8, mf4, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e8, mf2, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e8, m1, ta, ma
+.else
+        vsetvli         zero, zero, e8, m2, ta, ma
+.endif
+
+.ifc \do,put
+        vnclipu.wi      \dst, v24, 0
+.elseif \do == avg
+        vle8.v          \dst, (a0)
+        vnclipu.wi      v24, v24, 0
+        vaaddu.vv       \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst len do name type from_mem regtype
+        epel_load       \dst \len \do \name \type \from_mem \regtype
+        add             a2, a2, a3
+.endm
+
+.macro epel len do name type
+        epel_filter \name \type t
+
+.ifc \len,4
+        vsetivli        zero, 4, e8, mf4, ta, ma
+.elseif \len == 8
+        vsetivli        zero, 8, e8, mf2, ta, ma
+.elseif \len == 16
+        vsetivli        zero, 16, e8, m1, ta, ma
+.else
+        li              a5, 32
+        vsetvli         zero, a5, e8, m2, ta, ma
+.endif
+.ifc \do,avg
+        csrwi           vxrm, 0
+.endif
+
+1:
+        addi            a4, a4, -1
+        epel_load       v30 \len \do \name \type 1 t
+        vse8.v          v30, (a0)
+.ifc \len,64
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_load       v30 \len \do \name \type 1 t
+        vse8.v          v30, (a0)
+        addi            a0, a0, -32
+        addi            a2, a2, -32
+.endif
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+.endm
+
+.macro gen_epel len do name type
+func ff_\do\()_8tap_\name\()_\len\()\type\()_rvv, zve32x
+        epel \len \do \name \type
+endfunc
+.endm
+
 .irp len 64, 32, 16, 8, 4
 func ff_avg\len\()_rvv, zve32x
         copy_avg \len avg
@@ -134,4 +359,12 @@ endfunc
 func ff_avg_bilin_\len\()h_rvv, zve32x
         bilin_h \len avg
 endfunc
+
+.irp name regular sharp smooth
+        .irp do put avg
+                .irp type h
+                        gen_epel \len \do \name \type
+                .endr
+        .endr
+.endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 248501f5d2..97f02e601d 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -109,7 +109,13 @@ static av_cold void vp9dsp_mc_init_rvv(VP9DSPContext *dsp, int bpp)
 
 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
     dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] =   \
-        ff_##type##_bilin_##sz##dir##_rvv;
+        ff_##type##_bilin_##sz##dir##_rvv;                   \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_smooth_##sz##dir##_rvv;             \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_regular_##sz##dir##_rvv;            \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_sharp_##sz##dir##_rvv;
 
 #define init_subpel2(idx, idxh, idxv, dir, type)      \
     init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
-- 
2.45.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

  parent reply	other threads:[~2024-05-04 15:04 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20240504150313.2472910-1-uk7b@foxmail.com>
2024-05-04 15:03 ` [FFmpeg-devel] [PATCH 02/10] lavc/vp9dsp: R-V V ipred hor uk7b
2024-05-04 15:03 ` [FFmpeg-devel] [PATCH 03/10] lavc/vp9dsp: R-V V ipred tm uk7b
2024-05-04 15:03 ` [FFmpeg-devel] [PATCH 04/10] lavc/vp9dsp: R-V mc copy_avg uk7b
2024-05-04 15:03 ` [FFmpeg-devel] [PATCH 05/10] lavc/vp9dsp: R-V V mc bilin h uk7b
2024-05-04 15:03 ` uk7b [this message]
2024-05-04 15:03 ` [FFmpeg-devel] [PATCH 07/10] lavc/vp9dsp: R-V V mc bilin v uk7b
2024-05-04 15:03 ` [FFmpeg-devel] [PATCH 08/10] lavc/vp9dsp: R-V V mc tap v uk7b
2024-05-04 15:03 ` [FFmpeg-devel] [PATCH 09/10] lavc/vp9dsp: R-V V mc bilin hv uk7b
2024-05-04 15:03 ` [FFmpeg-devel] [PATCH 10/10] lavc/vp9dsp: R-V V mc tap hv uk7b

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=tencent_78E27848A89869EE0A0B396C924750F5CB05@qq.com \
    --to=uk7b@foxmail.com \
    --cc=ffmpeg-devel@ffmpeg.org \
    --cc=sunyuechi@iscas.ac.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git