From: Arnie Chang <arnie.chang-at-sifive.com@ffmpeg.org>
To: ffmpeg-devel@ffmpeg.org
Subject: Re: [FFmpeg-devel] [PATCH v2] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks
Date: Tue, 25 Jul 2023 11:37:14 +0800
Message-ID: <CAPUBFYGAaKC3XitW1jVcjexCkdP3G-xJm-BysqRFsevMzRy1FA@mail.gmail.com> (raw)
In-Reply-To: <20230619130609.15547-1-arnie.chang@sifive.com>
It appears that all the issues raised during the review have been fixed,
and there have been no additional comments for over 1 month.
Could I kindly request assistance in pushing the patch?
On Mon, Jun 19, 2023 at 9:06 PM Arnie Chang <arnie.chang@sifive.com> wrote:
> Optimize the put and avg filtering for 4xH and 2xH blocks
>
> Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
> ---
> V2:
> 1. Change the \width to a run-time argument
> 2. Call to an internal function instead of instantiating similar code
> three times
>
> RVVi32:
> - h264chroma.chroma_mc [OK]
> checkasm: all 6 tests passed
> avg_h264_chroma_mc1_8_c: 1821.5
> avg_h264_chroma_mc1_8_rvv_i32: 466.5
> avg_h264_chroma_mc2_8_c: 939.2
> avg_h264_chroma_mc2_8_rvv_i32: 466.5
> avg_h264_chroma_mc4_8_c: 502.2
> avg_h264_chroma_mc4_8_rvv_i32: 466.5
> put_h264_chroma_mc1_8_c: 1436.5
> put_h264_chroma_mc1_8_rvv_i32: 382.5
> put_h264_chroma_mc2_8_c: 824.2
> put_h264_chroma_mc2_8_rvv_i32: 382.5
> put_h264_chroma_mc4_8_c: 431.2
> put_h264_chroma_mc4_8_rvv_i32: 382.5
>
> libavcodec/riscv/h264_chroma_init_riscv.c | 8 +
> libavcodec/riscv/h264_mc_chroma.S | 237 ++++++++++++++--------
> 2 files changed, 160 insertions(+), 85 deletions(-)
>
> diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c
> b/libavcodec/riscv/h264_chroma_init_riscv.c
> index 7c905edfcd..9f95150ea3 100644
> --- a/libavcodec/riscv/h264_chroma_init_riscv.c
> +++ b/libavcodec/riscv/h264_chroma_init_riscv.c
> @@ -27,6 +27,10 @@
>
> void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y);
> void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y);
> +void h264_put_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y);
> +void h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y);
> +void h264_put_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y);
> +void h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src,
> ptrdiff_t stride, int h, int x, int y);
>
> av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
> {
> @@ -36,6 +40,10 @@ av_cold void ff_h264chroma_init_riscv(H264ChromaContext
> *c, int bit_depth)
> if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_get_rv_vlenb() >= 16) {
> c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
> c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
> + c->put_h264_chroma_pixels_tab[1] = h264_put_chroma_mc4_rvv;
> + c->avg_h264_chroma_pixels_tab[1] = h264_avg_chroma_mc4_rvv;
> + c->put_h264_chroma_pixels_tab[2] = h264_put_chroma_mc2_rvv;
> + c->avg_h264_chroma_pixels_tab[2] = h264_avg_chroma_mc2_rvv;
> }
> #endif
> }
> diff --git a/libavcodec/riscv/h264_mc_chroma.S
> b/libavcodec/riscv/h264_mc_chroma.S
> index 364bc3156e..ce99bda44d 100644
> --- a/libavcodec/riscv/h264_mc_chroma.S
> +++ b/libavcodec/riscv/h264_mc_chroma.S
> @@ -19,8 +19,7 @@
> */
> #include "libavutil/riscv/asm.S"
>
> -.macro h264_chroma_mc8 type
> -func h264_\type\()_chroma_mc8_rvv, zve32x
> +.macro do_chroma_mc type unroll
> csrw vxrm, zero
> slli t2, a5, 3
> mul t1, a5, a4
> @@ -30,94 +29,100 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> sub a7, a4, t1
> addi a6, a5, 64
> sub t0, t2, t1
> - vsetivli t3, 8, e8, m1, ta, mu
> + vsetvli t3, t6, e8, m1, ta, mu
> beqz t1, 2f
> blez a3, 8f
> li t4, 0
> li t2, 0
> li t5, 1
> addi a5, t3, 1
> - slli t3, a2, 2
> + slli t3, a2, (1 + \unroll)
> 1: # if (xy != 0)
> add a4, a1, t4
> vsetvli zero, a5, e8, m1, ta, ma
> + .ifc \unroll,1
> addi t2, t2, 4
> + .else
> + addi t2, t2, 2
> + .endif
> vle8.v v10, (a4)
> add a4, a4, a2
> vslide1down.vx v11, v10, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v8, v10, a6
> vwmaccu.vx v8, a7, v11
> vsetvli zero, a5, e8, m1, ta, ma
> vle8.v v12, (a4)
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> add a4, a4, a2
> vwmaccu.vx v8, t0, v12
> vsetvli zero, a5, e8, m1, ta, ma
> vslide1down.vx v13, v12, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v10, v12, a6
> vwmaccu.vx v8, t1, v13
> vwmaccu.vx v10, a7, v13
> vsetvli zero, a5, e8, m1, ta, ma
> vle8.v v14, (a4)
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> add a4, a4, a2
> vwmaccu.vx v10, t0, v14
> vsetvli zero, a5, e8, m1, ta, ma
> vslide1down.vx v15, v14, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v12, v14, a6
> vwmaccu.vx v10, t1, v15
> vwmaccu.vx v12, a7, v15
> + vnclipu.wi v15, v8, 6
> + .ifc \type,avg
> + vle8.v v9, (a0)
> + vaaddu.vv v15, v15, v9
> + .endif
> + vse8.v v15, (a0)
> + add a0, a0, a2
> + vnclipu.wi v8, v10, 6
> + .ifc \type,avg
> + vle8.v v9, (a0)
> + vaaddu.vv v8, v8, v9
> + .endif
> + add t4, t4, t3
> + vse8.v v8, (a0)
> + add a0, a0, a2
> + .ifc \unroll,1
> vsetvli zero, a5, e8, m1, ta, ma
> vle8.v v14, (a4)
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> add a4, a4, a2
> vwmaccu.vx v12, t0, v14
> vsetvli zero, a5, e8, m1, ta, ma
> vslide1down.vx v15, v14, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v16, v14, a6
> vwmaccu.vx v12, t1, v15
> vwmaccu.vx v16, a7, v15
> vsetvli zero, a5, e8, m1, ta, ma
> vle8.v v14, (a4)
> - vsetivli zero, 8, e8, m1, ta, ma
> - add a4, a0, t4
> - add t4, t4, t3
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmaccu.vx v16, t0, v14
> vsetvli zero, a5, e8, m1, ta, ma
> vslide1down.vx v14, v14, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> - vnclipu.wi v15, v8, 6
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmaccu.vx v16, t1, v14
> - .ifc \type,avg
> - vle8.v v9, (a4)
> - vaaddu.vv v15, v15, v9
> - .endif
> - vse8.v v15, (a4)
> - add a4, a4, a2
> - vnclipu.wi v8, v10, 6
> - .ifc \type,avg
> - vle8.v v9, (a4)
> - vaaddu.vv v8, v8, v9
> - .endif
> - vse8.v v8, (a4)
> - add a4, a4, a2
> vnclipu.wi v8, v12, 6
> .ifc \type,avg
> - vle8.v v9, (a4)
> + vle8.v v9, (a0)
> vaaddu.vv v8, v8, v9
> .endif
> - vse8.v v8, (a4)
> - add a4, a4, a2
> + vse8.v v8, (a0)
> + add a0, a0, a2
> vnclipu.wi v8, v16, 6
> .ifc \type,avg
> - vle8.v v9, (a4)
> + vle8.v v9, (a0)
> vaaddu.vv v8, v8, v9
> .endif
> - vse8.v v8, (a4)
> + vse8.v v8, (a0)
> + add a0, a0, a2
> + .endif
> blt t2, a3, 1b
> j 8f
> 2:
> @@ -126,11 +131,15 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> blez a3, 8f
> li a4, 0
> li t1, 0
> - slli a7, a2, 2
> + slli a7, a2, (1 + \unroll)
> 3: # if ((x8 - xy) == 0 && (y8 -xy) != 0)
> add a5, a1, a4
> vsetvli zero, zero, e8, m1, ta, ma
> + .ifc \unroll,1
> addi t1, t1, 4
> + .else
> + addi t1, t1, 2
> + .endif
> vle8.v v8, (a5)
> add a5, a5, a2
> add t2, a5, a2
> @@ -141,42 +150,44 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> add t2, t2, a2
> add a5, t2, a2
> vwmaccu.vx v10, t0, v8
> - vle8.v v8, (t2)
> - vle8.v v14, (a5)
> - add a5, a0, a4
> add a4, a4, a7
> vwmaccu.vx v12, t0, v9
> vnclipu.wi v15, v10, 6
> vwmulu.vx v10, v9, a6
> + vnclipu.wi v9, v12, 6
> .ifc \type,avg
> - vle8.v v16, (a5)
> + vle8.v v16, (a0)
> vaaddu.vv v15, v15, v16
> .endif
> - vse8.v v15, (a5)
> - add a5, a5, a2
> - vnclipu.wi v9, v12, 6
> - vwmaccu.vx v10, t0, v8
> - vwmulu.vx v12, v8, a6
> + vse8.v v15, (a0)
> + add a0, a0, a2
> .ifc \type,avg
> - vle8.v v16, (a5)
> + vle8.v v16, (a0)
> vaaddu.vv v9, v9, v16
> .endif
> - vse8.v v9, (a5)
> - add a5, a5, a2
> + vse8.v v9, (a0)
> + add a0, a0, a2
> + .ifc \unroll,1
> + vle8.v v8, (t2)
> + vle8.v v14, (a5)
> + vwmaccu.vx v10, t0, v8
> + vwmulu.vx v12, v8, a6
> vnclipu.wi v8, v10, 6
> vwmaccu.vx v12, t0, v14
> .ifc \type,avg
> - vle8.v v16, (a5)
> + vle8.v v16, (a0)
> vaaddu.vv v8, v8, v16
> .endif
> - vse8.v v8, (a5)
> - add a5, a5, a2
> + vse8.v v8, (a0)
> + add a0, a0, a2
> vnclipu.wi v8, v12, 6
> .ifc \type,avg
> - vle8.v v16, (a5)
> + vle8.v v16, (a0)
> vaaddu.vv v8, v8, v16
> .endif
> - vse8.v v8, (a5)
> + vse8.v v8, (a0)
> + add a0, a0, a2
> + .endif
> blt t1, a3, 3b
> j 8f
> 4:
> @@ -186,87 +197,95 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> li a4, 0
> li t2, 0
> addi t0, t3, 1
> - slli t1, a2, 2
> + slli t1, a2, (1 + \unroll)
> 5: # if ((x8 - xy) != 0 && (y8 -xy) == 0)
> add a5, a1, a4
> vsetvli zero, t0, e8, m1, ta, ma
> + .ifc \unroll,1
> addi t2, t2, 4
> + .else
> + addi t2, t2, 2
> + .endif
> vle8.v v8, (a5)
> add a5, a5, a2
> vslide1down.vx v9, v8, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v10, v8, a6
> vwmaccu.vx v10, a7, v9
> vsetvli zero, t0, e8, m1, ta, ma
> vle8.v v8, (a5)
> add a5, a5, a2
> vslide1down.vx v9, v8, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v12, v8, a6
> vwmaccu.vx v12, a7, v9
> + vnclipu.wi v16, v10, 6
> + .ifc \type,avg
> + vle8.v v18, (a0)
> + vaaddu.vv v16, v16, v18
> + .endif
> + vse8.v v16, (a0)
> + add a0, a0, a2
> + vnclipu.wi v10, v12, 6
> + .ifc \type,avg
> + vle8.v v18, (a0)
> + vaaddu.vv v10, v10, v18
> + .endif
> + add a4, a4, t1
> + vse8.v v10, (a0)
> + add a0, a0, a2
> + .ifc \unroll,1
> vsetvli zero, t0, e8, m1, ta, ma
> vle8.v v8, (a5)
> add a5, a5, a2
> vslide1down.vx v9, v8, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v14, v8, a6
> vwmaccu.vx v14, a7, v9
> vsetvli zero, t0, e8, m1, ta, ma
> vle8.v v8, (a5)
> - add a5, a0, a4
> - add a4, a4, t1
> vslide1down.vx v9, v8, t5
> - vsetivli zero, 8, e8, m1, ta, ma
> - vnclipu.wi v16, v10, 6
> - .ifc \type,avg
> - vle8.v v18, (a5)
> - vaaddu.vv v16, v16, v18
> - .endif
> - vse8.v v16, (a5)
> - add a5, a5, a2
> - vnclipu.wi v10, v12, 6
> + vsetvli zero, t6, e8, m1, ta, ma
> vwmulu.vx v12, v8, a6
> - .ifc \type,avg
> - vle8.v v18, (a5)
> - vaaddu.vv v10, v10, v18
> - .endif
> - vse8.v v10, (a5)
> - add a5, a5, a2
> vnclipu.wi v8, v14, 6
> vwmaccu.vx v12, a7, v9
> .ifc \type,avg
> - vle8.v v18, (a5)
> + vle8.v v18, (a0)
> vaaddu.vv v8, v8, v18
> .endif
> - vse8.v v8, (a5)
> - add a5, a5, a2
> + vse8.v v8, (a0)
> + add a0, a0, a2
> vnclipu.wi v8, v12, 6
> .ifc \type,avg
> - vle8.v v18, (a5)
> + vle8.v v18, (a0)
> vaaddu.vv v8, v8, v18
> .endif
> - vse8.v v8, (a5)
> + vse8.v v8, (a0)
> + add a0, a0, a2
> + .endif
> blt t2, a3, 5b
> j 8f
> 6:
> blez a3, 8f
> li a4, 0
> li t2, 0
> - slli a7, a2, 2
> + slli a7, a2, (1 + \unroll)
> 7: # the final else, none of the above
> conditions are met
> add t0, a1, a4
> vsetvli zero, zero, e8, m1, ta, ma
> add a5, a0, a4
> add a4, a4, a7
> + .ifc \unroll,1
> addi t2, t2, 4
> + .else
> + addi t2, t2, 2
> + .endif
> vle8.v v8, (t0)
> add t0, t0, a2
> add t1, t0, a2
> vwmulu.vx v10, v8, a6
> vle8.v v8, (t0)
> add t0, t1, a2
> - vle8.v v9, (t1)
> - vle8.v v12, (t0)
> vnclipu.wi v13, v10, 6
> vwmulu.vx v10, v8, a6
> .ifc \type,avg
> @@ -276,13 +295,16 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> vse8.v v13, (a5)
> add a5, a5, a2
> vnclipu.wi v8, v10, 6
> - vwmulu.vx v10, v9, a6
> .ifc \type,avg
> vle8.v v18, (a5)
> vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a5)
> add a5, a5, a2
> + .ifc \unroll,1
> + vle8.v v9, (t1)
> + vle8.v v12, (t0)
> + vwmulu.vx v10, v9, a6
> vnclipu.wi v8, v10, 6
> vwmulu.vx v10, v12, a6
> .ifc \type,avg
> @@ -297,11 +319,56 @@ func h264_\type\()_chroma_mc8_rvv, zve32x
> vaaddu.vv v8, v8, v18
> .endif
> vse8.v v8, (a5)
> + .endif
> blt t2, a3, 7b
> 8:
> ret
> -endfunc
> .endm
>
> -h264_chroma_mc8 put
> -h264_chroma_mc8 avg
> +func h264_put_chroma_mc_rvv, zve32x
> +11:
> + li a7, 3
> + blt a3, a7, 12f
> + do_chroma_mc put 1
> +12:
> + do_chroma_mc put 0
> +endfunc
> +
> +func h264_avg_chroma_mc_rvv, zve32x
> +21:
> + li a7, 3
> + blt a3, a7, 22f
> + do_chroma_mc avg 1
> +22:
> + do_chroma_mc avg 0
> +endfunc
> +
> +func h264_put_chroma_mc8_rvv, zve32x
> + li t6, 8
> + j 11b
> +endfunc
> +
> +func h264_put_chroma_mc4_rvv, zve32x
> + li t6, 4
> + j 11b
> +endfunc
> +
> +func h264_put_chroma_mc2_rvv, zve32x
> + li t6, 2
> + j 11b
> +endfunc
> +
> +func h264_avg_chroma_mc8_rvv, zve32x
> + li t6, 8
> + j 21b
> +endfunc
> +
> +func h264_avg_chroma_mc4_rvv, zve32x
> + li t6, 4
> + j 21b
> +endfunc
> +
> +func h264_avg_chroma_mc2_rvv, zve32x
> + li t6, 2
> + j 21b
> +endfunc
> --
> 2.17.1
>
>
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2023-07-25 3:37 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-06-19 13:06 Arnie Chang
2023-07-25 3:37 ` Arnie Chang [this message]
2023-07-25 6:24 ` Rémi Denis-Courmont
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAPUBFYGAaKC3XitW1jVcjexCkdP3G-xJm-BysqRFsevMzRy1FA@mail.gmail.com \
--to=arnie.chang-at-sifive.com@ffmpeg.org \
--cc=ffmpeg-devel@ffmpeg.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git