Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/2] sws/rgb2rgb: rework R-V V YUY2 to 4:2:2 planar
@ 2023-11-09 18:34 Rémi Denis-Courmont
  2023-11-09 18:34 ` [FFmpeg-devel] [PATCH 2/2] sws/rgb2rgb: fix unaligned accesses in R-V V YUYV to I422p Rémi Denis-Courmont
  0 siblings, 1 reply; 3+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-09 18:34 UTC (permalink / raw)
  To: ffmpeg-devel

This saves three scratch registers and three instructions per line. The
performance gains are mostly negligible. The main point is to free up
registers for further rework.
---
 libswscale/riscv/rgb2rgb_rvv.S | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/libswscale/riscv/rgb2rgb_rvv.S b/libswscale/riscv/rgb2rgb_rvv.S
index 671089c842..172f5918dc 100644
--- a/libswscale/riscv/rgb2rgb_rvv.S
+++ b/libswscale/riscv/rgb2rgb_rvv.S
@@ -127,31 +127,30 @@ func ff_deinterleave_bytes_rvv, zve32x
 endfunc
 
 .macro yuy2_to_i422p y_shift
-        addi    a4, a4, 1
+        slli    t4, a4, 1 // pixel width -> (source) byte width
         lw      t6, (sp)
+        sub     a6, a6, a4
         srai    a4, a4, 1 // pixel width -> chroma width
+        sub     a7, a7, a4
+        sub     t6, t6, t4
 1:
         mv      t4, a4
-        mv      t3, a3
-        mv      t0, a0
-        mv      t1, a1
-        mv      t2, a2
         addi    a5, a5, -1
 2:
         vsetvli    t5, t4, e8, m2, ta, ma
-        vlseg2e16.v v16, (t3)
+        vlseg2e16.v v16, (a3)
         sub        t4, t4, t5
         vnsrl.wi   v24, v16, \y_shift // Y0
-        sh2add     t3, t5, t3
+        sh2add     a3, t5, a3
         vnsrl.wi   v26, v20, \y_shift // Y1
         vnsrl.wi   v28, v16, 8 - \y_shift // U
         vnsrl.wi   v30, v20, 8 - \y_shift // V
-        vsseg2e8.v v24, (t0)
-        sh1add     t0, t5, t0
-        vse8.v     v28, (t1)
-        add        t1, t5, t1
-        vse8.v     v30, (t2)
-        add        t2, t5, t2
+        vsseg2e8.v v24, (a0)
+        sh1add     a0, t5, a0
+        vse8.v     v28, (a1)
+        add        a1, t5, a1
+        vse8.v     v30, (a2)
+        add        a2, t5, a2
         bnez       t4, 2b
 
         add     a3, a3, t6
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [FFmpeg-devel] [PATCH 2/2] sws/rgb2rgb: fix unaligned accesses in R-V V YUYV to I422p
  2023-11-09 18:34 [FFmpeg-devel] [PATCH 1/2] sws/rgb2rgb: rework R-V V YUY2 to 4:2:2 planar Rémi Denis-Courmont
@ 2023-11-09 18:34 ` Rémi Denis-Courmont
  2023-11-09 19:45   ` Rémi Denis-Courmont
  0 siblings, 1 reply; 3+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-09 18:34 UTC (permalink / raw)
  To: ffmpeg-devel

In my personal opinion, we should not need to support unaligned YUY2
pixel maps. They should always be aligned to at least 32 bits, and the
current code assumes just 16 bits. However, checkasm does test for
unaligned input bitmaps. QEMU accepts it, but real hardware does not.

In this particular case, we can at the same time improve performance and
handle unaligned inputs, so do just that.

uyvytoyuv422_c:      104060.0
uyvytoyuv422_rvv_i32: 25284.0 (before)
uyvytoyuv422_rvv_i32: 20148.2 (after)
---
 libswscale/riscv/rgb2rgb_rvv.S | 45 +++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/libswscale/riscv/rgb2rgb_rvv.S b/libswscale/riscv/rgb2rgb_rvv.S
index 172f5918dc..716948dc82 100644
--- a/libswscale/riscv/rgb2rgb_rvv.S
+++ b/libswscale/riscv/rgb2rgb_rvv.S
@@ -126,32 +126,33 @@ func ff_deinterleave_bytes_rvv, zve32x
         ret
 endfunc
 
-.macro yuy2_to_i422p y_shift
-        slli    t4, a4, 1 // pixel width -> (source) byte width
+.macro yuy2_to_i422p luma, chroma
+        srai    t4, a4, 1 // pixel width -> chroma width
         lw      t6, (sp)
+        slli    t5, a4, 1 // pixel width -> (source) byte width
         sub     a6, a6, a4
-        srai    a4, a4, 1 // pixel width -> chroma width
-        sub     a7, a7, a4
-        sub     t6, t6, t4
+        sub     a7, a7, t4
+        sub     t6, t6, t5
 1:
         mv      t4, a4
         addi    a5, a5, -1
 2:
-        vsetvli    t5, t4, e8, m2, ta, ma
-        vlseg2e16.v v16, (a3)
-        sub        t4, t4, t5
-        vnsrl.wi   v24, v16, \y_shift // Y0
-        sh2add     a3, t5, a3
-        vnsrl.wi   v26, v20, \y_shift // Y1
-        vnsrl.wi   v28, v16, 8 - \y_shift // U
-        vnsrl.wi   v30, v20, 8 - \y_shift // V
-        vsseg2e8.v v24, (a0)
-        sh1add     a0, t5, a0
-        vse8.v     v28, (a1)
-        add        a1, t5, a1
-        vse8.v     v30, (a2)
-        add        a2, t5, a2
-        bnez       t4, 2b
+        vsetvli t5, t4, e8, m4, ta, ma
+        vlseg2e8.v v16, (a3)
+        srli    t1, t5, 1
+        vsetvli zero, t1, e8, m2, ta, ma
+        vnsrl.wi   v24, \chroma, 0 // U
+        sub     t4, t4, t5
+        vnsrl.wi   v28, \chroma, 8 // V
+        sh1add  a3, t5, a3
+        vse8.v  v24, (a1)
+        add     a1, t1, a1
+        vse8.v  v28, (a2)
+        add     a2, t1, a2
+        vsetvli zero, t5, e8, m4, ta, ma
+        vse8.v  \luma, (a0)
+        add     a0, t5, a0
+        bnez    t4, 2b
 
         add     a3, a3, t6
         add     a0, a0, a6
@@ -163,9 +164,9 @@ endfunc
 .endm
 
 func ff_uyvytoyuv422_rvv, zve32x
-        yuy2_to_i422p 8
+        yuy2_to_i422p v20, v16
 endfunc
 
 func ff_yuyvtoyuv422_rvv, zve32x
-        yuy2_to_i422p 0
+        yuy2_to_i422p v16, v20
 endfunc
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] sws/rgb2rgb: fix unaligned accesses in R-V V YUYV to I422p
  2023-11-09 18:34 ` [FFmpeg-devel] [PATCH 2/2] sws/rgb2rgb: fix unaligned accesses in R-V V YUYV to I422p Rémi Denis-Courmont
@ 2023-11-09 19:45   ` Rémi Denis-Courmont
  0 siblings, 0 replies; 3+ messages in thread
From: Rémi Denis-Courmont @ 2023-11-09 19:45 UTC (permalink / raw)
  To: ffmpeg-devel

Le torstaina 9. marraskuuta 2023, 20.34.53 EET Rémi Denis-Courmont a écrit :
> In my personal opinion, we should not need to support unaligned YUY2
> pixel maps. They should always be aligned to at least 32 bits, and the
> current code assumes just 16 bits. However, checkasm does test for
> unaligned input bitmaps. QEMU accepts it, but real hardware does not.
> 
> In this particular case, we can at the same time improve performance and
> handle unaligned inputs, so do just that.
> 
> uyvytoyuv422_c:      104060.0
> uyvytoyuv422_rvv_i32: 25284.0 (before)
> uyvytoyuv422_rvv_i32: 20148.2 (after)
> ---
>  libswscale/riscv/rgb2rgb_rvv.S | 45 +++++++++++++++++-----------------
>  1 file changed, 23 insertions(+), 22 deletions(-)
> 
> diff --git a/libswscale/riscv/rgb2rgb_rvv.S b/libswscale/riscv/rgb2rgb_rvv.S
> index 172f5918dc..716948dc82 100644
> --- a/libswscale/riscv/rgb2rgb_rvv.S
> +++ b/libswscale/riscv/rgb2rgb_rvv.S
> @@ -126,32 +126,33 @@ func ff_deinterleave_bytes_rvv, zve32x
>          ret
>  endfunc
> 
> -.macro yuy2_to_i422p y_shift
> -        slli    t4, a4, 1 // pixel width -> (source) byte width
> +.macro yuy2_to_i422p luma, chroma
> +        srai    t4, a4, 1 // pixel width -> chroma width
>          lw      t6, (sp)
> +        slli    t5, a4, 1 // pixel width -> (source) byte width
>          sub     a6, a6, a4
> -        srai    a4, a4, 1 // pixel width -> chroma width
> -        sub     a7, a7, a4
> -        sub     t6, t6, t4
> +        sub     a7, a7, t4
> +        sub     t6, t6, t5
>  1:
>          mv      t4, a4
>          addi    a5, a5, -1
>  2:
> -        vsetvli    t5, t4, e8, m2, ta, ma
> -        vlseg2e16.v v16, (a3)
> -        sub        t4, t4, t5
> -        vnsrl.wi   v24, v16, \y_shift // Y0
> -        sh2add     a3, t5, a3
> -        vnsrl.wi   v26, v20, \y_shift // Y1
> -        vnsrl.wi   v28, v16, 8 - \y_shift // U
> -        vnsrl.wi   v30, v20, 8 - \y_shift // V
> -        vsseg2e8.v v24, (a0)
> -        sh1add     a0, t5, a0
> -        vse8.v     v28, (a1)
> -        add        a1, t5, a1
> -        vse8.v     v30, (a2)
> -        add        a2, t5, a2
> -        bnez       t4, 2b
> +        vsetvli t5, t4, e8, m4, ta, ma
> +        vlseg2e8.v v16, (a3)
> +        srli    t1, t5, 1

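The RVV specification only guarantees ceil(AVL / 2) <= vl <= VLMAX when
VLMAX < AVL < 2 * VLMAX, so we cannot assume that on the penultimate loop
iteration, the selected vector length (t5) will be even. If it is odd, then
this won't work: the srli above rounds the chroma count down, so the last
chroma byte is dropped and U and V are out of step in the next iteration.
The test hardware always picks the maximum possible, which is always even,
but...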

This needs special handling :(

> +        vsetvli zero, t1, e8, m2, ta, ma
> +        vnsrl.wi   v24, \chroma, 0 // U
> +        sub     t4, t4, t5
> +        vnsrl.wi   v28, \chroma, 8 // V
> +        sh1add  a3, t5, a3
> +        vse8.v  v24, (a1)
> +        add     a1, t1, a1
> +        vse8.v  v28, (a2)
> +        add     a2, t1, a2
> +        vsetvli zero, t5, e8, m4, ta, ma
> +        vse8.v  \luma, (a0)
> +        add     a0, t5, a0
> +        bnez    t4, 2b
> 
>          add     a3, a3, t6
>          add     a0, a0, a6
> @@ -163,9 +164,9 @@ endfunc
>  .endm
> 
>  func ff_uyvytoyuv422_rvv, zve32x
> -        yuy2_to_i422p 8
> +        yuy2_to_i422p v20, v16
>  endfunc
> 
>  func ff_yuyvtoyuv422_rvv, zve32x
> -        yuy2_to_i422p 0
> +        yuy2_to_i422p v16, v20
>  endfunc


-- 
Rémi Denis-Courmont
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-11-09 19:45 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-09 18:34 [FFmpeg-devel] [PATCH 1/2] sws/rgb2rgb: rework R-V V YUY2 to 4:2:2 planar Rémi Denis-Courmont
2023-11-09 18:34 ` [FFmpeg-devel] [PATCH 2/2] sws/rgb2rgb: fix unaligned accesses in R-V V YUYV to I422p Rémi Denis-Courmont
2023-11-09 19:45   ` Rémi Denis-Courmont

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git