* [FFmpeg-devel] [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p.
2024-03-16 3:03 [FFmpeg-devel] Add optimization in swscale for LA Shiyou Yin
@ 2024-03-16 3:03 ` Shiyou Yin
2024-03-16 6:22 ` 陈昊
2024-03-16 3:03 ` [FFmpeg-devel] [PATCH 2/3] swscale: [LA] Optimize yuv2plane1_8_c Shiyou Yin
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Shiyou Yin @ 2024-03-16 3:03 UTC (permalink / raw)
To: ffmpeg-devel
---
libswscale/loongarch/swscale.S | 368 ++++++++++++++++++
libswscale/loongarch/swscale_init_loongarch.c | 33 ++
libswscale/loongarch/swscale_loongarch.h | 11 +
libswscale/swscale_internal.h | 1 +
libswscale/utils.c | 6 +-
5 files changed, 418 insertions(+), 1 deletion(-)
diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
index aa4c5cbe28..67b1bc834d 100644
--- a/libswscale/loongarch/swscale.S
+++ b/libswscale/loongarch/swscale.S
@@ -1866,3 +1866,371 @@ function ff_hscale_16_to_19_sub_lsx
ld.d s8, sp, 64
addi.d sp, sp, 72
endfunc
+
+function lumRangeFromJpeg_lsx // a0 = dst (int16_t *), a1 = width, per the prototype in swscale_loongarch.h; in place: dst[i] = (dst[i] * 14071 + 33561947) >> 14
+ li.w t0, 14071 // t0 = multiplier
+ li.w t1, 33561947 // t1 = 32-bit addend applied before the shift
+ vreplgr2vr.h vr0, t0 // vr0 = multiplier broadcast to every halfword lane
+ srli.w t2, a1, 3 // t2 = width / 8 = number of full 8-sample vectors
+ andi t3, a1, 7 // t3 = width % 8 = samples left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1: // vector loop: 8 samples (16 bytes) per iteration
+ vld vr1, a0, 0
+ vreplgr2vr.w vr2, t1 // seed both 32-bit accumulators with the addend
+ vreplgr2vr.w vr3, t1
+ vmaddwev.w.h vr2, vr0, vr1 // vr2 += multiplier * even-indexed samples, widened to 32 bits
+ vmaddwod.w.h vr3, vr0, vr1 // vr3 += multiplier * odd-indexed samples, widened to 32 bits
+ vsrai.w vr2, vr2, 14
+ vsrai.w vr3, vr3, 14
+ vpackev.h vr1, vr3, vr2 // interleave the low halfwords of the even/odd results back into sample order
+ vst vr1, a0, 0
+ addi.d a0, a0, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // width was a multiple of 8: nothing left
+3: // scalar tail: one sample per iteration, same formula
+ ld.h t4, a0, 0
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0
+ addi.d a0, a0, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeFromJpeg_lasx // a0 = dst (int16_t *), a1 = width, per the prototype in swscale_loongarch.h; in place: dst[i] = (dst[i] * 14071 + 33561947) >> 14
+ li.w t0, 14071 // t0 = multiplier
+ li.w t1, 33561947 // t1 = 32-bit addend applied before the shift
+ xvreplgr2vr.h xr0, t0 // xr0 = multiplier broadcast to every halfword lane
+ srli.w t2, a1, 4 // t2 = width / 16 = number of full 16-sample vectors
+ andi t3, a1, 15 // t3 = width % 16 = samples left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1: // vector loop: 16 samples (32 bytes) per iteration
+ xvld xr1, a0, 0
+ xvreplgr2vr.w xr2, t1 // seed both 32-bit accumulators with the addend
+ xvreplgr2vr.w xr3, t1
+ xvmaddwev.w.h xr2, xr0, xr1 // xr2 += multiplier * even-indexed samples, widened to 32 bits
+ xvmaddwod.w.h xr3, xr0, xr1 // xr3 += multiplier * odd-indexed samples, widened to 32 bits
+ xvsrai.w xr2, xr2, 14
+ xvsrai.w xr3, xr3, 14
+ xvpackev.h xr1, xr3, xr2 // interleave the low halfwords of the even/odd results back into sample order
+ xvst xr1, a0, 0
+ addi.d a0, a0, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // width was a multiple of 16: nothing left
+3: // scalar tail: one sample per iteration, same formula
+ ld.h t4, a0, 0
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0
+ addi.d a0, a0, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeToJpeg_lsx // a0 = dst (int16_t *), a1 = width, per the prototype in swscale_loongarch.h; in place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
+ li.w t0, 19077 // t0 = multiplier
+ li.w t1, -39057361 // t1 = 32-bit (negative) addend applied before the shift
+ li.w t2, 30189 // upper clamp applied to each input sample
+ vreplgr2vr.h vr0, t0 // vr0 = multiplier in every halfword lane
+ vreplgr2vr.h vr4, t2 // vr4 = clamp value in every halfword lane
+ srli.w t2, a1, 3 // t2 is reused: width / 8 = number of full 8-sample vectors
+ andi t3, a1, 7 // t3 = width % 8 = samples left for the tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1: // vector loop: 8 samples (16 bytes) per iteration
+ vld vr1, a0, 0
+ vreplgr2vr.w vr2, t1 // seed both 32-bit accumulators with the addend
+ vreplgr2vr.w vr3, t1
+ vmin.h vr1, vr1, vr4 // signed clamp of the inputs to at most 30189
+ vmaddwev.w.h vr2, vr0, vr1 // vr2 += multiplier * even-indexed samples, widened to 32 bits
+ vmaddwod.w.h vr3, vr0, vr1 // vr3 += multiplier * odd-indexed samples, widened to 32 bits
+ vsrai.w vr2, vr2, 14
+ vsrai.w vr3, vr3, 14
+ vpackev.h vr1, vr3, vr2 // interleave the low halfwords of the even/odd results back into sample order
+ vst vr1, a0, 0
+ addi.d a0, a0, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // width was a multiple of 8: nothing left
+3: // tail: one sample per iteration, same formula
+ ld.h t4, a0, 0
+ vreplgr2vr.h vr1, t4 // the clamp of the single sample is done in the vector unit ...
+ vmin.h vr1, vr1, vr4
+ vpickve2gr.h t4, vr1, 0 // ... and lane 0 is moved back to a GPR
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0
+ addi.d a0, a0, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeToJpeg_lasx // a0 = dst (int16_t *), a1 = width, per the prototype in swscale_loongarch.h; in place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
+ li.w t0, 19077 // t0 = multiplier
+ li.w t1, -39057361 // t1 = 32-bit (negative) addend applied before the shift
+ li.w t2, 30189 // upper clamp applied to each input sample
+ xvreplgr2vr.h xr0, t0 // xr0 = multiplier in every halfword lane
+ xvreplgr2vr.h xr4, t2 // xr4 = clamp value in every halfword lane
+ srli.w t2, a1, 4 // t2 is reused: width / 16 = number of full 16-sample vectors
+ andi t3, a1, 15 // t3 = width % 16 = samples left for the tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1: // vector loop: 16 samples (32 bytes) per iteration
+ xvld xr1, a0, 0
+ xvreplgr2vr.w xr2, t1 // seed both 32-bit accumulators with the addend
+ xvreplgr2vr.w xr3, t1
+ xvmin.h xr1, xr1, xr4 // signed clamp of the inputs to at most 30189
+ xvmaddwev.w.h xr2, xr0, xr1 // xr2 += multiplier * even-indexed samples, widened to 32 bits
+ xvmaddwod.w.h xr3, xr0, xr1 // xr3 += multiplier * odd-indexed samples, widened to 32 bits
+ xvsrai.w xr2, xr2, 14
+ xvsrai.w xr3, xr3, 14
+ xvpackev.h xr1, xr3, xr2 // interleave the low halfwords of the even/odd results back into sample order
+ xvst xr1, a0, 0
+ addi.d a0, a0, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // width was a multiple of 16: nothing left
+3: // tail: one sample per iteration, same formula, using 128-bit LSX instructions
+ ld.h t4, a0, 0
+ vreplgr2vr.h vr1, t4
+ vmin.h vr1, vr1, vr4 // vr4 is the low 128-bit half of xr4, which holds the clamp value
+ vpickve2gr.h t4, vr1, 0 // move the clamped sample (lane 0) back to a GPR
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0
+ addi.d a0, a0, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeFromJpeg_lsx // a0 = dstU, a1 = dstV (int16_t *), a2 = width, per the prototype in swscale_loongarch.h; in place on both planes: x = (x * 1799 + 4081085) >> 11
+ li.w t0, 1799 // t0 = multiplier
+ li.w t1, 4081085 // t1 = 32-bit addend applied before the shift
+ vreplgr2vr.h vr0, t0 // vr0 = multiplier in every halfword lane
+ srli.w t2, a2, 3 // t2 = width / 8 = number of full 8-sample vectors per plane
+ andi t3, a2, 7 // t3 = width % 8 = samples left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1: // vector loop: 8 U and 8 V samples per iteration
+ vld vr1, a0, 0 // vr1 = U samples
+ vld vr2, a1, 0 // vr2 = V samples
+ vreplgr2vr.w vr3, t1 // seed the four 32-bit accumulators with the addend
+ vreplgr2vr.w vr4, t1
+ vreplgr2vr.w vr5, t1
+ vreplgr2vr.w vr6, t1
+ vmaddwev.w.h vr3, vr0, vr1 // U, even-indexed samples
+ vmaddwod.w.h vr4, vr0, vr1 // U, odd-indexed samples
+ vmaddwev.w.h vr5, vr0, vr2 // V, even-indexed samples
+ vmaddwod.w.h vr6, vr0, vr2 // V, odd-indexed samples
+ vsrai.w vr3, vr3, 11
+ vsrai.w vr4, vr4, 11
+ vsrai.w vr5, vr5, 11
+ vsrai.w vr6, vr6, 11
+ vpackev.h vr1, vr4, vr3 // interleave even/odd U results back into sample order
+ vpackev.h vr2, vr6, vr5 // interleave even/odd V results back into sample order
+ vst vr1, a0, 0
+ vst vr2, a1, 0
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // width was a multiple of 8: nothing left
+3: // scalar tail: one U and one V sample per iteration, same formula
+ ld.h t4, a0, 0
+ ld.h t5, a1, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 11
+ srai.w t5, t5, 11
+ st.h t4, a0, 0
+ st.h t5, a1, 0
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeFromJpeg_lasx // a0 = dstU, a1 = dstV (int16_t *), a2 = width, per the prototype in swscale_loongarch.h; in place on both planes: x = (x * 1799 + 4081085) >> 11
+ li.w t0, 1799 // t0 = multiplier
+ li.w t1, 4081085 // t1 = 32-bit addend applied before the shift
+ xvreplgr2vr.h xr0, t0 // xr0 = multiplier in every halfword lane
+ srli.w t2, a2, 4 // t2 = width / 16 = number of full 16-sample vectors per plane
+ andi t3, a2, 15 // t3 = width % 16 = samples left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1: // vector loop: 16 U and 16 V samples per iteration
+ xvld xr1, a0, 0 // xr1 = U samples
+ xvld xr2, a1, 0 // xr2 = V samples
+ xvreplgr2vr.w xr3, t1 // seed the four 32-bit accumulators with the addend
+ xvreplgr2vr.w xr4, t1
+ xvreplgr2vr.w xr5, t1
+ xvreplgr2vr.w xr6, t1
+ xvmaddwev.w.h xr3, xr0, xr1 // U, even-indexed samples
+ xvmaddwod.w.h xr4, xr0, xr1 // U, odd-indexed samples
+ xvmaddwev.w.h xr5, xr0, xr2 // V, even-indexed samples
+ xvmaddwod.w.h xr6, xr0, xr2 // V, odd-indexed samples
+ xvsrai.w xr3, xr3, 11
+ xvsrai.w xr4, xr4, 11
+ xvsrai.w xr5, xr5, 11
+ xvsrai.w xr6, xr6, 11
+ xvpackev.h xr1, xr4, xr3 // interleave even/odd U results back into sample order
+ xvpackev.h xr2, xr6, xr5 // interleave even/odd V results back into sample order
+ xvst xr1, a0, 0
+ xvst xr2, a1, 0
+ addi.d a0, a0, 32
+ addi.d a1, a1, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // width was a multiple of 16: nothing left
+3: // scalar tail: one U and one V sample per iteration, same formula
+ ld.h t4, a0, 0
+ ld.h t5, a1, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 11
+ srai.w t5, t5, 11
+ st.h t4, a0, 0
+ st.h t5, a1, 0
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeToJpeg_lsx // a0 = dstU, a1 = dstV (int16_t *), a2 = width, per the prototype in swscale_loongarch.h; in place on both planes: x = (min(x, 30775) * 4663 - 9289992) >> 12
+ li.w t0, 4663 // t0 = multiplier
+ li.w t1, -9289992 // t1 = 32-bit (negative) addend applied before the shift
+ li.w t2, 30775 // upper clamp applied to each input sample
+ vreplgr2vr.h vr0, t0 // vr0 = multiplier in every halfword lane
+ vreplgr2vr.h vr7, t2 // vr7 = clamp value in every halfword lane
+ srli.w t2, a2, 3 // t2 is reused: width / 8 = number of full 8-sample vectors per plane
+ andi t3, a2, 7 // t3 = width % 8 = samples left for the tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1: // vector loop: 8 U and 8 V samples per iteration
+ vld vr1, a0, 0 // vr1 = U samples
+ vld vr2, a1, 0 // vr2 = V samples
+ vreplgr2vr.w vr3, t1 // seed the four 32-bit accumulators with the addend
+ vreplgr2vr.w vr4, t1
+ vreplgr2vr.w vr5, t1
+ vreplgr2vr.w vr6, t1
+ vmin.h vr1, vr1, vr7 // signed clamp of the inputs to at most 30775
+ vmin.h vr2, vr2, vr7
+ vmaddwev.w.h vr3, vr0, vr1 // U, even-indexed samples
+ vmaddwod.w.h vr4, vr0, vr1 // U, odd-indexed samples
+ vmaddwev.w.h vr5, vr0, vr2 // V, even-indexed samples
+ vmaddwod.w.h vr6, vr0, vr2 // V, odd-indexed samples
+ vsrai.w vr3, vr3, 12
+ vsrai.w vr4, vr4, 12
+ vsrai.w vr5, vr5, 12
+ vsrai.w vr6, vr6, 12
+ vpackev.h vr1, vr4, vr3 // interleave even/odd U results back into sample order
+ vpackev.h vr2, vr6, vr5 // interleave even/odd V results back into sample order
+ vst vr1, a0, 0
+ vst vr2, a1, 0
+ addi.d a0, a0, 16
+ addi.d a1, a1, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // width was a multiple of 8: nothing left
+3: // tail: one U and one V sample per iteration, same formula
+ ld.h t4, a0, 0
+ ld.h t5, a1, 0
+ vreplgr2vr.h vr1, t4 // the clamp of the single samples is done in the vector unit ...
+ vreplgr2vr.h vr2, t5
+ vmin.h vr1, vr1, vr7
+ vmin.h vr2, vr2, vr7
+ vpickve2gr.h t4, vr1, 0 // ... and lane 0 is moved back to a GPR
+ vpickve2gr.h t5, vr2, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 12
+ srai.w t5, t5, 12
+ st.h t4, a0, 0
+ st.h t5, a1, 0
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeToJpeg_lasx // a0 = dstU, a1 = dstV (int16_t *), a2 = width, per the prototype in swscale_loongarch.h; in place on both planes: x = (min(x, 30775) * 4663 - 9289992) >> 12
+ li.w t0, 4663 // t0 = multiplier
+ li.w t1, -9289992 // t1 = 32-bit (negative) addend applied before the shift
+ li.w t2, 30775 // upper clamp applied to each input sample
+ xvreplgr2vr.h xr0, t0 // xr0 = multiplier in every halfword lane
+ xvreplgr2vr.h xr7, t2 // xr7 = clamp value in every halfword lane
+ srli.w t2, a2, 4 // t2 is reused: width / 16 = number of full 16-sample vectors per plane
+ andi t3, a2, 15 // t3 = width % 16 = samples left for the tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1: // vector loop: 16 U and 16 V samples per iteration
+ xvld xr1, a0, 0 // xr1 = U samples
+ xvld xr2, a1, 0 // xr2 = V samples
+ xvreplgr2vr.w xr3, t1 // seed the four 32-bit accumulators with the addend
+ xvreplgr2vr.w xr4, t1
+ xvreplgr2vr.w xr5, t1
+ xvreplgr2vr.w xr6, t1
+ xvmin.h xr1, xr1, xr7 // signed clamp of the inputs to at most 30775
+ xvmin.h xr2, xr2, xr7
+ xvmaddwev.w.h xr3, xr0, xr1 // U, even-indexed samples
+ xvmaddwod.w.h xr4, xr0, xr1 // U, odd-indexed samples
+ xvmaddwev.w.h xr5, xr0, xr2 // V, even-indexed samples
+ xvmaddwod.w.h xr6, xr0, xr2 // V, odd-indexed samples
+ xvsrai.w xr3, xr3, 12
+ xvsrai.w xr4, xr4, 12
+ xvsrai.w xr5, xr5, 12
+ xvsrai.w xr6, xr6, 12
+ xvpackev.h xr1, xr4, xr3 // interleave even/odd U results back into sample order
+ xvpackev.h xr2, xr6, xr5 // interleave even/odd V results back into sample order
+ xvst xr1, a0, 0
+ xvst xr2, a1, 0
+ addi.d a0, a0, 32
+ addi.d a1, a1, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // width was a multiple of 16: nothing left
+3: // tail: one U and one V sample per iteration, same formula, using 128-bit LSX instructions
+ ld.h t4, a0, 0
+ ld.h t5, a1, 0
+ vreplgr2vr.h vr1, t4
+ vreplgr2vr.h vr2, t5
+ vmin.h vr1, vr1, vr7 // vr7 is the low 128-bit half of xr7, which holds the clamp value
+ vmin.h vr2, vr2, vr7
+ vpickve2gr.h t4, vr1, 0 // move the clamped samples (lane 0) back to GPRs
+ vpickve2gr.h t5, vr2, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 12
+ srai.w t5, t5, 12
+ st.h t4, a0, 0
+ st.h t5, a1, 0
+ addi.d a0, a0, 2
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 53e4f970b6..6d2786c55f 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -24,6 +24,38 @@
#include "libswscale/rgb2rgb.h"
#include "libavutil/loongarch/cpu.h"
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_lsx(cpu_flags)) {
+ if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+ if (c->dstBpc <= 14) {
+ if (c->srcRange) {
+ c->lumConvertRange = lumRangeFromJpeg_lsx;
+ c->chrConvertRange = chrRangeFromJpeg_lsx;
+ } else {
+ c->lumConvertRange = lumRangeToJpeg_lsx;
+ c->chrConvertRange = chrRangeToJpeg_lsx;
+ }
+ }
+ }
+ }
+ if (have_lasx(cpu_flags)) {
+ if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+ if (c->dstBpc <= 14) {
+ if (c->srcRange) {
+ c->lumConvertRange = lumRangeFromJpeg_lasx;
+ c->chrConvertRange = chrRangeFromJpeg_lasx;
+ } else {
+ c->lumConvertRange = lumRangeToJpeg_lasx;
+ c->chrConvertRange = chrRangeToJpeg_lasx;
+ }
+ }
+ }
+ }
+}
+
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -77,6 +109,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
#endif // #if HAVE_LASX
+ ff_sws_init_range_convert_loongarch(c);
}
av_cold void rgb2rgb_init_loongarch(void)
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 0514abae21..c96b085982 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize, int sh);
+void lumRangeFromJpeg_lsx(int16_t *dst, int width);
+void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lsx(int16_t *dst, int width);
+void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+
void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
@@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
+void lumRangeFromJpeg_lasx(int16_t *dst, int width);
+void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lasx(int16_t *dst, int width);
+void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+
void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
@@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lasx(SwsContext *c);
+
#endif // #if HAVE_LASX
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index abeebbb002..0db581acf8 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -695,6 +695,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
void ff_updateMMXDitherTables(SwsContext *c, int dstY);
av_cold void ff_sws_init_range_convert(SwsContext *c);
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index ab8a68e241..47db65ef0e 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1049,8 +1049,12 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
c->srcRange = srcRange;
c->dstRange = dstRange;
- if (need_reinit)
+ if (need_reinit) {
ff_sws_init_range_convert(c);
+#if ARCH_LOONGARCH64
+ ff_sws_init_range_convert_loongarch(c);
+#endif
+ }
c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
--
2.20.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p.
2024-03-16 3:03 ` [FFmpeg-devel] [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p Shiyou Yin
@ 2024-03-16 6:22 ` 陈昊
0 siblings, 0 replies; 9+ messages in thread
From: 陈昊 @ 2024-03-16 6:22 UTC (permalink / raw)
To: FFmpeg development discussions and patches
LGTM
2024-03-16 11:03:31 "yinshiyou-hf@loongson.cn" <yinshiyou-hf@loongson.cn> 写道:
> ---
> libswscale/loongarch/swscale.S | 368 ++++++++++++++++++
> libswscale/loongarch/swscale_init_loongarch.c | 33 ++
> libswscale/loongarch/swscale_loongarch.h | 11 +
> libswscale/swscale_internal.h | 1 +
> libswscale/utils.c | 6 +-
> 5 files changed, 418 insertions(+), 1 deletion(-)
>
> diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
> index aa4c5cbe28..67b1bc834d 100644
> --- a/libswscale/loongarch/swscale.S
> +++ b/libswscale/loongarch/swscale.S
> @@ -1866,3 +1866,371 @@ function ff_hscale_16_to_19_sub_lsx
> ld.d s8, sp, 64
> addi.d sp, sp, 72
> endfunc
> +
> +function lumRangeFromJpeg_lsx
> + li.w t0, 14071
> + li.w t1, 33561947
> + vreplgr2vr.h vr0, t0
> + srli.w t2, a1, 3
> + andi t3, a1, 7
> + beqz t2, 2f
> +1:
> + vld vr1, a0, 0
> + vreplgr2vr.w vr2, t1
> + vreplgr2vr.w vr3, t1
> + vmaddwev.w.h vr2, vr0, vr1
> + vmaddwod.w.h vr3, vr0, vr1
> + vsrai.w vr2, vr2, 14
> + vsrai.w vr3, vr3, 14
> + vpackev.h vr1, vr3, vr2
> + vst vr1, a0, 0
> + addi.d a0, a0, 16
> + addi.d t2, t2, -1
> + bnez t2, 1b
> +2:
> + beqz t3, 4f
> +3:
> + ld.h t4, a0, 0
> + mul.w t4, t4, t0
> + add.w t4, t4, t1
> + srai.w t4, t4, 14
> + st.h t4, a0, 0
> + addi.d a0, a0, 2
> + addi.d t3, t3, -1
> + bnez t3, 3b
> +4:
> +endfunc
> +
> +function lumRangeFromJpeg_lasx
> + li.w t0, 14071
> + li.w t1, 33561947
> + xvreplgr2vr.h xr0, t0
> + srli.w t2, a1, 4
> + andi t3, a1, 15
> + beqz t2, 2f
> +1:
> + xvld xr1, a0, 0
> + xvreplgr2vr.w xr2, t1
> + xvreplgr2vr.w xr3, t1
> + xvmaddwev.w.h xr2, xr0, xr1
> + xvmaddwod.w.h xr3, xr0, xr1
> + xvsrai.w xr2, xr2, 14
> + xvsrai.w xr3, xr3, 14
> + xvpackev.h xr1, xr3, xr2
> + xvst xr1, a0, 0
> + addi.d a0, a0, 32
> + addi.d t2, t2, -1
> + bnez t2, 1b
> +2:
> + beqz t3, 4f
> +3:
> + ld.h t4, a0, 0
> + mul.w t4, t4, t0
> + add.w t4, t4, t1
> + srai.w t4, t4, 14
> + st.h t4, a0, 0
> + addi.d a0, a0, 2
> + addi.d t3, t3, -1
> + bnez t3, 3b
> +4:
> +endfunc
> +
> +function lumRangeToJpeg_lsx
> + li.w t0, 19077
> + li.w t1, -39057361
> + li.w t2, 30189
> + vreplgr2vr.h vr0, t0
> + vreplgr2vr.h vr4, t2
> + srli.w t2, a1, 3
> + andi t3, a1, 7
> + beqz t2, 2f
> +1:
> + vld vr1, a0, 0
> + vreplgr2vr.w vr2, t1
> + vreplgr2vr.w vr3, t1
> + vmin.h vr1, vr1, vr4
> + vmaddwev.w.h vr2, vr0, vr1
> + vmaddwod.w.h vr3, vr0, vr1
> + vsrai.w vr2, vr2, 14
> + vsrai.w vr3, vr3, 14
> + vpackev.h vr1, vr3, vr2
> + vst vr1, a0, 0
> + addi.d a0, a0, 16
> + addi.d t2, t2, -1
> + bnez t2, 1b
> +2:
> + beqz t3, 4f
> +3:
> + ld.h t4, a0, 0
> + vreplgr2vr.h vr1, t4
> + vmin.h vr1, vr1, vr4
> + vpickve2gr.h t4, vr1, 0
> + mul.w t4, t4, t0
> + add.w t4, t4, t1
> + srai.w t4, t4, 14
> + st.h t4, a0, 0
> + addi.d a0, a0, 2
> + addi.d t3, t3, -1
> + bnez t3, 3b
> +4:
> +endfunc
> +
> +function lumRangeToJpeg_lasx
> + li.w t0, 19077
> + li.w t1, -39057361
> + li.w t2, 30189
> + xvreplgr2vr.h xr0, t0
> + xvreplgr2vr.h xr4, t2
> + srli.w t2, a1, 4
> + andi t3, a1, 15
> + beqz t2, 2f
> +1:
> + xvld xr1, a0, 0
> + xvreplgr2vr.w xr2, t1
> + xvreplgr2vr.w xr3, t1
> + xvmin.h xr1, xr1, xr4
> + xvmaddwev.w.h xr2, xr0, xr1
> + xvmaddwod.w.h xr3, xr0, xr1
> + xvsrai.w xr2, xr2, 14
> + xvsrai.w xr3, xr3, 14
> + xvpackev.h xr1, xr3, xr2
> + xvst xr1, a0, 0
> + addi.d a0, a0, 32
> + addi.d t2, t2, -1
> + bnez t2, 1b
> +2:
> + beqz t3, 4f
> +3:
> + ld.h t4, a0, 0
> + vreplgr2vr.h vr1, t4
> + vmin.h vr1, vr1, vr4
> + vpickve2gr.h t4, vr1, 0
> + mul.w t4, t4, t0
> + add.w t4, t4, t1
> + srai.w t4, t4, 14
> + st.h t4, a0, 0
> + addi.d a0, a0, 2
> + addi.d t3, t3, -1
> + bnez t3, 3b
> +4:
> +endfunc
> +
> +function chrRangeFromJpeg_lsx
> + li.w t0, 1799
> + li.w t1, 4081085
> + vreplgr2vr.h vr0, t0
> + srli.w t2, a2, 3
> + andi t3, a2, 7
> + beqz t2, 2f
> +1:
> + vld vr1, a0, 0
> + vld vr2, a1, 0
> + vreplgr2vr.w vr3, t1
> + vreplgr2vr.w vr4, t1
> + vreplgr2vr.w vr5, t1
> + vreplgr2vr.w vr6, t1
> + vmaddwev.w.h vr3, vr0, vr1
> + vmaddwod.w.h vr4, vr0, vr1
> + vmaddwev.w.h vr5, vr0, vr2
> + vmaddwod.w.h vr6, vr0, vr2
> + vsrai.w vr3, vr3, 11
> + vsrai.w vr4, vr4, 11
> + vsrai.w vr5, vr5, 11
> + vsrai.w vr6, vr6, 11
> + vpackev.h vr1, vr4, vr3
> + vpackev.h vr2, vr6, vr5
> + vst vr1, a0, 0
> + vst vr2, a1, 0
> + addi.d a0, a0, 16
> + addi.d a1, a1, 16
> + addi.d t2, t2, -1
> + bnez t2, 1b
> +2:
> + beqz t3, 4f
> +3:
> + ld.h t4, a0, 0
> + ld.h t5, a1, 0
> + mul.w t4, t4, t0
> + mul.w t5, t5, t0
> + add.w t4, t4, t1
> + add.w t5, t5, t1
> + srai.w t4, t4, 11
> + srai.w t5, t5, 11
> + st.h t4, a0, 0
> + st.h t5, a1, 0
> + addi.d a0, a0, 2
> + addi.d a1, a1, 2
> + addi.d t3, t3, -1
> + bnez t3, 3b
> +4:
> +endfunc
> +
> +function chrRangeFromJpeg_lasx
> + li.w t0, 1799
> + li.w t1, 4081085
> + xvreplgr2vr.h xr0, t0
> + srli.w t2, a2, 4
> + andi t3, a2, 15
> + beqz t2, 2f
> +1:
> + xvld xr1, a0, 0
> + xvld xr2, a1, 0
> + xvreplgr2vr.w xr3, t1
> + xvreplgr2vr.w xr4, t1
> + xvreplgr2vr.w xr5, t1
> + xvreplgr2vr.w xr6, t1
> + xvmaddwev.w.h xr3, xr0, xr1
> + xvmaddwod.w.h xr4, xr0, xr1
> + xvmaddwev.w.h xr5, xr0, xr2
> + xvmaddwod.w.h xr6, xr0, xr2
> + xvsrai.w xr3, xr3, 11
> + xvsrai.w xr4, xr4, 11
> + xvsrai.w xr5, xr5, 11
> + xvsrai.w xr6, xr6, 11
> + xvpackev.h xr1, xr4, xr3
> + xvpackev.h xr2, xr6, xr5
> + xvst xr1, a0, 0
> + xvst xr2, a1, 0
> + addi.d a0, a0, 32
> + addi.d a1, a1, 32
> + addi.d t2, t2, -1
> + bnez t2, 1b
> +2:
> + beqz t3, 4f
> +3:
> + ld.h t4, a0, 0
> + ld.h t5, a1, 0
> + mul.w t4, t4, t0
> + mul.w t5, t5, t0
> + add.w t4, t4, t1
> + add.w t5, t5, t1
> + srai.w t4, t4, 11
> + srai.w t5, t5, 11
> + st.h t4, a0, 0
> + st.h t5, a1, 0
> + addi.d a0, a0, 2
> + addi.d a1, a1, 2
> + addi.d t3, t3, -1
> + bnez t3, 3b
> +4:
> +endfunc
> +
> +function chrRangeToJpeg_lsx
> + li.w t0, 4663
> + li.w t1, -9289992
> + li.w t2, 30775
> + vreplgr2vr.h vr0, t0
> + vreplgr2vr.h vr7, t2
> + srli.w t2, a2, 3
> + andi t3, a2, 7
> + beqz t2, 2f
> +1:
> + vld vr1, a0, 0
> + vld vr2, a1, 0
> + vreplgr2vr.w vr3, t1
> + vreplgr2vr.w vr4, t1
> + vreplgr2vr.w vr5, t1
> + vreplgr2vr.w vr6, t1
> + vmin.h vr1, vr1, vr7
> + vmin.h vr2, vr2, vr7
> + vmaddwev.w.h vr3, vr0, vr1
> + vmaddwod.w.h vr4, vr0, vr1
> + vmaddwev.w.h vr5, vr0, vr2
> + vmaddwod.w.h vr6, vr0, vr2
> + vsrai.w vr3, vr3, 12
> + vsrai.w vr4, vr4, 12
> + vsrai.w vr5, vr5, 12
> + vsrai.w vr6, vr6, 12
> + vpackev.h vr1, vr4, vr3
> + vpackev.h vr2, vr6, vr5
> + vst vr1, a0, 0
> + vst vr2, a1, 0
> + addi.d a0, a0, 16
> + addi.d a1, a1, 16
> + addi.d t2, t2, -1
> + bnez t2, 1b
> +2:
> + beqz t3, 4f
> +3:
> + ld.h t4, a0, 0
> + ld.h t5, a1, 0
> + vreplgr2vr.h vr1, t4
> + vreplgr2vr.h vr2, t5
> + vmin.h vr1, vr1, vr7
> + vmin.h vr2, vr2, vr7
> + vpickve2gr.h t4, vr1, 0
> + vpickve2gr.h t5, vr2, 0
> + mul.w t4, t4, t0
> + mul.w t5, t5, t0
> + add.w t4, t4, t1
> + add.w t5, t5, t1
> + srai.w t4, t4, 12
> + srai.w t5, t5, 12
> + st.h t4, a0, 0
> + st.h t5, a1, 0
> + addi.d a0, a0, 2
> + addi.d a1, a1, 2
> + addi.d t3, t3, -1
> + bnez t3, 3b
> +4:
> +endfunc
> +
> +function chrRangeToJpeg_lasx
> + li.w t0, 4663
> + li.w t1, -9289992
> + li.w t2, 30775
> + xvreplgr2vr.h xr0, t0
> + xvreplgr2vr.h xr7, t2
> + srli.w t2, a2, 4
> + andi t3, a2, 15
> + beqz t2, 2f
> +1:
> + xvld xr1, a0, 0
> + xvld xr2, a1, 0
> + xvreplgr2vr.w xr3, t1
> + xvreplgr2vr.w xr4, t1
> + xvreplgr2vr.w xr5, t1
> + xvreplgr2vr.w xr6, t1
> + xvmin.h xr1, xr1, xr7
> + xvmin.h xr2, xr2, xr7
> + xvmaddwev.w.h xr3, xr0, xr1
> + xvmaddwod.w.h xr4, xr0, xr1
> + xvmaddwev.w.h xr5, xr0, xr2
> + xvmaddwod.w.h xr6, xr0, xr2
> + xvsrai.w xr3, xr3, 12
> + xvsrai.w xr4, xr4, 12
> + xvsrai.w xr5, xr5, 12
> + xvsrai.w xr6, xr6, 12
> + xvpackev.h xr1, xr4, xr3
> + xvpackev.h xr2, xr6, xr5
> + xvst xr1, a0, 0
> + xvst xr2, a1, 0
> + addi.d a0, a0, 32
> + addi.d a1, a1, 32
> + addi.d t2, t2, -1
> + bnez t2, 1b
> +2:
> + beqz t3, 4f
> +3:
> + ld.h t4, a0, 0
> + ld.h t5, a1, 0
> + vreplgr2vr.h vr1, t4
> + vreplgr2vr.h vr2, t5
> + vmin.h vr1, vr1, vr7
> + vmin.h vr2, vr2, vr7
> + vpickve2gr.h t4, vr1, 0
> + vpickve2gr.h t5, vr2, 0
> + mul.w t4, t4, t0
> + mul.w t5, t5, t0
> + add.w t4, t4, t1
> + add.w t5, t5, t1
> + srai.w t4, t4, 12
> + srai.w t5, t5, 12
> + st.h t4, a0, 0
> + st.h t5, a1, 0
> + addi.d a0, a0, 2
> + addi.d a1, a1, 2
> + addi.d t3, t3, -1
> + bnez t3, 3b
> +4:
> +endfunc
> diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
> index 53e4f970b6..6d2786c55f 100644
> --- a/libswscale/loongarch/swscale_init_loongarch.c
> +++ b/libswscale/loongarch/swscale_init_loongarch.c
> @@ -24,6 +24,38 @@
> #include "libswscale/rgb2rgb.h"
> #include "libavutil/loongarch/cpu.h"
>
> +av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
> +{
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (have_lsx(cpu_flags)) {
> + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> + if (c->dstBpc <= 14) {
> + if (c->srcRange) {
> + c->lumConvertRange = lumRangeFromJpeg_lsx;
> + c->chrConvertRange = chrRangeFromJpeg_lsx;
> + } else {
> + c->lumConvertRange = lumRangeToJpeg_lsx;
> + c->chrConvertRange = chrRangeToJpeg_lsx;
> + }
> + }
> + }
> + }
> + if (have_lasx(cpu_flags)) {
> + if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> + if (c->dstBpc <= 14) {
> + if (c->srcRange) {
> + c->lumConvertRange = lumRangeFromJpeg_lasx;
> + c->chrConvertRange = chrRangeFromJpeg_lasx;
> + } else {
> + c->lumConvertRange = lumRangeToJpeg_lasx;
> + c->chrConvertRange = chrRangeToJpeg_lasx;
> + }
> + }
> + }
> + }
> +}
> +
> av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -77,6 +109,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
> c->yuv2planeX = ff_yuv2planeX_8_lasx;
> }
> #endif // #if HAVE_LASX
> + ff_sws_init_range_convert_loongarch(c);
> }
>
> av_cold void rgb2rgb_init_loongarch(void)
> diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
> index 0514abae21..c96b085982 100644
> --- a/libswscale/loongarch/swscale_loongarch.h
> +++ b/libswscale/loongarch/swscale_loongarch.h
> @@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
> const uint8_t *_src, const int16_t *filter,
> const int32_t *filterPos, int filterSize, int sh);
>
> +void lumRangeFromJpeg_lsx(int16_t *dst, int width);
> +void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
> +void lumRangeToJpeg_lsx(int16_t *dst, int width);
> +void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
> +
> void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
> int width, int32_t *rgb2yuv, void *opq);
>
> @@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
> const uint8_t *_src, const int16_t *filter,
> const int32_t *filterPos, int filterSize);
>
> +void lumRangeFromJpeg_lasx(int16_t *dst, int width);
> +void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
> +void lumRangeToJpeg_lasx(int16_t *dst, int width);
> +void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
> +
> void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
> int width, int32_t *rgb2yuv, void *opq);
>
> @@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
> const uint8_t *dither, int offset);
>
> av_cold void ff_sws_init_output_lasx(SwsContext *c);
> +
> #endif // #if HAVE_LASX
>
> #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index abeebbb002..0db581acf8 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -695,6 +695,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
> void ff_updateMMXDitherTables(SwsContext *c, int dstY);
>
> av_cold void ff_sws_init_range_convert(SwsContext *c);
> +av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
>
> SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
> SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index ab8a68e241..47db65ef0e 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -1049,8 +1049,12 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
> c->srcRange = srcRange;
> c->dstRange = dstRange;
>
> - if (need_reinit)
> + if (need_reinit) {
> ff_sws_init_range_convert(c);
> +#if ARCH_LOONGARCH64
> + ff_sws_init_range_convert_loongarch(c);
> +#endif
> + }
>
> c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
> c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
> --
> 2.20.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
本邮件及其附件含有龙芯中科的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。
This email and its attachments contain confidential information from Loongson Technology , which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this email in error, please notify the sender by phone or email immediately and delete it.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* [FFmpeg-devel] [PATCH 2/3] swscale: [LA] Optimize yuv2plane1_8_c.
2024-03-16 3:03 [FFmpeg-devel] Add optimization in swscale for LA Shiyou Yin
2024-03-16 3:03 ` [FFmpeg-devel] [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p Shiyou Yin
@ 2024-03-16 3:03 ` Shiyou Yin
2024-03-16 3:03 ` [FFmpeg-devel] [PATCH 3/3] swscale: [LA] Optimize swscale funcs in input.c Shiyou Yin
2024-03-26 3:11 ` [FFmpeg-devel] Add optimization in swscale for LA Shiyou Yin
3 siblings, 0 replies; 9+ messages in thread
From: Shiyou Yin @ 2024-03-16 3:03 UTC (permalink / raw)
To: ffmpeg-devel
---
libswscale/loongarch/output.S | 254 +++++++++++++++++-
libswscale/loongarch/output_lasx.c | 23 +-
libswscale/loongarch/output_lsx.c | 22 +-
libswscale/loongarch/swscale_init_loongarch.c | 12 +-
libswscale/loongarch/swscale_loongarch.h | 29 +-
5 files changed, 324 insertions(+), 16 deletions(-)
diff --git a/libswscale/loongarch/output.S b/libswscale/loongarch/output.S
index b44bac502a..d71667e38a 100644
--- a/libswscale/loongarch/output.S
+++ b/libswscale/loongarch/output.S
@@ -23,11 +23,11 @@
#include "libavcodec/loongarch/loongson_asm.S"
-/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+/* static void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
* const int16_t **src, uint8_t *dest, int dstW,
* const uint8_t *dither, int offset)
*/
-function ff_yuv2planeX_8_lsx
+function yuv2planeX_8_lsx
addi.w t1, a6, 1
addi.w t2, a6, 2
addi.w t3, a6, 3
@@ -136,3 +136,253 @@ function ff_yuv2planeX_8_lsx
blt zero, a4, .DEST
.END:
endfunc
+
+/*
+ * void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+ * const uint8_t *dither, int offset)
+ */
+function yuv2plane1_8_lsx // a0 = src, a1 = dest, a2 = dstW, a3 = dither, a4 = offset; dest[i] = clip to 0..255 of (src[i] + dither[(i + offset) & 7]) >> 7
+ addi.w t1, a4, 1 // t0..t7 = (offset + k) & 7 for k = 0..7: dither indices for one group of 8 pixels
+ addi.w t2, a4, 2
+ addi.w t3, a4, 3
+ addi.w t4, a4, 4
+ addi.w t5, a4, 5
+ addi.w t6, a4, 6
+ addi.w t7, a4, 7
+ andi t0, a4, 7
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a3, t0 // fetch the 8 dither bytes in pixel order
+ ldx.bu t1, a3, t1
+ ldx.bu t2, a3, t2
+ ldx.bu t3, a3, t3
+ ldx.bu t4, a3, t4
+ ldx.bu t5, a3, t5
+ ldx.bu t6, a3, t6
+ ldx.bu t7, a3, t7
+ vinsgr2vr.h vr1, t0, 0 // vr1 = the 8 dither values as halfwords
+ vinsgr2vr.h vr1, t1, 1
+ vinsgr2vr.h vr1, t2, 2
+ vinsgr2vr.h vr1, t3, 3
+ vinsgr2vr.h vr1, t4, 4
+ vinsgr2vr.h vr1, t5, 5
+ vinsgr2vr.h vr1, t6, 6
+ vinsgr2vr.h vr1, t7, 7
+ vsub.h vr0, vr0, vr0 // vr0 = 0
+ vilvl.h vr2, vr0, vr1 // vr2 = dither for pixels 0..3, zero-extended to 32 bits
+ vilvh.h vr3, vr0, vr1 // vr3 = dither for pixels 4..7, zero-extended to 32 bits
+
+ andi t8, a2, 7 // t8 = dstW % 8 = pixels left after the vector loop
+ srli.d a2, a2, 3 // a2 = number of full 8-pixel groups
+ beqz a2, 2f
+1: // vector loop: 8 pixels per iteration (16 bytes read, 8 bytes written)
+ vld vr1, a0, 0
+ addi.d a0, a0, 16
+ vshuf4i.d vr0, vr1, 8 // put the low four samples into the upper half of vr0 so vexth can widen them
+ vexth.w.h vr4, vr0 // vr4 = samples 0..3 sign-extended to 32 bits
+ vexth.w.h vr5, vr1 // vr5 = samples 4..7 sign-extended to 32 bits
+
+ vadd.w vr4, vr2, vr4
+ vadd.w vr5, vr3, vr5
+ vsrai.w vr4, vr4, 7
+ vsrai.w vr5, vr5, 7
+ vclip255.w vr4, vr4 // clamp to 0..255
+ vclip255.w vr5, vr5
+ vpickev.h vr1, vr5, vr4 // narrow 32 -> 16 bits, keeping pixel order
+ vpickev.b vr1, vr1, vr1 // narrow 16 -> 8 bits; the 8 result bytes are in the low 64 bits
+ fst.d f1, a1, 0 // store the low 64 bits of vr1 (8 pixels)
+ addi.d a1, a1, 8
+ addi.d a2, a2, -1
+ bnez a2, 1b
+2:
+ beqz t8, 4f // dstW was a multiple of 8: done
+3: // tail (runs once): redo the final 8 pixels so the store ends exactly at dest + dstW, overlapping pixels already written
+ add.w a4, a4, t8 // shift the dither phase by t8 to match the rewound start pixel
+ addi.w t1, a4, 1
+ addi.w t2, a4, 2
+ addi.w t3, a4, 3
+ addi.w t4, a4, 4
+ addi.w t5, a4, 5
+ addi.w t6, a4, 6
+ addi.w t7, a4, 7
+ andi t0, a4, 7
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a3, t0
+ ldx.bu t1, a3, t1
+ ldx.bu t2, a3, t2
+ ldx.bu t3, a3, t3
+ ldx.bu t4, a3, t4
+ ldx.bu t5, a3, t5
+ ldx.bu t6, a3, t6
+ ldx.bu t7, a3, t7
+ vinsgr2vr.h vr1, t0, 0
+ vinsgr2vr.h vr1, t1, 1
+ vinsgr2vr.h vr1, t2, 2
+ vinsgr2vr.h vr1, t3, 3
+ vinsgr2vr.h vr1, t4, 4
+ vinsgr2vr.h vr1, t5, 5
+ vinsgr2vr.h vr1, t6, 6
+ vinsgr2vr.h vr1, t7, 7
+ vsub.h vr0, vr0, vr0
+ vilvl.h vr2, vr0, vr1
+ vilvh.h vr3, vr0, vr1
+
+ addi.d a0, a0, -16 // src: back one group, then forward t8 samples (2 bytes each). NOTE(review): if the loop above never ran (dstW < 8) this points before the start of src - confirm callers guarantee dstW >= 8
+ add.d a0, a0, t8
+ add.d a0, a0, t8
+ addi.d a1, a1, -8 // dest: back one group, then forward t8 bytes; the same dstW < 8 concern applies to this store address
+ add.d a1, a1, t8
+
+ vld vr1, a0, 0 // same arithmetic as the vector loop body
+ vshuf4i.d vr0, vr1, 8
+ vexth.w.h vr4, vr0
+ vexth.w.h vr5, vr1
+
+ vadd.w vr4, vr2, vr4
+ vadd.w vr5, vr3, vr5
+ vsrai.w vr4, vr4, 7
+ vsrai.w vr5, vr5, 7
+ vclip255.w vr4, vr4
+ vclip255.w vr5, vr5
+ vpickev.h vr1, vr5, vr4
+ vpickev.b vr1, vr1, vr1
+ fst.d f1, a1, 0
+4:
+endfunc
+
+function yuv2plane1_8_lasx // same register use as yuv2plane1_8_lsx: a0 = src, a1 = dest, a2 = dstW, a3 = dither, a4 = offset; 16 pixels per vector step
+ addi.w t1, a4, 1 // t0..t7 = (offset + k) & 7 for k = 0..7: dither indices (the pattern repeats every 8 pixels)
+ addi.w t2, a4, 2
+ addi.w t3, a4, 3
+ addi.w t4, a4, 4
+ addi.w t5, a4, 5
+ addi.w t6, a4, 6
+ addi.w t7, a4, 7
+ andi t0, a4, 7
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a3, t0 // fetch the 8 dither bytes in pixel order
+ ldx.bu t1, a3, t1
+ ldx.bu t2, a3, t2
+ ldx.bu t3, a3, t3
+ ldx.bu t4, a3, t4
+ ldx.bu t5, a3, t5
+ ldx.bu t6, a3, t6
+ ldx.bu t7, a3, t7
+ vinsgr2vr.h vr1, t0, 0 // low 128 bits of xr1 = the 8 dither values as halfwords
+ vinsgr2vr.h vr1, t1, 1
+ vinsgr2vr.h vr1, t2, 2
+ vinsgr2vr.h vr1, t3, 3
+ vinsgr2vr.h vr1, t4, 4
+ vinsgr2vr.h vr1, t5, 5
+ vinsgr2vr.h vr1, t6, 6
+ vinsgr2vr.h vr1, t7, 7
+ xvpermi.q xr1, xr1, 0 // replicate the 8 dither halfwords into both 128-bit lanes
+ xvsub.h xr0, xr0, xr0 // xr0 = 0
+ xvilvl.h xr2, xr0, xr1 // xr2 = dither 0..3 zero-extended to 32 bits, in each lane
+ xvilvh.h xr3, xr0, xr1 // xr3 = dither 4..7 zero-extended to 32 bits, in each lane
+
+ andi t8, a2, 15 // t8 = dstW % 16 = pixels left after the vector loop
+ srli.d a2, a2, 4 // a2 = number of full 16-pixel groups
+ beqz a2, 2f
+1: // vector loop: 16 pixels per iteration (32 bytes read, 16 bytes written)
+ xvld xr1, a0, 0
+ addi.d a0, a0, 32
+ xvpermi.d xr0, xr1, 0xa0 // duplicate 64-bit elements 0 and 2 so each lane's upper half holds that lane's first four samples
+ xvexth.w.h xr4, xr0 // xr4 = samples 0..3 | 8..11 sign-extended to 32 bits
+ xvexth.w.h xr5, xr1 // xr5 = samples 4..7 | 12..15 sign-extended to 32 bits
+
+ xvadd.w xr4, xr2, xr4
+ xvadd.w xr5, xr3, xr5
+ xvsrai.w xr4, xr4, 7
+ xvsrai.w xr5, xr5, 7
+ xvclip255.w xr4, xr4 // clamp to 0..255
+ xvclip255.w xr5, xr5
+ xvpickev.h xr1, xr5, xr4 // narrow 32 -> 16 bits within each lane
+ xvpickev.b xr0, xr1, xr1 // narrow 16 -> 8 bits; each lane's low 64 bits hold its 8 pixels
+ xvpermi.q xr1, xr0, 1 // bring the upper lane (pixels 8..15) down into the low 128 bits of xr1
+ fst.d f0, a1, 0 // pixels 0..7
+ fst.d f1, a1, 8 // pixels 8..15
+ addi.d a1, a1, 16
+ addi.d a2, a2, -1
+ bnez a2, 1b
+2:
+ beqz t8, 4f // dstW was a multiple of 16: done
+3: // tail (runs once): redo the final 16 pixels so the stores end exactly at dest + dstW, overlapping pixels already written
+ add.w a4, a4, t8 // shift the dither phase by t8 to match the rewound start pixel
+ addi.w t1, a4, 1
+ addi.w t2, a4, 2
+ addi.w t3, a4, 3
+ addi.w t4, a4, 4
+ addi.w t5, a4, 5
+ addi.w t6, a4, 6
+ addi.w t7, a4, 7
+ andi t0, a4, 7
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a3, t0
+ ldx.bu t1, a3, t1
+ ldx.bu t2, a3, t2
+ ldx.bu t3, a3, t3
+ ldx.bu t4, a3, t4
+ ldx.bu t5, a3, t5
+ ldx.bu t6, a3, t6
+ ldx.bu t7, a3, t7
+ vinsgr2vr.h vr1, t0, 0
+ vinsgr2vr.h vr1, t1, 1
+ vinsgr2vr.h vr1, t2, 2
+ vinsgr2vr.h vr1, t3, 3
+ vinsgr2vr.h vr1, t4, 4
+ vinsgr2vr.h vr1, t5, 5
+ vinsgr2vr.h vr1, t6, 6
+ vinsgr2vr.h vr1, t7, 7
+ xvpermi.q xr1, xr1, 0
+ xvsub.h xr0, xr0, xr0
+ xvilvl.h xr2, xr0, xr1
+ xvilvh.h xr3, xr0, xr1
+
+ addi.d a0, a0, -32 // src: back one group, then forward t8 samples (2 bytes each). NOTE(review): if the loop above never ran (dstW < 16) this points before the start of src - confirm callers guarantee dstW >= 16
+ add.d a0, a0, t8
+ add.d a0, a0, t8
+ addi.d a1, a1, -16 // dest: back one group, then forward t8 bytes; the same dstW < 16 concern applies to these store addresses
+ add.d a1, a1, t8
+
+ xvld xr1, a0, 0 // same arithmetic as the vector loop body
+ xvpermi.d xr0, xr1, 0xa0
+ xvexth.w.h xr4, xr0
+ xvexth.w.h xr5, xr1
+
+ xvadd.w xr4, xr2, xr4
+ xvadd.w xr5, xr3, xr5
+ xvsrai.w xr4, xr4, 7
+ xvsrai.w xr5, xr5, 7
+ xvclip255.w xr4, xr4
+ xvclip255.w xr5, xr5
+ xvpickev.h xr1, xr5, xr4
+ xvpickev.b xr0, xr1, xr1
+ xvpermi.q xr1, xr0, 1
+ fst.d f0, a1, 0
+ fst.d f1, a1, 8
+4:
+endfunc
diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
index 277d7063e6..bc8ab8cf36 100644
--- a/libswscale/loongarch/output_lasx.c
+++ b/libswscale/loongarch/output_lasx.c
@@ -22,7 +22,7 @@
#include "swscale_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
-void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
@@ -1775,8 +1775,27 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
-av_cold void ff_sws_init_output_lasx(SwsContext *c)
+av_cold void ff_sws_init_output_lasx(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX)
{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ /* Add initialization once optimized */
+ if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
+ } else if (is16BPS(dstFormat)) {
+ } else if (isNBPS(dstFormat)) {
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
+ } else {
+ *yuv2plane1 = yuv2plane1_8_lasx;
+ *yuv2planeX = yuv2planeX_8_lasx;
+ }
if(c->flags & SWS_FULL_CHR_H_INT) {
switch (c->dstFormat) {
diff --git a/libswscale/loongarch/output_lsx.c b/libswscale/loongarch/output_lsx.c
index 768cc3abc6..de9b1534ee 100644
--- a/libswscale/loongarch/output_lsx.c
+++ b/libswscale/loongarch/output_lsx.c
@@ -1624,8 +1624,28 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
-av_cold void ff_sws_init_output_lsx(SwsContext *c)
+av_cold void ff_sws_init_output_lsx(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX)
{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ /* Add initialization once optimized */
+ if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
+ } else if (is16BPS(dstFormat)) {
+ } else if (isNBPS(dstFormat)) {
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
+ } else {
+ *yuv2plane1 = yuv2plane1_8_lsx;
+ *yuv2planeX = yuv2planeX_8_lsx;
+ }
+
if(c->flags & SWS_FULL_CHR_H_INT) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGBA:
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 6d2786c55f..04d2553fa4 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -60,7 +60,9 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_lsx(cpu_flags)) {
- ff_sws_init_output_lsx(c);
+ ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
+ &c->yuv2nv12cX, &c->yuv2packed1,
+ &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
@@ -80,12 +82,12 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
}
break;
}
- if (c->dstBpc == 8)
- c->yuv2planeX = ff_yuv2planeX_8_lsx;
}
#if HAVE_LASX
if (have_lasx(cpu_flags)) {
- ff_sws_init_output_lasx(c);
+ ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
+ &c->yuv2nv12cX, &c->yuv2packed1,
+ &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -105,8 +107,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
}
break;
}
- if (c->dstBpc == 8)
- c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
#endif // #if HAVE_LASX
ff_sws_init_range_convert_loongarch(c);
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index c96b085982..ea93881f8e 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -61,11 +61,21 @@ void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
int32_t *rgb2yuv, void *opq);
-void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
-av_cold void ff_sws_init_output_lsx(SwsContext *c);
+void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX);
int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
@@ -135,12 +145,21 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
-void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
-av_cold void ff_sws_init_output_lasx(SwsContext *c);
-
+void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lasx(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX);
#endif // #if HAVE_LASX
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
--
2.20.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] swscale: [LA] Optimize swscale funcs in input.c
2024-03-16 3:03 [FFmpeg-devel] Add optimization in swscale for LA Shiyou Yin
2024-03-16 3:03 ` [FFmpeg-devel] [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p Shiyou Yin
2024-03-16 3:03 ` [FFmpeg-devel] [PATCH 2/3] swscale: [LA] Optimize yuv2plane1_8_c Shiyou Yin
@ 2024-03-16 3:03 ` Shiyou Yin
2024-03-26 3:11 ` [FFmpeg-devel] Add optimization in swscale for LA Shiyou Yin
3 siblings, 0 replies; 9+ messages in thread
From: Shiyou Yin @ 2024-03-16 3:03 UTC (permalink / raw)
To: ffmpeg-devel
Optimized 7 funcs with LSX and LASX:
1. yuy2ToUV_c
2. yvy2ToUV_c
3. uyvyToUV_c
4. nv12ToUV_c
5. nv21ToUV_c
6. abgrToA_c
7. rgbaToA_c
---
libswscale/loongarch/Makefile | 1 +
libswscale/loongarch/input.S | 495 ++++++++++++++++++
libswscale/loongarch/input_lasx.c | 43 ++
libswscale/loongarch/input_lsx.c | 65 +++
libswscale/loongarch/swscale_init_loongarch.c | 20 +-
libswscale/loongarch/swscale_loongarch.h | 46 ++
6 files changed, 652 insertions(+), 18 deletions(-)
create mode 100644 libswscale/loongarch/input_lsx.c
diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index c35ba309a4..7ba11d492e 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -9,4 +9,5 @@ LSX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale.o \
loongarch/input.o \
loongarch/output.o \
loongarch/output_lsx.o \
+ loongarch/input_lsx.o \
loongarch/yuv2rgb_lsx.o
diff --git a/libswscale/loongarch/input.S b/libswscale/loongarch/input.S
index d01f7384b1..717592b004 100644
--- a/libswscale/loongarch/input.S
+++ b/libswscale/loongarch/input.S
@@ -283,3 +283,498 @@ function planar_rgb_to_uv_lsx
ld.d s3, sp, 16
addi.d sp, sp, 24
endfunc
+
+/*
+ * void yuy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ * const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function yuy2ToUV_lsx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    srli.d       t4,     a5,    3      /* t4 = number of full 8-sample groups */
+    andi         t5,     a5,    7      /* t5 = samples left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: 32 source bytes in, 8 bytes out to each destination */
+    vld          vr2,    a3,    1      /* load from byte 1 so the wanted bytes sit in even lanes */
+    vld          vr3,    a3,    17
+    addi.d       a3,     a3,    32
+    vpickev.b    vr2,    vr3,   vr2    /* even lanes of both loads: source bytes 1, 3, 5, ... */
+    vpickev.b    vr0,    vr2,   vr2    /* source bytes 1, 5, 9, ...  (offset 1 of each 4-byte group) */
+    vpickod.b    vr1,    vr2,   vr2    /* source bytes 3, 7, 11, ... (offset 3 of each 4-byte group) */
+    fst.d        f0,     a0,    0      /* low 8 bytes of vr0 -> dstU */
+    addi.d       a0,     a0,    8
+    fst.d        f1,     a1,    0      /* low 8 bytes of vr1 -> dstV */
+    addi.d       a1,     a1,    8
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one 4-byte group per iteration */
+    ld.b         t6,     a3,    1
+    ld.b         t7,     a3,    3
+    st.b         t6,     a0,    0
+    st.b         t7,     a1,    0
+    addi.d       a3,     a3,    4
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+function yuy2ToUV_lasx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    srli.d       t4,     a5,    4      /* t4 = number of full 16-sample groups */
+    andi         t5,     a5,    15     /* t5 = samples left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: 64 source bytes in, 16 bytes out to each destination */
+    xvld         xr2,    a3,    1      /* load from byte 1 so the wanted bytes sit in even lanes */
+    xvld         xr3,    a3,    33
+    addi.d       a3,     a3,    64
+    xvpickev.b   xr2,    xr3,   xr2    /* even lanes, picked separately inside each 128-bit half */
+    xvpermi.d    xr2,    xr2,   0xd8   /* swap the two middle 64-bit elements: back to source order */
+    xvpickev.b   xr0,    xr2,   xr2    /* offset-1 byte of each 4-byte group */
+    xvpickod.b   xr1,    xr2,   xr2    /* offset-3 byte of each 4-byte group */
+    xvpermi.d    xr0,    xr0,   0xd8   /* gather the 16 result bytes into the low 128 bits */
+    xvpermi.d    xr1,    xr1,   0xd8
+    vst          vr0,    a0,    0      /* low 128 bits of xr0 -> dstU */
+    addi.d       a0,     a0,    16
+    vst          vr1,    a1,    0      /* low 128 bits of xr1 -> dstV */
+    addi.d       a1,     a1,    16
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one 4-byte group per iteration */
+    ld.b         t6,     a3,    1
+    ld.b         t7,     a3,    3
+    st.b         t6,     a0,    0
+    st.b         t7,     a1,    0
+    addi.d       a3,     a3,    4
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+/*
+ * void yvy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ * const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function yvy2ToUV_lsx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    srli.d       t4,     a5,    3      /* t4 = number of full 8-sample groups */
+    andi         t5,     a5,    7      /* t5 = samples left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: 32 source bytes in, 8 bytes out to each destination */
+    vld          vr2,    a3,    1      /* load from byte 1 so the wanted bytes sit in even lanes */
+    vld          vr3,    a3,    17
+    addi.d       a3,     a3,    32
+    vpickev.b    vr2,    vr3,   vr2    /* even lanes of both loads: source bytes 1, 3, 5, ... */
+    vpickev.b    vr0,    vr2,   vr2    /* source bytes 1, 5, 9, ...  (offset 1 of each 4-byte group) */
+    vpickod.b    vr1,    vr2,   vr2    /* source bytes 3, 7, 11, ... (offset 3 of each 4-byte group) */
+    fst.d        f0,     a1,    0      /* offset-1 bytes go to dstV here */
+    addi.d       a1,     a1,    8
+    fst.d        f1,     a0,    0      /* offset-3 bytes go to dstU here */
+    addi.d       a0,     a0,    8
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one 4-byte group per iteration */
+    ld.b         t6,     a3,    1
+    ld.b         t7,     a3,    3
+    st.b         t6,     a1,    0
+    st.b         t7,     a0,    0
+    addi.d       a3,     a3,    4
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+function yvy2ToUV_lasx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    srli.d       t4,     a5,    4      /* t4 = number of full 16-sample groups */
+    andi         t5,     a5,    15     /* t5 = samples left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: 64 source bytes in, 16 bytes out to each destination */
+    xvld         xr2,    a3,    1      /* load from byte 1 so the wanted bytes sit in even lanes */
+    xvld         xr3,    a3,    33
+    addi.d       a3,     a3,    64
+    xvpickev.b   xr2,    xr3,   xr2    /* even lanes, picked separately inside each 128-bit half */
+    xvpermi.d    xr2,    xr2,   0xd8   /* swap the two middle 64-bit elements: back to source order */
+    xvpickev.b   xr0,    xr2,   xr2    /* offset-1 byte of each 4-byte group */
+    xvpickod.b   xr1,    xr2,   xr2    /* offset-3 byte of each 4-byte group */
+    xvpermi.d    xr0,    xr0,   0xd8   /* gather the 16 result bytes into the low 128 bits */
+    xvpermi.d    xr1,    xr1,   0xd8
+    vst          vr0,    a1,    0      /* offset-1 bytes go to dstV here */
+    addi.d       a1,     a1,    16
+    vst          vr1,    a0,    0      /* offset-3 bytes go to dstU here */
+    addi.d       a0,     a0,    16
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one 4-byte group per iteration */
+    ld.b         t6,     a3,    1
+    ld.b         t7,     a3,    3
+    st.b         t6,     a1,    0
+    st.b         t7,     a0,    0
+    addi.d       a3,     a3,    4
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+/*
+ * void uyvyToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ * const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function uyvyToUV_lsx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,     a5,    7      /* t0 = samples left for the scalar tail */
+    srli.d       a5,     a5,    3      /* a5 = number of full 8-sample groups */
+    beqz         a5,     2f
+1:  /* vector loop: 32 source bytes in, 8 bytes out to each destination */
+    vld          vr0,    a3,    0
+    vld          vr1,    a3,    16
+    addi.d       a5,     a5,    -1
+    addi.d       a3,     a3,    32
+    vpickev.b    vr2,    vr1,   vr0    /* even lanes of both loads: source bytes 0, 2, 4, ... */
+    vpickev.b    vr0,    vr2,   vr2    /* source bytes 0, 4, 8, ...  (offset 0 of each 4-byte group) */
+    vpickod.b    vr1,    vr2,   vr2    /* source bytes 2, 6, 10, ... (offset 2 of each 4-byte group) */
+    fst.d        f0,     a0,    0      /* low 8 bytes of vr0 -> dstU */
+    fst.d        f1,     a1,    0      /* low 8 bytes of vr1 -> dstV */
+    addi.d       a0,     a0,    8
+    addi.d       a1,     a1,    8
+    bnez         a5,     1b
+2:
+    beqz         t0,     4f            /* nothing left over */
+3:  /* scalar tail: must read the same offsets (0 and 2) as the vector loop */
+    ld.b         t1,     a3,    0      /* was offset 1, which is not the byte the vector loop takes */
+    ld.b         t2,     a3,    2      /* was offset 3, same problem */
+    addi.d       a3,     a3,    4
+    addi.d       t0,     t0,    -1
+    st.b         t1,     a0,    0
+    st.b         t2,     a1,    0
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    bnez         t0,     3b
+4:
+endfunc
+
+function uyvyToUV_lasx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,     a5,    15     /* t0 = samples left for the scalar tail */
+    srli.d       a5,     a5,    4      /* a5 = number of full 16-sample groups */
+    beqz         a5,     2f
+1:  /* vector loop: 64 source bytes in, 16 bytes out to each destination */
+    xvld         xr0,    a3,    0
+    xvld         xr1,    a3,    32
+    addi.d       a5,     a5,    -1
+    addi.d       a3,     a3,    64
+    xvpickev.b   xr2,    xr1,   xr0    /* even lanes, picked separately inside each 128-bit half */
+    xvpermi.d    xr2,    xr2,   0xd8   /* swap the two middle 64-bit elements: back to source order */
+    xvpickev.b   xr0,    xr2,   xr2    /* offset-0 byte of each 4-byte group */
+    xvpermi.d    xr0,    xr0,   0xd8   /* gather the 16 result bytes into the low 128 bits */
+    xvpickod.b   xr1,    xr2,   xr2    /* offset-2 byte of each 4-byte group */
+    xvpermi.d    xr1,    xr1,   0xd8
+    vst          vr0,    a0,    0      /* low 128 bits of xr0 -> dstU */
+    vst          vr1,    a1,    0      /* low 128 bits of xr1 -> dstV */
+    addi.d       a0,     a0,    16
+    addi.d       a1,     a1,    16
+    bnez         a5,     1b
+2:
+    beqz         t0,     4f            /* nothing left over */
+3:  /* scalar tail: must read the same offsets (0 and 2) as the vector loop */
+    ld.b         t1,     a3,    0      /* was offset 1, which is not the byte the vector loop takes */
+    ld.b         t2,     a3,    2      /* was offset 3, same problem */
+    addi.d       a3,     a3,    4
+    addi.d       t0,     t0,    -1
+    st.b         t1,     a0,    0
+    st.b         t2,     a1,    0
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    bnez         t0,     3b
+4:
+endfunc
+
+/*
+ * void nv12ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ * const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function nv12ToUV_lsx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    srli.d       t4,     a5,    4      /* t4 = number of full 16-sample groups */
+    andi         t5,     a5,    15     /* t5 = samples left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: 32 interleaved bytes in, 16 bytes out to each destination */
+    vld          vr4,    a3,    0
+    vld          vr5,    a3,    16
+    addi.d       a3,     a3,    32
+    vpickev.b    vr6,    vr5,   vr4    /* even source bytes (first of each pair) */
+    vpickod.b    vr7,    vr5,   vr4    /* odd source bytes (second of each pair) */
+    vst          vr6,    a0,    0      /* first bytes -> dstU */
+    addi.d       a0,     a0,    16
+    vst          vr7,    a1,    0      /* second bytes -> dstV */
+    addi.d       a1,     a1,    16
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one 2-byte pair per iteration */
+    ld.b         t6,     a3,    0
+    ld.b         t7,     a3,    1
+    st.b         t6,     a0,    0
+    st.b         t7,     a1,    0
+    addi.d       a3,     a3,    2
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+function nv12ToUV_lasx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    srli.d       t4,     a5,    5      /* t4 = number of full 32-sample groups */
+    andi         t5,     a5,    31     /* t5 = samples left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: 64 interleaved bytes in, 32 bytes out to each destination */
+    xvld         xr4,    a3,    0
+    xvld         xr5,    a3,    32
+    addi.d       a3,     a3,    64
+    xvpickev.b   xr6,    xr5,   xr4    /* even source bytes, picked inside each 128-bit half */
+    xvpickod.b   xr7,    xr5,   xr4    /* odd source bytes, picked inside each 128-bit half */
+    xvpermi.d    xr6,    xr6,   0xd8   /* swap the two middle 64-bit elements: back to source order */
+    xvpermi.d    xr7,    xr7,   0xd8
+    xvst         xr6,    a0,    0      /* first bytes of each pair -> dstU */
+    addi.d       a0,     a0,    32
+    xvst         xr7,    a1,    0      /* second bytes of each pair -> dstV */
+    addi.d       a1,     a1,    32
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one 2-byte pair per iteration */
+    ld.b         t6,     a3,    0
+    ld.b         t7,     a3,    1
+    st.b         t6,     a0,    0
+    st.b         t7,     a1,    0
+    addi.d       a3,     a3,    2
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+/*
+ * void nv21ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ * const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function nv21ToUV_lsx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    srli.d       t4,     a5,    4      /* t4 = number of full 16-sample groups */
+    andi         t5,     a5,    15     /* t5 = samples left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: 32 interleaved bytes in, 16 bytes out to each destination */
+    vld          vr4,    a3,    0
+    vld          vr5,    a3,    16
+    addi.d       a3,     a3,    32
+    vpickev.b    vr6,    vr5,   vr4    /* even source bytes (first of each pair) */
+    vpickod.b    vr7,    vr5,   vr4    /* odd source bytes (second of each pair) */
+    vst          vr6,    a1,    0      /* first bytes go to dstV here */
+    addi.d       a1,     a1,    16
+    vst          vr7,    a0,    0      /* second bytes go to dstU here */
+    addi.d       a0,     a0,    16
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one 2-byte pair per iteration */
+    ld.b         t6,     a3,    0
+    ld.b         t7,     a3,    1
+    st.b         t6,     a1,    0
+    st.b         t7,     a0,    0
+    addi.d       a3,     a3,    2
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+function nv21ToUV_lasx /* a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    srli.d       t4,     a5,    5      /* t4 = number of full 32-sample groups */
+    andi         t5,     a5,    31     /* t5 = samples left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: 64 interleaved bytes in, 32 bytes out to each destination */
+    xvld         xr4,    a3,    0
+    xvld         xr5,    a3,    32
+    addi.d       a3,     a3,    64
+    xvpickev.b   xr6,    xr5,   xr4    /* even source bytes, picked inside each 128-bit half */
+    xvpickod.b   xr7,    xr5,   xr4    /* odd source bytes, picked inside each 128-bit half */
+    xvpermi.d    xr6,    xr6,   0xd8   /* swap the two middle 64-bit elements: back to source order */
+    xvpermi.d    xr7,    xr7,   0xd8
+    xvst         xr6,    a1,    0      /* first bytes of each pair go to dstV here */
+    addi.d       a1,     a1,    32
+    xvst         xr7,    a0,    0      /* second bytes of each pair go to dstU here */
+    addi.d       a0,     a0,    32
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one 2-byte pair per iteration */
+    ld.b         t6,     a3,    0
+    ld.b         t7,     a3,    1
+    st.b         t6,     a1,    0
+    st.b         t7,     a0,    0
+    addi.d       a3,     a3,    2
+    addi.d       a0,     a0,    1
+    addi.d       a1,     a1,    1
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+/*
+ *void abgrToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ * const uint8_t *unused2, int width, uint32_t *unused, void *opq)
+ */
+function abgrToA_lsx /* a0 = _dst (16-bit samples), a1 = src, a4 = width */
+    andi         t0,     a4,    7      /* t0 = pixels left for the scalar tail */
+    srli.d       a4,     a4,    3      /* a4 = number of full 8-pixel groups */
+    vxor.v       vr0,    vr0,   vr0    /* vr0 = 0, supplies the high byte when widening */
+    beqz         a4,     2f
+1:  /* vector loop: 32 source bytes in, eight 16-bit samples out */
+    vld          vr1,    a1,    0
+    vld          vr2,    a1,    16
+    addi.d       a4,     a4,    -1
+    addi.d       a1,     a1,    32
+    vpickev.b    vr3,    vr2,   vr1    /* even lanes: source bytes 0, 2, 4, ... */
+    vpackev.b    vr3,    vr0,   vr3    /* keep source bytes 0, 4, 8, ... zero-extended to 16 bits */
+    vslli.h      vr1,    vr3,   6
+    vsrli.h      vr2,    vr3,   2
+    vor.v        vr3,    vr2,   vr1    /* (a << 6) | (a >> 2) */
+    vst          vr3,    a0,    0
+    addi.d       a0,     a0,    16
+    bnez         a4,     1b
+2:
+    beqz         t0,     4f            /* nothing left over */
+3:  /* scalar tail: must read byte 0 of each pixel, like the vector loop */
+    ld.b         t1,     a1,    0      /* was offset 3, which is a different channel */
+    addi.d       t0,     t0,    -1
+    addi.d       a1,     a1,    4
+    andi         t1,     t1,    0xff   /* undo the sign extension done by ld.b */
+    slli.w       t2,     t1,    6
+    srli.w       t3,     t1,    2
+    or           t1,     t2,    t3     /* (a << 6) | (a >> 2) */
+    st.h         t1,     a0,    0
+    addi.d       a0,     a0,    2
+    bnez         t0,     3b
+4:
+endfunc
+
+function abgrToA_lasx /* a0 = _dst (16-bit samples), a1 = src, a4 = width */
+    andi         t0,     a4,    15     /* t0 = pixels left for the scalar tail */
+    srli.d       a4,     a4,    4      /* a4 = number of full 16-pixel groups */
+    xvxor.v      xr0,    xr0,   xr0    /* xr0 = 0, supplies the high byte when widening */
+    beqz         a4,     2f
+1:  /* vector loop: 64 source bytes in, sixteen 16-bit samples out */
+    xvld         xr1,    a1,    0
+    xvld         xr2,    a1,    32
+    addi.d       a4,     a4,    -1
+    addi.d       a1,     a1,    64
+    xvpickev.b   xr3,    xr2,   xr1    /* even lanes, picked separately inside each 128-bit half */
+    xvpermi.d    xr3,    xr3,   0xd8   /* swap the two middle 64-bit elements: back to source order */
+    xvpackev.b   xr3,    xr0,   xr3    /* keep byte 0 of each pixel, zero-extended to 16 bits */
+    xvslli.h     xr1,    xr3,   6
+    xvsrli.h     xr2,    xr3,   2
+    xvor.v       xr3,    xr2,   xr1    /* (a << 6) | (a >> 2) */
+    xvst         xr3,    a0,    0
+    addi.d       a0,     a0,    32
+    bnez         a4,     1b
+2:
+    beqz         t0,     4f            /* nothing left over */
+3:  /* scalar tail: must read byte 0 of each pixel, like the vector loop */
+    ld.b         t1,     a1,    0      /* was offset 3, which is a different channel */
+    addi.d       t0,     t0,    -1
+    addi.d       a1,     a1,    4
+    andi         t1,     t1,    0xff   /* undo the sign extension done by ld.b */
+    slli.w       t2,     t1,    6
+    srli.w       t3,     t1,    2
+    or           t1,     t2,    t3     /* (a << 6) | (a >> 2) */
+    st.h         t1,     a0,    0
+    addi.d       a0,     a0,    2
+    bnez         t0,     3b
+4:
+endfunc
+
+/*
+ *void rgbaToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ * const uint8_t *unused2, int width, uint32_t *unused, void *opq)
+ */
+function rgbaToA_lsx /* a0 = _dst (16-bit samples), a1 = src, a4 = width */
+    vxor.v       vr4,    vr4,   vr4    /* vr4 = 0, supplies the high byte when widening */
+    srli.d       t4,     a4,    3      /* t4 = number of full 8-pixel groups */
+    andi         t5,     a4,    7      /* t5 = pixels left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: eight 4-byte pixels in, eight 16-bit samples out */
+    vld          vr5,    a1,    3      /* load from byte 3 so the wanted bytes sit in lanes 0, 4, 8, ... */
+    vld          vr6,    a1,    19
+    addi.d       a1,     a1,    32
+    vpickev.b    vr5,    vr6,   vr5    /* even lanes: source bytes 3, 5, 7, ... */
+    vpackev.b    vr5,    vr4,   vr5    /* keep source bytes 3, 7, 11, ... zero-extended to 16 bits */
+    vslli.h      vr6,    vr5,   6
+    vsrli.h      vr7,    vr5,   2
+    vor.v        vr5,    vr7,   vr6    /* (a << 6) | (a >> 2) */
+    vst          vr5,    a0,    0
+    addi.d       a0,     a0,    16
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one pixel per iteration, byte 3 of each */
+    ld.b         t6,     a1,    3
+    addi.d       a1,     a1,    4
+    andi         t6,     t6,    0xff   /* undo the sign extension done by ld.b */
+    srli.w       t7,     t6,    2
+    slli.w       t6,     t6,    6
+    or           t6,     t6,    t7     /* (a << 6) | (a >> 2) */
+    st.h         t6,     a0,    0
+    addi.d       a0,     a0,    2
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
+
+function rgbaToA_lasx /* a0 = _dst (16-bit samples), a1 = src, a4 = width */
+    xvxor.v      xr4,    xr4,   xr4    /* xr4 = 0, supplies the high byte when widening */
+    srli.d       t4,     a4,    4      /* t4 = number of full 16-pixel groups */
+    andi         t5,     a4,    15     /* t5 = pixels left for the scalar tail */
+    beqz         t4,     2f
+1:  /* vector loop: sixteen 4-byte pixels in, sixteen 16-bit samples out */
+    xvld         xr5,    a1,    3      /* load from byte 3 so the wanted bytes sit in lanes 0, 4, 8, ... */
+    xvld         xr6,    a1,    35
+    addi.d       a1,     a1,    64
+    xvpickev.b   xr5,    xr6,   xr5    /* even lanes, picked separately inside each 128-bit half */
+    xvpermi.d    xr5,    xr5,   0xd8   /* swap the two middle 64-bit elements: back to source order */
+    xvpackev.b   xr5,    xr4,   xr5    /* keep byte 3 of each pixel, zero-extended to 16 bits */
+    xvslli.h     xr6,    xr5,   6
+    xvsrli.h     xr7,    xr5,   2
+    xvor.v       xr5,    xr7,   xr6    /* (a << 6) | (a >> 2) */
+    xvst         xr5,    a0,    0
+    addi.d       a0,     a0,    32
+    addi.d       t4,     t4,    -1
+    bnez         t4,     1b
+2:
+    beqz         t5,     4f            /* nothing left over */
+3:  /* scalar tail: one pixel per iteration, byte 3 of each */
+    ld.b         t6,     a1,    3
+    addi.d       a1,     a1,    4
+    andi         t6,     t6,    0xff   /* undo the sign extension done by ld.b */
+    srli.w       t7,     t6,    2
+    slli.w       t6,     t6,    6
+    or           t6,     t6,    t7     /* (a << 6) | (a >> 2) */
+    st.h         t6,     a0,    0
+    addi.d       a0,     a0,    2
+    addi.d       t5,     t5,    -1
+    bnez         t5,     3b
+4:
+endfunc
diff --git a/libswscale/loongarch/input_lasx.c b/libswscale/loongarch/input_lasx.c
index 4830072eaf..0f1d954880 100644
--- a/libswscale/loongarch/input_lasx.c
+++ b/libswscale/loongarch/input_lasx.c
@@ -200,3 +200,46 @@ void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
dst[i] = (tem_ry * r + tem_gy * g + tem_by * b + set) >> shift;
}
}
+
+av_cold void ff_sws_init_input_lasx(SwsContext *c)
+{
+    /* Install the LASX input readers that match c->srcFormat; other formats keep their current readers. */
+    switch (c->srcFormat) {
+    case AV_PIX_FMT_YUYV422:
+        c->chrToYV12 = yuy2ToUV_lasx;
+        break;
+    case AV_PIX_FMT_YVYU422:
+        c->chrToYV12 = yvy2ToUV_lasx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->chrToYV12 = uyvyToUV_lasx;
+        break;
+    case AV_PIX_FMT_NV12:
+    case AV_PIX_FMT_NV16:
+    case AV_PIX_FMT_NV24:
+        c->chrToYV12 = nv12ToUV_lasx;
+        break;
+    case AV_PIX_FMT_NV21:
+    case AV_PIX_FMT_NV42:
+        c->chrToYV12 = nv21ToUV_lasx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GBRP:
+        c->readChrPlanar = planar_rgb_to_uv_lasx;
+        c->readLumPlanar = planar_rgb_to_y_lasx; /* keep the luma reader paired with the chroma one */
+        break;
+    }
+
+    if (c->needAlpha) { /* alpha readers are only installed when alpha is needed */
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_BGRA:
+        case AV_PIX_FMT_RGBA:
+            c->alpToYV12 = rgbaToA_lasx;
+            break;
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_ARGB:
+            c->alpToYV12 = abgrToA_lasx;
+            break;
+        }
+    }
+}
diff --git a/libswscale/loongarch/input_lsx.c b/libswscale/loongarch/input_lsx.c
new file mode 100644
index 0000000000..1bb04457bb
--- /dev/null
+++ b/libswscale/loongarch/input_lsx.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin<yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+
+av_cold void ff_sws_init_input_lsx(SwsContext *c)
+{
+    /* Install the LSX input readers that match c->srcFormat; other formats keep their current readers. */
+    switch (c->srcFormat) {
+    case AV_PIX_FMT_YUYV422:
+        c->chrToYV12 = yuy2ToUV_lsx;
+        break;
+    case AV_PIX_FMT_YVYU422:
+        c->chrToYV12 = yvy2ToUV_lsx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->chrToYV12 = uyvyToUV_lsx;
+        break;
+    case AV_PIX_FMT_NV12:
+    case AV_PIX_FMT_NV16:
+    case AV_PIX_FMT_NV24:
+        c->chrToYV12 = nv12ToUV_lsx;
+        break;
+    case AV_PIX_FMT_NV21:
+    case AV_PIX_FMT_NV42:
+        c->chrToYV12 = nv21ToUV_lsx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GBRP:
+        c->readChrPlanar = planar_rgb_to_uv_lsx;
+        c->readLumPlanar = planar_rgb_to_y_lsx; /* keep the luma reader paired with the chroma one */
+        break;
+    }
+
+    if (c->needAlpha) { /* alpha readers are only installed when alpha is needed */
+        switch (c->srcFormat) {
+        case AV_PIX_FMT_BGRA:
+        case AV_PIX_FMT_RGBA:
+            c->alpToYV12 = rgbaToA_lsx;
+            break;
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_ARGB:
+            c->alpToYV12 = abgrToA_lsx;
+            break;
+        }
+    }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 04d2553fa4..3a5a7ee856 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -63,6 +63,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
&c->yuv2nv12cX, &c->yuv2packed1,
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
+ ff_sws_init_input_lsx(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
@@ -73,21 +74,13 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
: ff_hscale_16_to_15_lsx;
}
- switch (c->srcFormat) {
- case AV_PIX_FMT_GBRAP:
- case AV_PIX_FMT_GBRP:
- {
- c->readChrPlanar = planar_rgb_to_uv_lsx;
- c->readLumPlanar = planar_rgb_to_y_lsx;
- }
- break;
- }
}
#if HAVE_LASX
if (have_lasx(cpu_flags)) {
ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
&c->yuv2nv12cX, &c->yuv2packed1,
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
+ ff_sws_init_input_lasx(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -98,15 +91,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lasx
: ff_hscale_16_to_15_lasx;
}
- switch (c->srcFormat) {
- case AV_PIX_FMT_GBRAP:
- case AV_PIX_FMT_GBRP:
- {
- c->readChrPlanar = planar_rgb_to_uv_lasx;
- c->readLumPlanar = planar_rgb_to_y_lasx;
- }
- break;
- }
}
#endif // #if HAVE_LASX
ff_sws_init_range_convert_loongarch(c);
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index ea93881f8e..07c91bc25c 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -68,6 +68,29 @@ void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
+void yuy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void yvy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void uyvyToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv12ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv21ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void abgrToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+void rgbaToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+av_cold void ff_sws_init_input_lsx(SwsContext *c);
+
av_cold void ff_sws_init_output_lsx(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
@@ -152,6 +175,29 @@ void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
+void yuy2ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void yvy2ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void uyvyToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv12ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv21ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void abgrToA_lasx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+void rgbaToA_lasx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+av_cold void ff_sws_init_input_lasx(SwsContext *c);
+
av_cold void ff_sws_init_output_lasx(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
--
2.20.1
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] Add optimization in swscale for LA.
2024-03-16 3:03 [FFmpeg-devel] Add optimization in swscale for LA Shiyou Yin
` (2 preceding siblings ...)
2024-03-16 3:03 ` [FFmpeg-devel] [PATCH 3/3] swscale: [LA] Optimize swscale funcs in input.c Shiyou Yin
@ 2024-03-26 3:11 ` Shiyou Yin
2024-03-26 19:31 ` Michael Niedermayer
3 siblings, 1 reply; 9+ messages in thread
From: Shiyou Yin @ 2024-03-26 3:11 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Michael Niedermayer
> 2024年3月16日 11:03,Shiyou Yin <yinshiyou-hf@loongson.cn> 写道:
>
> [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p.
> [PATCH 2/3] swscale: [LA] Optimize yuv2plane1_8_c.
> [PATCH 3/3] swscale: [LA] Optimize swscale funcs in input.c
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Hi, Michael,
Could you please help review this patch set? Thanks.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] Add optimization in swscale for LA.
2024-03-26 3:11 ` [FFmpeg-devel] Add optimization in swscale for LA Shiyou Yin
@ 2024-03-26 19:31 ` Michael Niedermayer
2024-04-09 12:19 ` Shiyou Yin
0 siblings, 1 reply; 9+ messages in thread
From: Michael Niedermayer @ 2024-03-26 19:31 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 1048 bytes --]
On Tue, Mar 26, 2024 at 11:11:00AM +0800, Shiyou Yin wrote:
>
> > 2024年3月16日 11:03,Shiyou Yin <yinshiyou-hf@loongson.cn> 写道:
> >
> > [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p.
> > [PATCH 2/3] swscale: [LA] Optimize yuv2plane1_8_c.
> > [PATCH 3/3] swscale: [LA] Optimize swscale funcs in input.c
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe”.
>
> Hi, Michale
> Could you please help to review this patch set, thanks.
I can apply it if it has been reviewed, but I cannot review it currently.
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
In fact, the RIAA has been known to suggest that students drop out
of college or go to community college in order to be able to afford
settlements. -- The RIAA
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] Add optimization in swscale for LA.
2024-03-26 19:31 ` Michael Niedermayer
@ 2024-04-09 12:19 ` Shiyou Yin
2024-04-11 21:54 ` Michael Niedermayer
0 siblings, 1 reply; 9+ messages in thread
From: Shiyou Yin @ 2024-04-09 12:19 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> 2024年3月27日 03:31,Michael Niedermayer <michael@niedermayer.cc> 写道:
>
> On Tue, Mar 26, 2024 at 11:11:00AM +0800, Shiyou Yin wrote:
>>
>>> 2024年3月16日 11:03,Shiyou Yin <yinshiyou-hf@loongson.cn> 写道:
>>>
>>> [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p.
>>> [PATCH 2/3] swscale: [LA] Optimize yuv2plane1_8_c.
>>> [PATCH 3/3] swscale: [LA] Optimize swscale funcs in input.c
>>>
>>> _______________________________________________
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe”.
>>
>> Hi, Michale
>> Could you please help to review this patch set, thanks.
>
> I can apply it if it has been reviewed but i cannot review it currently
>
> thx
>
Please help apply this patch set; it has been tested and reviewed by my colleague.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] Add optimization in swscale for LA.
2024-04-09 12:19 ` Shiyou Yin
@ 2024-04-11 21:54 ` Michael Niedermayer
0 siblings, 0 replies; 9+ messages in thread
From: Michael Niedermayer @ 2024-04-11 21:54 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1.1: Type: text/plain, Size: 1467 bytes --]
On Tue, Apr 09, 2024 at 08:19:19PM +0800, Shiyou Yin wrote:
>
>
> > 2024年3月27日 03:31,Michael Niedermayer <michael@niedermayer.cc> 写道:
> >
> > On Tue, Mar 26, 2024 at 11:11:00AM +0800, Shiyou Yin wrote:
> >>
> >>> 2024年3月16日 11:03,Shiyou Yin <yinshiyou-hf@loongson.cn> 写道:
> >>>
> >>> [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p.
> >>> [PATCH 2/3] swscale: [LA] Optimize yuv2plane1_8_c.
> >>> [PATCH 3/3] swscale: [LA] Optimize swscale funcs in input.c
> >>>
> >>> _______________________________________________
> >>> ffmpeg-devel mailing list
> >>> ffmpeg-devel@ffmpeg.org
> >>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>>
> >>> To unsubscribe, visit link above, or email
> >>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> >>
> >> Hi, Michael
> >> Could you please help to review this patch set, thanks.
> >
> > I can apply it if it has been reviewed but i cannot review it currently
> >
> > thx
> >
>
> Please help to apply this patch set, it has been tested and reviewed by my colleague.
will apply
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
"You are 36 times more likely to die in a bathtub than at the hands of a
terrorist. Also, you are 2.5 times more likely to become a president and
2 times more likely to become an astronaut, than to die in a terrorist
attack." -- Thoughty2
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
[-- Attachment #2: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 9+ messages in thread