* [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv [not found] <20240615095718.37319-1-quinkblack@foxmail.com> @ 2024-06-15 9:57 ` Zhao Zhili 2024-06-18 20:32 ` Martin Storsjö 2024-06-19 7:07 ` Rémi Denis-Courmont 0 siblings, 2 replies; 9+ messages in thread From: Zhao Zhili @ 2024-06-15 9:57 UTC (permalink / raw) To: ffmpeg-devel; +Cc: Zhao Zhili From: Zhao Zhili <zhilizhao@tencent.com> Test on Apple M1 with kperf bgra_to_uv_8_c: 13.4 bgra_to_uv_8_neon: 37.4 bgra_to_uv_128_c: 155.9 bgra_to_uv_128_neon: 91.7 bgra_to_uv_1080_c: 1173.2 bgra_to_uv_1080_neon: 822.7 bgra_to_uv_1920_c: 2078.2 bgra_to_uv_1920_neon: 1437.7 bgra_to_uv_half_8_c: 17.9 bgra_to_uv_half_8_neon: 37.4 bgra_to_uv_half_128_c: 103.9 bgra_to_uv_half_128_neon: 73.9 bgra_to_uv_half_1080_c: 850.2 bgra_to_uv_half_1080_neon: 484.2 bgra_to_uv_half_1920_c: 1479.2 bgra_to_uv_half_1920_neon: 824.2 bgra_to_y_8_c: 8.2 bgra_to_y_8_neon: 18.2 bgra_to_y_128_c: 101.4 bgra_to_y_128_neon: 74.9 bgra_to_y_1080_c: 739.4 bgra_to_y_1080_neon: 613.4 bgra_to_y_1920_c: 1298.7 bgra_to_y_1920_neon: 918.7 --- libswscale/aarch64/input.S | 81 +++++++++++++++++++++++++++++++----- libswscale/aarch64/swscale.c | 16 +++++++ 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S index 2b956fe5c2..37f1158504 100644 --- a/libswscale/aarch64/input.S +++ b/libswscale/aarch64/input.S @@ -20,8 +20,12 @@ #include "libavutil/aarch64/asm.S" -.macro rgb_to_yuv_load_rgb src +.macro rgb_to_yuv_load_rgb src, element=3 + .if \element == 3 ld3 { v16.16b, v17.16b, v18.16b }, [\src] + .else + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] + .endif uxtl v19.8h, v16.8b // v19: r uxtl v20.8h, v17.8b // v20: g uxtl v21.8h, v18.8b // v21: b @@ -43,7 +47,7 @@ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift .endm -.macro rgbToY bgr +.macro rgbToY bgr, element=3 cmp w4, #0 // check width > 0 .if \bgr ldr w12, [x5] // w12: ry @@ -67,11 
+71,15 @@ dup v2.8h, w12 b.lt 2f 1: - rgb_to_yuv_load_rgb x1 + rgb_to_yuv_load_rgb x1, \element rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 sub w4, w4, #16 // width -= 16 + .if \element == 3 add x1, x1, #48 // src += 48 + .else + add x1, x1, #64 + .endif cmp w4, #16 // width >= 16 ? stp q16, q17, [x0], #32 // store to dst b.ge 1b @@ -86,7 +94,7 @@ smaddl x13, w15, w12, x13 // x13 += by * b asr w13, w13, #9 // x13 >>= 9 sub w4, w4, #1 // width-- - add x1, x1, #3 // src += 3 + add x1, x1, \element strh w13, [x0], #2 // store to dst cbnz w4, 2b 3: @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1 rgbToY bgr=1 endfunc +function ff_rgba32ToY_neon, export=1 + rgbToY bgr=0, element=4 +endfunc + +function ff_bgra32ToY_neon, export=1 + rgbToY bgr=1, element=4 +endfunc + .macro rgb_load_uv_coeff half, bgr .if \bgr ldr w12, [x6, #12] @@ -130,7 +146,7 @@ endfunc dup v6.4s, w9 .endm -.macro rgbToUV_half bgr +.macro rgbToUV_half bgr, element=3 cmp w5, #0 // check width > 0 b.le 3f @@ -139,7 +155,11 @@ endfunc b.lt 2f // The following comments assume RGB order. The logic for RGB and BGR is the same. 1: + .if \element == 3 ld3 { v16.16b, v17.16b, v18.16b }, [x3] + .else + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3] + .endif uaddlp v19.8h, v16.16b // v19: r uaddlp v20.8h, v17.16b // v20: g uaddlp v21.8h, v18.16b // v21: b @@ -147,7 +167,11 @@ endfunc rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 sub w5, w5, #8 // width -= 8 - add x3, x3, #48 // src += 48 + .if \element == 3 + add x3, x3, #48 + .else + add x3, x3, #64 + .endif cmp w5, #8 // width >= 8 ? 
str q16, [x0], #16 // store dst_u str q17, [x1], #16 // store dst_v @@ -155,9 +179,10 @@ endfunc cbz w5, 3f 2: ldrb w2, [x3] // w2: r1 - ldrb w4, [x3, #3] // w4: r2 + ldrb w4, [x3, \element] // w4: r2 add w2, w2, w4 // w2 = r1 + r2 + .if \element == 3 ldrb w4, [x3, #1] // w4: g1 ldrb w7, [x3, #4] // w7: g2 add w4, w4, w7 // w4 = g1 + g2 @@ -165,6 +190,15 @@ endfunc ldrb w7, [x3, #2] // w7: b1 ldrb w8, [x3, #5] // w8: b2 add w7, w7, w8 // w7 = b1 + b2 + .else + ldrb w4, [x3, #1] // w4: g1 + ldrb w7, [x3, #5] // w7: g2 + add w4, w4, w7 // w4 = g1 + g2 + + ldrb w7, [x3, #2] // w7: b1 + ldrb w8, [x3, #6] // w8: b2 + add w7, w7, w8 // w7 = b1 + b2 + .endif smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset smaddl x8, w4, w11, x8 // dst_u += gu * g @@ -177,7 +211,12 @@ endfunc smaddl x8, w7, w15, x8 // dst_v += bv * b asr x8, x8, #10 // dst_v >>= 10 sub w5, w5, #1 - add x3, x3, #6 // src += 6 + ldrb w4, [x3, #1] // w4: g1 + .if \element == 3 + add x3, x3, #6 + .else + add x3, x3, #8 + .endif strh w8, [x1], #2 // store dst_v cbnz w5, 2b 3: @@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1 rgbToUV_half bgr=1 endfunc -.macro rgbToUV bgr +function ff_rgba32ToUV_half_neon, export=1 + rgbToUV_half bgr=0, element=4 +endfunc + +function ff_bgra32ToUV_half_neon, export=1 + rgbToUV_half bgr=1, element=4 +endfunc + +.macro rgbToUV bgr, element=3 cmp w5, #0 // check width > 0 b.le 3f @@ -201,13 +248,17 @@ endfunc b.lt 2f // The following comments assume RGB order. The logic for RGB and BGR is the same. 
1: - rgb_to_yuv_load_rgb x3 + rgb_to_yuv_load_rgb x3, \element rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9 rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9 sub w5, w5, #16 + .if \element == 3 add x3, x3, #48 // src += 48 + .else + add x3, x3, #64 + .endif cmp w5, #16 stp q16, q17, [x0], #32 // store to dst_u stp q18, q19, [x1], #32 // store to dst_v @@ -229,7 +280,7 @@ endfunc smaddl x8, w4, w15, x8 // x8 += bv * b asr w8, w8, #9 // x8 >>= 9 sub w5, w5, #1 // width-- - add x3, x3, #3 // src += 3 + add x3, x3, \element strh w8, [x1], #2 // store to dst_v cbnz w5, 2b 3: @@ -243,3 +294,11 @@ endfunc function ff_bgr24ToUV_neon, export=1 rgbToUV bgr=1 endfunc + +function ff_rgba32ToUV_neon, export=1 + rgbToUV bgr=0, element=4 +endfunc + +function ff_bgra32ToUV_neon, export=1 + rgbToUV bgr=1, element=4 +endfunc diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index ce70dbedcc..8fe9fb11ac 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \ uint32_t *coeffs, void *) NEON_INPUT(bgr24); +NEON_INPUT(bgra32); NEON_INPUT(rgb24); +NEON_INPUT(rgba32); av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) { @@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) else c->chrToYV12 = ff_bgr24ToUV_neon; break; + case AV_PIX_FMT_BGRA: + c->lumToYV12 = ff_bgra32ToY_neon; + if (c->chrSrcHSubSample) + c->chrToYV12 = ff_bgra32ToUV_half_neon; + else + c->chrToYV12 = ff_bgra32ToUV_neon; + break; case AV_PIX_FMT_RGB24: c->lumToYV12 = ff_rgb24ToY_neon; if (c->chrSrcHSubSample) @@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) else c->chrToYV12 = ff_rgb24ToUV_neon; break; + case AV_PIX_FMT_RGBA: + c->lumToYV12 = ff_rgba32ToY_neon; + if 
(c->chrSrcHSubSample) + c->chrToYV12 = ff_rgba32ToUV_half_neon; + else + c->chrToYV12 = ff_rgba32ToUV_neon; + break; default: break; } -- 2.42.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv 2024-06-15 9:57 ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv Zhao Zhili @ 2024-06-18 20:32 ` Martin Storsjö 2024-06-19 7:07 ` Rémi Denis-Courmont 1 sibling, 0 replies; 9+ messages in thread From: Martin Storsjö @ 2024-06-18 20:32 UTC (permalink / raw) To: FFmpeg development discussions and patches; +Cc: Zhao Zhili On Sat, 15 Jun 2024, Zhao Zhili wrote: > From: Zhao Zhili <zhilizhao@tencent.com> > > Test on Apple M1 with kperf > > bgra_to_uv_8_c: 13.4 > bgra_to_uv_8_neon: 37.4 > bgra_to_uv_128_c: 155.9 > bgra_to_uv_128_neon: 91.7 > bgra_to_uv_1080_c: 1173.2 > bgra_to_uv_1080_neon: 822.7 > bgra_to_uv_1920_c: 2078.2 > bgra_to_uv_1920_neon: 1437.7 > bgra_to_uv_half_8_c: 17.9 > bgra_to_uv_half_8_neon: 37.4 > bgra_to_uv_half_128_c: 103.9 > bgra_to_uv_half_128_neon: 73.9 > bgra_to_uv_half_1080_c: 850.2 > bgra_to_uv_half_1080_neon: 484.2 > bgra_to_uv_half_1920_c: 1479.2 > bgra_to_uv_half_1920_neon: 824.2 > bgra_to_y_8_c: 8.2 > bgra_to_y_8_neon: 18.2 > bgra_to_y_128_c: 101.4 > bgra_to_y_128_neon: 74.9 > bgra_to_y_1080_c: 739.4 > bgra_to_y_1080_neon: 613.4 > bgra_to_y_1920_c: 1298.7 > bgra_to_y_1920_neon: 918.7 > --- > libswscale/aarch64/input.S | 81 +++++++++++++++++++++++++++++++----- > libswscale/aarch64/swscale.c | 16 +++++++ > 2 files changed, 86 insertions(+), 11 deletions(-) > > diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S > index 2b956fe5c2..37f1158504 100644 > --- a/libswscale/aarch64/input.S > +++ b/libswscale/aarch64/input.S > @@ -20,8 +20,12 @@ > > #include "libavutil/aarch64/asm.S" > > -.macro rgb_to_yuv_load_rgb src > +.macro rgb_to_yuv_load_rgb src, element=3 > + .if \element == 3 > ld3 { v16.16b, v17.16b, v18.16b }, [\src] > + .else > + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] > + .endif > uxtl v19.8h, v16.8b // v19: r > uxtl v20.8h, v17.8b // v20: g > uxtl v21.8h, v18.8b // v21: b > @@ -43,7 +47,7 @@ > sqshrn2 \dst\().8h, 
\dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift > .endm > > -.macro rgbToY bgr > +.macro rgbToY bgr, element=3 > cmp w4, #0 // check width > 0 > .if \bgr > ldr w12, [x5] // w12: ry > @@ -67,11 +71,15 @@ > dup v2.8h, w12 > b.lt 2f > 1: > - rgb_to_yuv_load_rgb x1 > + rgb_to_yuv_load_rgb x1, \element > rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 > rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 > sub w4, w4, #16 // width -= 16 > + .if \element == 3 > add x1, x1, #48 // src += 48 > + .else > + add x1, x1, #64 > + .endif I guess this also could be just #(16*\element) > cmp w4, #16 // width >= 16 ? > stp q16, q17, [x0], #32 // store to dst > b.ge 1b > @@ -86,7 +94,7 @@ > smaddl x13, w15, w12, x13 // x13 += by * b > asr w13, w13, #9 // x13 >>= 9 > sub w4, w4, #1 // width-- > - add x1, x1, #3 // src += 3 > + add x1, x1, \element Keep the # for the immediate constant here, i.e. #\element. Perhaps it doesn't matter for most assemblers we use, but it's good to stay consistent. > strh w13, [x0], #2 // store to dst > cbnz w4, 2b > 3: > @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1 > rgbToY bgr=1 > endfunc > > +function ff_rgba32ToY_neon, export=1 > + rgbToY bgr=0, element=4 > +endfunc > + > +function ff_bgra32ToY_neon, export=1 > + rgbToY bgr=1, element=4 > +endfunc > + > .macro rgb_load_uv_coeff half, bgr > .if \bgr > ldr w12, [x6, #12] > @@ -130,7 +146,7 @@ endfunc > dup v6.4s, w9 > .endm > > -.macro rgbToUV_half bgr > +.macro rgbToUV_half bgr, element=3 > cmp w5, #0 // check width > 0 > b.le 3f > > @@ -139,7 +155,11 @@ endfunc > b.lt 2f > // The following comments assume RGB order. The logic for RGB and BGR is the same.
> 1: > + .if \element == 3 > ld3 { v16.16b, v17.16b, v18.16b }, [x3] > + .else > + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3] > + .endif > uaddlp v19.8h, v16.16b // v19: r > uaddlp v20.8h, v17.16b // v20: g > uaddlp v21.8h, v18.16b // v21: b > @@ -147,7 +167,11 @@ endfunc > rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 > rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 > sub w5, w5, #8 // width -= 8 > - add x3, x3, #48 // src += 48 > + .if \element == 3 > + add x3, x3, #48 > + .else > + add x3, x3, #64 > + .endif > cmp w5, #8 // width >= 8 ? > str q16, [x0], #16 // store dst_u > str q17, [x1], #16 // store dst_v > @@ -155,9 +179,10 @@ endfunc > cbz w5, 3f > 2: > ldrb w2, [x3] // w2: r1 > - ldrb w4, [x3, #3] // w4: r2 > + ldrb w4, [x3, \element] // w4: r2 Ditto about keeping the # > add w2, w2, w4 // w2 = r1 + r2 > > + .if \element == 3 > ldrb w4, [x3, #1] // w4: g1 > ldrb w7, [x3, #4] // w7: g2 > add w4, w4, w7 // w4 = g1 + g2 > @@ -165,6 +190,15 @@ endfunc > ldrb w7, [x3, #2] // w7: b1 > ldrb w8, [x3, #5] // w8: b2 > add w7, w7, w8 // w7 = b1 + b2 > + .else > + ldrb w4, [x3, #1] // w4: g1 > + ldrb w7, [x3, #5] // w7: g2 > + add w4, w4, w7 // w4 = g1 + g2 > + > + ldrb w7, [x3, #2] // w7: b1 > + ldrb w8, [x3, #6] // w8: b2 > + add w7, w7, w8 // w7 = b1 + b2 > + .endif > > smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset > smaddl x8, w4, w11, x8 // dst_u += gu * g > @@ -177,7 +211,12 @@ endfunc > smaddl x8, w7, w15, x8 // dst_v += bv * b > asr x8, x8, #10 // dst_v >>= 10 > sub w5, w5, #1 > - add x3, x3, #6 // src += 6 > + ldrb w4, [x3, #1] // w4: g1 > + .if \element == 3 > + add x3, x3, #6 > + .else > + add x3, x3, #8 > + .endif And this can be #(2*\element) // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". 
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv 2024-06-15 9:57 ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv Zhao Zhili 2024-06-18 20:32 ` Martin Storsjö @ 2024-06-19 7:07 ` Rémi Denis-Courmont 2024-06-19 9:24 ` Zhao Zhili 1 sibling, 1 reply; 9+ messages in thread From: Rémi Denis-Courmont @ 2024-06-19 7:07 UTC (permalink / raw) To: FFmpeg development discussions and patches Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >From: Zhao Zhili <zhilizhao@tencent.com> > >Test on Apple M1 with kperf > >bgra_to_uv_8_c: 13.4 >bgra_to_uv_8_neon: 37.4 >bgra_to_uv_128_c: 155.9 >bgra_to_uv_128_neon: 91.7 >bgra_to_uv_1080_c: 1173.2 >bgra_to_uv_1080_neon: 822.7 >bgra_to_uv_1920_c: 2078.2 >bgra_to_uv_1920_neon: 1437.7 >bgra_to_uv_half_8_c: 17.9 >bgra_to_uv_half_8_neon: 37.4 >bgra_to_uv_half_128_c: 103.9 >bgra_to_uv_half_128_neon: 73.9 >bgra_to_uv_half_1080_c: 850.2 >bgra_to_uv_half_1080_neon: 484.2 >bgra_to_uv_half_1920_c: 1479.2 >bgra_to_uv_half_1920_neon: 824.2 >bgra_to_y_8_c: 8.2 >bgra_to_y_8_neon: 18.2 >bgra_to_y_128_c: 101.4 >bgra_to_y_128_neon: 74.9 >bgra_to_y_1080_c: 739.4 >bgra_to_y_1080_neon: 613.4 >bgra_to_y_1920_c: 1298.7 >bgra_to_y_1920_neon: 918.7 >--- > libswscale/aarch64/input.S | 81 +++++++++++++++++++++++++++++++----- > libswscale/aarch64/swscale.c | 16 +++++++ > 2 files changed, 86 insertions(+), 11 deletions(-) > >diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >index 2b956fe5c2..37f1158504 100644 >--- a/libswscale/aarch64/input.S >+++ b/libswscale/aarch64/input.S >@@ -20,8 +20,12 @@ > > #include "libavutil/aarch64/asm.S" > >-.macro rgb_to_yuv_load_rgb src >+.macro rgb_to_yuv_load_rgb src, element=3 >+ .if \element == 3 > ld3 { v16.16b, v17.16b, v18.16b }, [\src] >+ .else >+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >+ .endif > uxtl v19.8h, v16.8b // v19: r > uxtl v20.8h, v17.8b // v20: g > uxtl v21.8h, v18.8b // v21: b >@@ -43,7 +47,7 @@ > sqshrn2 
\dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift > .endm > >-.macro rgbToY bgr >+.macro rgbToY bgr, element=3 AFAICT, you don't need a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop. > cmp w4, #0 // check width > 0 > .if \bgr > ldr w12, [x5] // w12: ry >@@ -67,11 +71,15 @@ > dup v2.8h, w12 > b.lt 2f > 1: >- rgb_to_yuv_load_rgb x1 >+ rgb_to_yuv_load_rgb x1, \element > rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 > rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 > sub w4, w4, #16 // width -= 16 >+ .if \element == 3 > add x1, x1, #48 // src += 48 >+ .else >+ add x1, x1, #64 >+ .endif > cmp w4, #16 // width >= 16 ? > stp q16, q17, [x0], #32 // store to dst > b.ge 1b >@@ -86,7 +94,7 @@ > smaddl x13, w15, w12, x13 // x13 += by * b > asr w13, w13, #9 // x13 >>= 9 > sub w4, w4, #1 // width-- >- add x1, x1, #3 // src += 3 >+ add x1, x1, \element > strh w13, [x0], #2 // store to dst > cbnz w4, 2b > 3: >@@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1 > rgbToY bgr=1 > endfunc > >+function ff_rgba32ToY_neon, export=1 >+ rgbToY bgr=0, element=4 >+endfunc >+ >+function ff_bgra32ToY_neon, export=1 >+ rgbToY bgr=1, element=4 >+endfunc >+ > .macro rgb_load_uv_coeff half, bgr > .if \bgr > ldr w12, [x6, #12] >@@ -130,7 +146,7 @@ endfunc > dup v6.4s, w9 > .endm > >-.macro rgbToUV_half bgr >+.macro rgbToUV_half bgr, element=3 > cmp w5, #0 // check width > 0 > b.le 3f > >@@ -139,7 +155,11 @@ endfunc > b.lt 2f > // The following comments assume RGB order. The logic for RGB and BGR is the same.
> 1: >+ .if \element == 3 > ld3 { v16.16b, v17.16b, v18.16b }, [x3] >+ .else >+ ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3] >+ .endif > uaddlp v19.8h, v16.16b // v19: r > uaddlp v20.8h, v17.16b // v20: g > uaddlp v21.8h, v18.16b // v21: b >@@ -147,7 +167,11 @@ endfunc > rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 > rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 > sub w5, w5, #8 // width -= 8 >- add x3, x3, #48 // src += 48 >+ .if \element == 3 >+ add x3, x3, #48 >+ .else >+ add x3, x3, #64 >+ .endif > cmp w5, #8 // width >= 8 ? > str q16, [x0], #16 // store dst_u > str q17, [x1], #16 // store dst_v >@@ -155,9 +179,10 @@ endfunc > cbz w5, 3f > 2: > ldrb w2, [x3] // w2: r1 >- ldrb w4, [x3, #3] // w4: r2 >+ ldrb w4, [x3, \element] // w4: r2 > add w2, w2, w4 // w2 = r1 + r2 > >+ .if \element == 3 > ldrb w4, [x3, #1] // w4: g1 > ldrb w7, [x3, #4] // w7: g2 > add w4, w4, w7 // w4 = g1 + g2 >@@ -165,6 +190,15 @@ endfunc > ldrb w7, [x3, #2] // w7: b1 > ldrb w8, [x3, #5] // w8: b2 > add w7, w7, w8 // w7 = b1 + b2 >+ .else >+ ldrb w4, [x3, #1] // w4: g1 >+ ldrb w7, [x3, #5] // w7: g2 >+ add w4, w4, w7 // w4 = g1 + g2 >+ >+ ldrb w7, [x3, #2] // w7: b1 >+ ldrb w8, [x3, #6] // w8: b2 >+ add w7, w7, w8 // w7 = b1 + b2 >+ .endif > > smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset > smaddl x8, w4, w11, x8 // dst_u += gu * g >@@ -177,7 +211,12 @@ endfunc > smaddl x8, w7, w15, x8 // dst_v += bv * b > asr x8, x8, #10 // dst_v >>= 10 > sub w5, w5, #1 >- add x3, x3, #6 // src += 6 >+ ldrb w4, [x3, #1] // w4: g1 >+ .if \element == 3 >+ add x3, x3, #6 >+ .else >+ add x3, x3, #8 >+ .endif > strh w8, [x1], #2 // store dst_v > cbnz w5, 2b > 3: >@@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1 > rgbToUV_half bgr=1 > endfunc > >-.macro rgbToUV bgr >+function ff_rgba32ToUV_half_neon, export=1 >+ rgbToUV_half bgr=0, element=4 >+endfunc >+ >+function ff_bgra32ToUV_half_neon, export=1 >+ rgbToUV_half bgr=1, element=4 >+endfunc >+ 
>+.macro rgbToUV bgr, element=3 > cmp w5, #0 // check width > 0 > b.le 3f > >@@ -201,13 +248,17 @@ endfunc > b.lt 2f > // The following comments assume RGB order. The logic for RGB and BGR is the same. > 1: >- rgb_to_yuv_load_rgb x3 >+ rgb_to_yuv_load_rgb x3, \element > rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 > rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 > rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9 > rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9 > sub w5, w5, #16 >+ .if \element == 3 > add x3, x3, #48 // src += 48 >+ .else >+ add x3, x3, #64 >+ .endif > cmp w5, #16 > stp q16, q17, [x0], #32 // store to dst_u > stp q18, q19, [x1], #32 // store to dst_v >@@ -229,7 +280,7 @@ endfunc > smaddl x8, w4, w15, x8 // x8 += bv * b > asr w8, w8, #9 // x8 >>= 9 > sub w5, w5, #1 // width-- >- add x3, x3, #3 // src += 3 >+ add x3, x3, \element > strh w8, [x1], #2 // store to dst_v > cbnz w5, 2b > 3: >@@ -243,3 +294,11 @@ endfunc > function ff_bgr24ToUV_neon, export=1 > rgbToUV bgr=1 > endfunc >+ >+function ff_rgba32ToUV_neon, export=1 >+ rgbToUV bgr=0, element=4 >+endfunc >+ >+function ff_bgra32ToUV_neon, export=1 >+ rgbToUV bgr=1, element=4 >+endfunc >diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c >index ce70dbedcc..8fe9fb11ac 100644 >--- a/libswscale/aarch64/swscale.c >+++ b/libswscale/aarch64/swscale.c >@@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \ > uint32_t *coeffs, void *) > > NEON_INPUT(bgr24); >+NEON_INPUT(bgra32); > NEON_INPUT(rgb24); >+NEON_INPUT(rgba32); > > av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) > { >@@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) > else > c->chrToYV12 = ff_bgr24ToUV_neon; > break; >+ case AV_PIX_FMT_BGRA: >+ c->lumToYV12 = ff_bgra32ToY_neon; >+ if (c->chrSrcHSubSample) >+ c->chrToYV12 = ff_bgra32ToUV_half_neon; >+ else >+ c->chrToYV12 = 
ff_bgra32ToUV_neon; >+ break; > case AV_PIX_FMT_RGB24: > c->lumToYV12 = ff_rgb24ToY_neon; > if (c->chrSrcHSubSample) >@@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) > else > c->chrToYV12 = ff_rgb24ToUV_neon; > break; >+ case AV_PIX_FMT_RGBA: >+ c->lumToYV12 = ff_rgba32ToY_neon; >+ if (c->chrSrcHSubSample) >+ c->chrToYV12 = ff_rgba32ToUV_half_neon; >+ else >+ c->chrToYV12 = ff_rgba32ToUV_neon; >+ break; > default: > break; > } _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv 2024-06-19 7:07 ` Rémi Denis-Courmont @ 2024-06-19 9:24 ` Zhao Zhili 2024-06-19 12:05 ` Rémi Denis-Courmont 0 siblings, 1 reply; 9+ messages in thread From: Zhao Zhili @ 2024-06-19 9:24 UTC (permalink / raw) To: FFmpeg development discussions and patches > On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote: > > > > Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >> From: Zhao Zhili <zhilizhao@tencent.com> >> >> Test on Apple M1 with kperf >> >> bgra_to_uv_8_c: 13.4 >> bgra_to_uv_8_neon: 37.4 >> bgra_to_uv_128_c: 155.9 >> bgra_to_uv_128_neon: 91.7 >> bgra_to_uv_1080_c: 1173.2 >> bgra_to_uv_1080_neon: 822.7 >> bgra_to_uv_1920_c: 2078.2 >> bgra_to_uv_1920_neon: 1437.7 >> bgra_to_uv_half_8_c: 17.9 >> bgra_to_uv_half_8_neon: 37.4 >> bgra_to_uv_half_128_c: 103.9 >> bgra_to_uv_half_128_neon: 73.9 >> bgra_to_uv_half_1080_c: 850.2 >> bgra_to_uv_half_1080_neon: 484.2 >> bgra_to_uv_half_1920_c: 1479.2 >> bgra_to_uv_half_1920_neon: 824.2 >> bgra_to_y_8_c: 8.2 >> bgra_to_y_8_neon: 18.2 >> bgra_to_y_128_c: 101.4 >> bgra_to_y_128_neon: 74.9 >> bgra_to_y_1080_c: 739.4 >> bgra_to_y_1080_neon: 613.4 >> bgra_to_y_1920_c: 1298.7 >> bgra_to_y_1920_neon: 918.7 >> --- >> libswscale/aarch64/input.S | 81 +++++++++++++++++++++++++++++++----- >> libswscale/aarch64/swscale.c | 16 +++++++ >> 2 files changed, 86 insertions(+), 11 deletions(-) >> >> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >> index 2b956fe5c2..37f1158504 100644 >> --- a/libswscale/aarch64/input.S >> +++ b/libswscale/aarch64/input.S >> @@ -20,8 +20,12 @@ >> >> #include "libavutil/aarch64/asm.S" >> >> -.macro rgb_to_yuv_load_rgb src >> +.macro rgb_to_yuv_load_rgb src, element=3 >> + .if \element == 3 >> ld3 { v16.16b, v17.16b, v18.16b }, [\src] >> + .else >> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >> + .endif >> uxtl v19.8h, v16.8b // v19: r >> uxtl v20.8h, v17.8b // 
v20: g >> uxtl v21.8h, v18.8b // v21: b >> @@ -43,7 +47,7 @@ >> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >> .endm >> >> -.macro rgbToY bgr >> +.macro rgbToY bgr, element=3 > > AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop. I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following: function ff_bgr24ToUV_half_neon, export=1 ldr w12, [x6, #12] ldr w11, [x6, #16] ldr w10, [x6, #20] ldr w15, [x6, #24] ldr w14, [x6, #28] ldr w13, [x6, #32] rgbToUV_half endfunc > >> cmp w4, #0 // check width > 0 >> .if \bgr >> ldr w12, [x5] // w12: ry >> @@ -67,11 +71,15 @@ >> dup v2.8h, w12 >> b.lt 2f >> 1: >> - rgb_to_yuv_load_rgb x1 >> + rgb_to_yuv_load_rgb x1, \element >> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >> sub w4, w4, #16 // width -= 16 >> + .if \element == 3 >> add x1, x1, #48 // src += 48 >> + .else >> + add x1, x1, #64 >> + .endif >> cmp w4, #16 // width >= 16 ? 
>> stp q16, q17, [x0], #32 // store to dst >> b.ge 1b >> @@ -86,7 +94,7 @@ >> smaddl x13, w15, w12, x13 // x13 += by * b >> asr w13, w13, #9 // x13 >>= 9 >> sub w4, w4, #1 // width-- >> - add x1, x1, #3 // src += 3 >> + add x1, x1, \element >> strh w13, [x0], #2 // store to dst >> cbnz w4, 2b >> 3: >> @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1 >> rgbToY bgr=1 >> endfunc >> >> +function ff_rgba32ToY_neon, export=1 >> + rgbToY bgr=0, element=4 >> +endfunc >> + >> +function ff_bgra32ToY_neon, export=1 >> + rgbToY bgr=1, element=4 >> +endfunc >> + >> .macro rgb_load_uv_coeff half, bgr >> .if \bgr >> ldr w12, [x6, #12] >> @@ -130,7 +146,7 @@ endfunc >> dup v6.4s, w9 >> .endm >> >> -.macro rgbToUV_half bgr >> +.macro rgbToUV_half bgr, element=3 >> cmp w5, #0 // check width > 0 >> b.le 3f >> >> @@ -139,7 +155,11 @@ endfunc >> b.lt 2f >> // The following comments assume RGB order. The logic for RGB and BGR is the same. >> 1: >> + .if \element == 3 >> ld3 { v16.16b, v17.16b, v18.16b }, [x3] >> + .else >> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3] >> + .endif >> uaddlp v19.8h, v16.16b // v19: r >> uaddlp v20.8h, v17.16b // v20: g >> uaddlp v21.8h, v18.16b // v21: b >> @@ -147,7 +167,11 @@ endfunc >> rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 >> rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 >> sub w5, w5, #8 // width -= 8 >> - add x3, x3, #48 // src += 48 >> + .if \element == 3 >> + add x3, x3, #48 >> + .else >> + add x3, x3, #64 >> + .endif >> cmp w5, #8 // width >= 8 ? 
>> str q16, [x0], #16 // store dst_u >> str q17, [x1], #16 // store dst_v >> @@ -155,9 +179,10 @@ endfunc >> cbz w5, 3f >> 2: >> ldrb w2, [x3] // w2: r1 >> - ldrb w4, [x3, #3] // w4: r2 >> + ldrb w4, [x3, \element] // w4: r2 >> add w2, w2, w4 // w2 = r1 + r2 >> >> + .if \element == 3 >> ldrb w4, [x3, #1] // w4: g1 >> ldrb w7, [x3, #4] // w7: g2 >> add w4, w4, w7 // w4 = g1 + g2 >> @@ -165,6 +190,15 @@ endfunc >> ldrb w7, [x3, #2] // w7: b1 >> ldrb w8, [x3, #5] // w8: b2 >> add w7, w7, w8 // w7 = b1 + b2 >> + .else >> + ldrb w4, [x3, #1] // w4: g1 >> + ldrb w7, [x3, #5] // w7: g2 >> + add w4, w4, w7 // w4 = g1 + g2 >> + >> + ldrb w7, [x3, #2] // w7: b1 >> + ldrb w8, [x3, #6] // w8: b2 >> + add w7, w7, w8 // w7 = b1 + b2 >> + .endif >> >> smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset >> smaddl x8, w4, w11, x8 // dst_u += gu * g >> @@ -177,7 +211,12 @@ endfunc >> smaddl x8, w7, w15, x8 // dst_v += bv * b >> asr x8, x8, #10 // dst_v >>= 10 >> sub w5, w5, #1 >> - add x3, x3, #6 // src += 6 >> + ldrb w4, [x3, #1] // w4: g1 >> + .if \element == 3 >> + add x3, x3, #6 >> + .else >> + add x3, x3, #8 >> + .endif >> strh w8, [x1], #2 // store dst_v >> cbnz w5, 2b >> 3: >> @@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1 >> rgbToUV_half bgr=1 >> endfunc >> >> -.macro rgbToUV bgr >> +function ff_rgba32ToUV_half_neon, export=1 >> + rgbToUV_half bgr=0, element=4 >> +endfunc >> + >> +function ff_bgra32ToUV_half_neon, export=1 >> + rgbToUV_half bgr=1, element=4 >> +endfunc >> + >> +.macro rgbToUV bgr, element=3 >> cmp w5, #0 // check width > 0 >> b.le 3f >> >> @@ -201,13 +248,17 @@ endfunc >> b.lt 2f >> // The following comments assume RGB order. The logic for RGB and BGR is the same. 
>> 1: >> - rgb_to_yuv_load_rgb x3 >> + rgb_to_yuv_load_rgb x3, \element >> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >> rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9 >> rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9 >> sub w5, w5, #16 >> + .if \element == 3 >> add x3, x3, #48 // src += 48 >> + .else >> + add x3, x3, #64 >> + .endif >> cmp w5, #16 >> stp q16, q17, [x0], #32 // store to dst_u >> stp q18, q19, [x1], #32 // store to dst_v >> @@ -229,7 +280,7 @@ endfunc >> smaddl x8, w4, w15, x8 // x8 += bv * b >> asr w8, w8, #9 // x8 >>= 9 >> sub w5, w5, #1 // width-- >> - add x3, x3, #3 // src += 3 >> + add x3, x3, \element >> strh w8, [x1], #2 // store to dst_v >> cbnz w5, 2b >> 3: >> @@ -243,3 +294,11 @@ endfunc >> function ff_bgr24ToUV_neon, export=1 >> rgbToUV bgr=1 >> endfunc >> + >> +function ff_rgba32ToUV_neon, export=1 >> + rgbToUV bgr=0, element=4 >> +endfunc >> + >> +function ff_bgra32ToUV_neon, export=1 >> + rgbToUV bgr=1, element=4 >> +endfunc >> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c >> index ce70dbedcc..8fe9fb11ac 100644 >> --- a/libswscale/aarch64/swscale.c >> +++ b/libswscale/aarch64/swscale.c >> @@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \ >> uint32_t *coeffs, void *) >> >> NEON_INPUT(bgr24); >> +NEON_INPUT(bgra32); >> NEON_INPUT(rgb24); >> +NEON_INPUT(rgba32); >> >> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >> { >> @@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >> else >> c->chrToYV12 = ff_bgr24ToUV_neon; >> break; >> + case AV_PIX_FMT_BGRA: >> + c->lumToYV12 = ff_bgra32ToY_neon; >> + if (c->chrSrcHSubSample) >> + c->chrToYV12 = ff_bgra32ToUV_half_neon; >> + else >> + c->chrToYV12 = ff_bgra32ToUV_neon; >> + break; >> case AV_PIX_FMT_RGB24: >> c->lumToYV12 = ff_rgb24ToY_neon; >> if (c->chrSrcHSubSample) >> 
@@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >> else >> c->chrToYV12 = ff_rgb24ToUV_neon; >> break; >> + case AV_PIX_FMT_RGBA: >> + c->lumToYV12 = ff_rgba32ToY_neon; >> + if (c->chrSrcHSubSample) >> + c->chrToYV12 = ff_rgba32ToUV_half_neon; >> + else >> + c->chrToYV12 = ff_rgba32ToUV_neon; >> + break; >> default: >> break; >> } > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv 2024-06-19 9:24 ` Zhao Zhili @ 2024-06-19 12:05 ` Rémi Denis-Courmont 2024-06-19 17:15 ` Zhao Zhili 0 siblings, 1 reply; 9+ messages in thread From: Rémi Denis-Courmont @ 2024-06-19 12:05 UTC (permalink / raw) To: FFmpeg development discussions and patches Le 19 juin 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : > > >> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote: >> >> >> >> Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >>> From: Zhao Zhili <zhilizhao@tencent.com> >>> >>> Test on Apple M1 with kperf >>> >>> bgra_to_uv_8_c: 13.4 >>> bgra_to_uv_8_neon: 37.4 >>> bgra_to_uv_128_c: 155.9 >>> bgra_to_uv_128_neon: 91.7 >>> bgra_to_uv_1080_c: 1173.2 >>> bgra_to_uv_1080_neon: 822.7 >>> bgra_to_uv_1920_c: 2078.2 >>> bgra_to_uv_1920_neon: 1437.7 >>> bgra_to_uv_half_8_c: 17.9 >>> bgra_to_uv_half_8_neon: 37.4 >>> bgra_to_uv_half_128_c: 103.9 >>> bgra_to_uv_half_128_neon: 73.9 >>> bgra_to_uv_half_1080_c: 850.2 >>> bgra_to_uv_half_1080_neon: 484.2 >>> bgra_to_uv_half_1920_c: 1479.2 >>> bgra_to_uv_half_1920_neon: 824.2 >>> bgra_to_y_8_c: 8.2 >>> bgra_to_y_8_neon: 18.2 >>> bgra_to_y_128_c: 101.4 >>> bgra_to_y_128_neon: 74.9 >>> bgra_to_y_1080_c: 739.4 >>> bgra_to_y_1080_neon: 613.4 >>> bgra_to_y_1920_c: 1298.7 >>> bgra_to_y_1920_neon: 918.7 >>> --- >>> libswscale/aarch64/input.S | 81 +++++++++++++++++++++++++++++++----- >>> libswscale/aarch64/swscale.c | 16 +++++++ >>> 2 files changed, 86 insertions(+), 11 deletions(-) >>> >>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >>> index 2b956fe5c2..37f1158504 100644 >>> --- a/libswscale/aarch64/input.S >>> +++ b/libswscale/aarch64/input.S >>> @@ -20,8 +20,12 @@ >>> >>> #include "libavutil/aarch64/asm.S" >>> >>> -.macro rgb_to_yuv_load_rgb src >>> +.macro rgb_to_yuv_load_rgb src, element=3 >>> + .if \element == 3 >>> ld3 { v16.16b, v17.16b, v18.16b }, [\src] 
>>> + .else >>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >>> + .endif >>> uxtl v19.8h, v16.8b // v19: r >>> uxtl v20.8h, v17.8b // v20: g >>> uxtl v21.8h, v18.8b // v21: b >>> @@ -43,7 +47,7 @@ >>> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >>> .endm >>> >>> -.macro rgbToY bgr >>> +.macro rgbToY bgr, element=3 >> >> AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop. > >I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following: > >function ff_bgr24ToUV_half_neon, export=1 > ldr w12, [x6, #12] > ldr w11, [x6, #16] > ldr w10, [x6, #20] > ldr w15, [x6, #24] > ldr w14, [x6, #28] > ldr w13, [x6, #32] > rgbToUV_half >endfunc Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last. By the way, I think you can use LDP instead of LDR. > >> >>> cmp w4, #0 // check width > 0 >>> .if \bgr >>> ldr w12, [x5] // w12: ry >>> @@ -67,11 +71,15 @@ >>> dup v2.8h, w12 >>> b.lt 2f >>> 1: >>> - rgb_to_yuv_load_rgb x1 >>> + rgb_to_yuv_load_rgb x1, \element >>> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >>> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >>> sub w4, w4, #16 // width -= 16 >>> + .if \element == 3 >>> add x1, x1, #48 // src += 48 >>> + .else >>> + add x1, x1, #64 >>> + .endif >>> cmp w4, #16 // width >= 16 ? 
>>> stp q16, q17, [x0], #32 // store to dst >>> b.ge 1b >>> @@ -86,7 +94,7 @@ >>> smaddl x13, w15, w12, x13 // x13 += by * b >>> asr w13, w13, #9 // x13 >>= 9 >>> sub w4, w4, #1 // width-- >>> - add x1, x1, #3 // src += 3 >>> + add x1, x1, \element >>> strh w13, [x0], #2 // store to dst >>> cbnz w4, 2b >>> 3: >>> @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1 >>> rgbToY bgr=1 >>> endfunc >>> >>> +function ff_rgba32ToY_neon, export=1 >>> + rgbToY bgr=0, element=4 >>> +endfunc >>> + >>> +function ff_bgra32ToY_neon, export=1 >>> + rgbToY bgr=1, element=4 >>> +endfunc >>> + >>> .macro rgb_load_uv_coeff half, bgr >>> .if \bgr >>> ldr w12, [x6, #12] >>> @@ -130,7 +146,7 @@ endfunc >>> dup v6.4s, w9 >>> .endm >>> >>> -.macro rgbToUV_half bgr >>> +.macro rgbToUV_half bgr, element=3 >>> cmp w5, #0 // check width > 0 >>> b.le 3f >>> >>> @@ -139,7 +155,11 @@ endfunc >>> b.lt 2f >>> // The following comments assume RGB order. The logic for RGB and BGR is the same. >>> 1: >>> + .if \element == 3 >>> ld3 { v16.16b, v17.16b, v18.16b }, [x3] >>> + .else >>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3] >>> + .endif >>> uaddlp v19.8h, v16.16b // v19: r >>> uaddlp v20.8h, v17.16b // v20: g >>> uaddlp v21.8h, v18.16b // v21: b >>> @@ -147,7 +167,11 @@ endfunc >>> rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 >>> rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 >>> sub w5, w5, #8 // width -= 8 >>> - add x3, x3, #48 // src += 48 >>> + .if \element == 3 >>> + add x3, x3, #48 >>> + .else >>> + add x3, x3, #64 >>> + .endif >>> cmp w5, #8 // width >= 8 ? 
>>> str q16, [x0], #16 // store dst_u >>> str q17, [x1], #16 // store dst_v >>> @@ -155,9 +179,10 @@ endfunc >>> cbz w5, 3f >>> 2: >>> ldrb w2, [x3] // w2: r1 >>> - ldrb w4, [x3, #3] // w4: r2 >>> + ldrb w4, [x3, \element] // w4: r2 >>> add w2, w2, w4 // w2 = r1 + r2 >>> >>> + .if \element == 3 >>> ldrb w4, [x3, #1] // w4: g1 >>> ldrb w7, [x3, #4] // w7: g2 >>> add w4, w4, w7 // w4 = g1 + g2 >>> @@ -165,6 +190,15 @@ endfunc >>> ldrb w7, [x3, #2] // w7: b1 >>> ldrb w8, [x3, #5] // w8: b2 >>> add w7, w7, w8 // w7 = b1 + b2 >>> + .else >>> + ldrb w4, [x3, #1] // w4: g1 >>> + ldrb w7, [x3, #5] // w7: g2 >>> + add w4, w4, w7 // w4 = g1 + g2 >>> + >>> + ldrb w7, [x3, #2] // w7: b1 >>> + ldrb w8, [x3, #6] // w8: b2 >>> + add w7, w7, w8 // w7 = b1 + b2 >>> + .endif >>> >>> smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset >>> smaddl x8, w4, w11, x8 // dst_u += gu * g >>> @@ -177,7 +211,12 @@ endfunc >>> smaddl x8, w7, w15, x8 // dst_v += bv * b >>> asr x8, x8, #10 // dst_v >>= 10 >>> sub w5, w5, #1 >>> - add x3, x3, #6 // src += 6 >>> + ldrb w4, [x3, #1] // w4: g1 >>> + .if \element == 3 >>> + add x3, x3, #6 >>> + .else >>> + add x3, x3, #8 >>> + .endif >>> strh w8, [x1], #2 // store dst_v >>> cbnz w5, 2b >>> 3: >>> @@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1 >>> rgbToUV_half bgr=1 >>> endfunc >>> >>> -.macro rgbToUV bgr >>> +function ff_rgba32ToUV_half_neon, export=1 >>> + rgbToUV_half bgr=0, element=4 >>> +endfunc >>> + >>> +function ff_bgra32ToUV_half_neon, export=1 >>> + rgbToUV_half bgr=1, element=4 >>> +endfunc >>> + >>> +.macro rgbToUV bgr, element=3 >>> cmp w5, #0 // check width > 0 >>> b.le 3f >>> >>> @@ -201,13 +248,17 @@ endfunc >>> b.lt 2f >>> // The following comments assume RGB order. The logic for RGB and BGR is the same. 
>>> 1: >>> - rgb_to_yuv_load_rgb x3 >>> + rgb_to_yuv_load_rgb x3, \element >>> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >>> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >>> rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9 >>> rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9 >>> sub w5, w5, #16 >>> + .if \element == 3 >>> add x3, x3, #48 // src += 48 >>> + .else >>> + add x3, x3, #64 >>> + .endif >>> cmp w5, #16 >>> stp q16, q17, [x0], #32 // store to dst_u >>> stp q18, q19, [x1], #32 // store to dst_v >>> @@ -229,7 +280,7 @@ endfunc >>> smaddl x8, w4, w15, x8 // x8 += bv * b >>> asr w8, w8, #9 // x8 >>= 9 >>> sub w5, w5, #1 // width-- >>> - add x3, x3, #3 // src += 3 >>> + add x3, x3, \element >>> strh w8, [x1], #2 // store to dst_v >>> cbnz w5, 2b >>> 3: >>> @@ -243,3 +294,11 @@ endfunc >>> function ff_bgr24ToUV_neon, export=1 >>> rgbToUV bgr=1 >>> endfunc >>> + >>> +function ff_rgba32ToUV_neon, export=1 >>> + rgbToUV bgr=0, element=4 >>> +endfunc >>> + >>> +function ff_bgra32ToUV_neon, export=1 >>> + rgbToUV bgr=1, element=4 >>> +endfunc >>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c >>> index ce70dbedcc..8fe9fb11ac 100644 >>> --- a/libswscale/aarch64/swscale.c >>> +++ b/libswscale/aarch64/swscale.c >>> @@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \ >>> uint32_t *coeffs, void *) >>> >>> NEON_INPUT(bgr24); >>> +NEON_INPUT(bgra32); >>> NEON_INPUT(rgb24); >>> +NEON_INPUT(rgba32); >>> >>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >>> { >>> @@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >>> else >>> c->chrToYV12 = ff_bgr24ToUV_neon; >>> break; >>> + case AV_PIX_FMT_BGRA: >>> + c->lumToYV12 = ff_bgra32ToY_neon; >>> + if (c->chrSrcHSubSample) >>> + c->chrToYV12 = ff_bgra32ToUV_half_neon; >>> + else >>> + c->chrToYV12 = ff_bgra32ToUV_neon; >>> + break; >>> case AV_PIX_FMT_RGB24: >>> 
c->lumToYV12 = ff_rgb24ToY_neon; >>> if (c->chrSrcHSubSample) >>> @@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >>> else >>> c->chrToYV12 = ff_rgb24ToUV_neon; >>> break; >>> + case AV_PIX_FMT_RGBA: >>> + c->lumToYV12 = ff_rgba32ToY_neon; >>> + if (c->chrSrcHSubSample) >>> + c->chrToYV12 = ff_rgba32ToUV_half_neon; >>> + else >>> + c->chrToYV12 = ff_rgba32ToUV_neon; >>> + break; >>> default: >>> break; >>> } >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > >_______________________________________________ >ffmpeg-devel mailing list >ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv 2024-06-19 12:05 ` Rémi Denis-Courmont @ 2024-06-19 17:15 ` Zhao Zhili 2024-06-20 12:49 ` Martin Storsjö 0 siblings, 1 reply; 9+ messages in thread From: Zhao Zhili @ 2024-06-19 17:15 UTC (permalink / raw) To: FFmpeg development discussions and patches > On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote: > > > > Le 19 juin 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit : >> >> >>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote: >>> >>> >>> >>> Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >>>> From: Zhao Zhili <zhilizhao@tencent.com> >>>> >>>> Test on Apple M1 with kperf >>>> >>>> bgra_to_uv_8_c: 13.4 >>>> bgra_to_uv_8_neon: 37.4 >>>> bgra_to_uv_128_c: 155.9 >>>> bgra_to_uv_128_neon: 91.7 >>>> bgra_to_uv_1080_c: 1173.2 >>>> bgra_to_uv_1080_neon: 822.7 >>>> bgra_to_uv_1920_c: 2078.2 >>>> bgra_to_uv_1920_neon: 1437.7 >>>> bgra_to_uv_half_8_c: 17.9 >>>> bgra_to_uv_half_8_neon: 37.4 >>>> bgra_to_uv_half_128_c: 103.9 >>>> bgra_to_uv_half_128_neon: 73.9 >>>> bgra_to_uv_half_1080_c: 850.2 >>>> bgra_to_uv_half_1080_neon: 484.2 >>>> bgra_to_uv_half_1920_c: 1479.2 >>>> bgra_to_uv_half_1920_neon: 824.2 >>>> bgra_to_y_8_c: 8.2 >>>> bgra_to_y_8_neon: 18.2 >>>> bgra_to_y_128_c: 101.4 >>>> bgra_to_y_128_neon: 74.9 >>>> bgra_to_y_1080_c: 739.4 >>>> bgra_to_y_1080_neon: 613.4 >>>> bgra_to_y_1920_c: 1298.7 >>>> bgra_to_y_1920_neon: 918.7 >>>> --- >>>> libswscale/aarch64/input.S | 81 +++++++++++++++++++++++++++++++----- >>>> libswscale/aarch64/swscale.c | 16 +++++++ >>>> 2 files changed, 86 insertions(+), 11 deletions(-) >>>> >>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >>>> index 2b956fe5c2..37f1158504 100644 >>>> --- a/libswscale/aarch64/input.S >>>> +++ b/libswscale/aarch64/input.S >>>> @@ -20,8 +20,12 @@ >>>> >>>> #include "libavutil/aarch64/asm.S" 
>>>> >>>> -.macro rgb_to_yuv_load_rgb src >>>> +.macro rgb_to_yuv_load_rgb src, element=3 >>>> + .if \element == 3 >>>> ld3 { v16.16b, v17.16b, v18.16b }, [\src] >>>> + .else >>>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >>>> + .endif >>>> uxtl v19.8h, v16.8b // v19: r >>>> uxtl v20.8h, v17.8b // v20: g >>>> uxtl v21.8h, v18.8b // v21: b >>>> @@ -43,7 +47,7 @@ >>>> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >>>> .endm >>>> >>>> -.macro rgbToY bgr >>>> +.macro rgbToY bgr, element=3 >>> >>> AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop. >> >> I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following: >> >> function ff_bgr24ToUV_half_neon, export=1 >> ldr w12, [x6, #12] >> ldr w11, [x6, #16] >> ldr w10, [x6, #20] >> ldr w15, [x6, #24] >> ldr w14, [x6, #28] >> ldr w13, [x6, #32] >> rgbToUV_half >> endfunc > > Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last. > > By the way, I think you can use LDP instead of LDR. Patch v2 replaces LDR with LDP, so the "jump past the loading of red and blue coefficients" suggestion doesn’t apply now. > >> >>> >>>> cmp w4, #0 // check width > 0 >>>> .if \bgr >>>> ldr w12, [x5] // w12: ry >>>> @@ -67,11 +71,15 @@ >>>> dup v2.8h, w12 >>>> b.lt 2f >>>> 1: >>>> - rgb_to_yuv_load_rgb x1 >>>> + rgb_to_yuv_load_rgb x1, \element >>>> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >>>> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >>>> sub w4, w4, #16 // width -= 16 >>>> + .if \element == 3 >>>> add x1, x1, #48 // src += 48 >>>> + .else >>>> + add x1, x1, #64 >>>> + .endif >>>> cmp w4, #16 // width >= 16 ?
>>>> stp q16, q17, [x0], #32 // store to dst >>>> b.ge 1b >>>> @@ -86,7 +94,7 @@ >>>> smaddl x13, w15, w12, x13 // x13 += by * b >>>> asr w13, w13, #9 // x13 >>= 9 >>>> sub w4, w4, #1 // width-- >>>> - add x1, x1, #3 // src += 3 >>>> + add x1, x1, \element >>>> strh w13, [x0], #2 // store to dst >>>> cbnz w4, 2b >>>> 3: >>>> @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1 >>>> rgbToY bgr=1 >>>> endfunc >>>> >>>> +function ff_rgba32ToY_neon, export=1 >>>> + rgbToY bgr=0, element=4 >>>> +endfunc >>>> + >>>> +function ff_bgra32ToY_neon, export=1 >>>> + rgbToY bgr=1, element=4 >>>> +endfunc >>>> + >>>> .macro rgb_load_uv_coeff half, bgr >>>> .if \bgr >>>> ldr w12, [x6, #12] >>>> @@ -130,7 +146,7 @@ endfunc >>>> dup v6.4s, w9 >>>> .endm >>>> >>>> -.macro rgbToUV_half bgr >>>> +.macro rgbToUV_half bgr, element=3 >>>> cmp w5, #0 // check width > 0 >>>> b.le 3f >>>> >>>> @@ -139,7 +155,11 @@ endfunc >>>> b.lt 2f >>>> // The following comments assume RGB order. The logic for RGB and BGR is the same. >>>> 1: >>>> + .if \element == 3 >>>> ld3 { v16.16b, v17.16b, v18.16b }, [x3] >>>> + .else >>>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3] >>>> + .endif >>>> uaddlp v19.8h, v16.16b // v19: r >>>> uaddlp v20.8h, v17.16b // v20: g >>>> uaddlp v21.8h, v18.16b // v21: b >>>> @@ -147,7 +167,11 @@ endfunc >>>> rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10 >>>> rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10 >>>> sub w5, w5, #8 // width -= 8 >>>> - add x3, x3, #48 // src += 48 >>>> + .if \element == 3 >>>> + add x3, x3, #48 >>>> + .else >>>> + add x3, x3, #64 >>>> + .endif >>>> cmp w5, #8 // width >= 8 ? 
>>>> str q16, [x0], #16 // store dst_u >>>> str q17, [x1], #16 // store dst_v >>>> @@ -155,9 +179,10 @@ endfunc >>>> cbz w5, 3f >>>> 2: >>>> ldrb w2, [x3] // w2: r1 >>>> - ldrb w4, [x3, #3] // w4: r2 >>>> + ldrb w4, [x3, \element] // w4: r2 >>>> add w2, w2, w4 // w2 = r1 + r2 >>>> >>>> + .if \element == 3 >>>> ldrb w4, [x3, #1] // w4: g1 >>>> ldrb w7, [x3, #4] // w7: g2 >>>> add w4, w4, w7 // w4 = g1 + g2 >>>> @@ -165,6 +190,15 @@ endfunc >>>> ldrb w7, [x3, #2] // w7: b1 >>>> ldrb w8, [x3, #5] // w8: b2 >>>> add w7, w7, w8 // w7 = b1 + b2 >>>> + .else >>>> + ldrb w4, [x3, #1] // w4: g1 >>>> + ldrb w7, [x3, #5] // w7: g2 >>>> + add w4, w4, w7 // w4 = g1 + g2 >>>> + >>>> + ldrb w7, [x3, #2] // w7: b1 >>>> + ldrb w8, [x3, #6] // w8: b2 >>>> + add w7, w7, w8 // w7 = b1 + b2 >>>> + .endif >>>> >>>> smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset >>>> smaddl x8, w4, w11, x8 // dst_u += gu * g >>>> @@ -177,7 +211,12 @@ endfunc >>>> smaddl x8, w7, w15, x8 // dst_v += bv * b >>>> asr x8, x8, #10 // dst_v >>= 10 >>>> sub w5, w5, #1 >>>> - add x3, x3, #6 // src += 6 >>>> + ldrb w4, [x3, #1] // w4: g1 >>>> + .if \element == 3 >>>> + add x3, x3, #6 >>>> + .else >>>> + add x3, x3, #8 >>>> + .endif >>>> strh w8, [x1], #2 // store dst_v >>>> cbnz w5, 2b >>>> 3: >>>> @@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1 >>>> rgbToUV_half bgr=1 >>>> endfunc >>>> >>>> -.macro rgbToUV bgr >>>> +function ff_rgba32ToUV_half_neon, export=1 >>>> + rgbToUV_half bgr=0, element=4 >>>> +endfunc >>>> + >>>> +function ff_bgra32ToUV_half_neon, export=1 >>>> + rgbToUV_half bgr=1, element=4 >>>> +endfunc >>>> + >>>> +.macro rgbToUV bgr, element=3 >>>> cmp w5, #0 // check width > 0 >>>> b.le 3f >>>> >>>> @@ -201,13 +248,17 @@ endfunc >>>> b.lt 2f >>>> // The following comments assume RGB order. The logic for RGB and BGR is the same. 
>>>> 1: >>>> - rgb_to_yuv_load_rgb x3 >>>> + rgb_to_yuv_load_rgb x3, \element >>>> rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9 >>>> rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9 >>>> rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9 >>>> rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9 >>>> sub w5, w5, #16 >>>> + .if \element == 3 >>>> add x3, x3, #48 // src += 48 >>>> + .else >>>> + add x3, x3, #64 >>>> + .endif >>>> cmp w5, #16 >>>> stp q16, q17, [x0], #32 // store to dst_u >>>> stp q18, q19, [x1], #32 // store to dst_v >>>> @@ -229,7 +280,7 @@ endfunc >>>> smaddl x8, w4, w15, x8 // x8 += bv * b >>>> asr w8, w8, #9 // x8 >>= 9 >>>> sub w5, w5, #1 // width-- >>>> - add x3, x3, #3 // src += 3 >>>> + add x3, x3, \element >>>> strh w8, [x1], #2 // store to dst_v >>>> cbnz w5, 2b >>>> 3: >>>> @@ -243,3 +294,11 @@ endfunc >>>> function ff_bgr24ToUV_neon, export=1 >>>> rgbToUV bgr=1 >>>> endfunc >>>> + >>>> +function ff_rgba32ToUV_neon, export=1 >>>> + rgbToUV bgr=0, element=4 >>>> +endfunc >>>> + >>>> +function ff_bgra32ToUV_neon, export=1 >>>> + rgbToUV bgr=1, element=4 >>>> +endfunc >>>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c >>>> index ce70dbedcc..8fe9fb11ac 100644 >>>> --- a/libswscale/aarch64/swscale.c >>>> +++ b/libswscale/aarch64/swscale.c >>>> @@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \ >>>> uint32_t *coeffs, void *) >>>> >>>> NEON_INPUT(bgr24); >>>> +NEON_INPUT(bgra32); >>>> NEON_INPUT(rgb24); >>>> +NEON_INPUT(rgba32); >>>> >>>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >>>> { >>>> @@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >>>> else >>>> c->chrToYV12 = ff_bgr24ToUV_neon; >>>> break; >>>> + case AV_PIX_FMT_BGRA: >>>> + c->lumToYV12 = ff_bgra32ToY_neon; >>>> + if (c->chrSrcHSubSample) >>>> + c->chrToYV12 = ff_bgra32ToUV_half_neon; >>>> + else >>>> + c->chrToYV12 = 
ff_bgra32ToUV_neon; >>>> + break; >>>> case AV_PIX_FMT_RGB24: >>>> c->lumToYV12 = ff_rgb24ToY_neon; >>>> if (c->chrSrcHSubSample) >>>> @@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) >>>> else >>>> c->chrToYV12 = ff_rgb24ToUV_neon; >>>> break; >>>> + case AV_PIX_FMT_RGBA: >>>> + c->lumToYV12 = ff_rgba32ToY_neon; >>>> + if (c->chrSrcHSubSample) >>>> + c->chrToYV12 = ff_rgba32ToUV_half_neon; >>>> + else >>>> + c->chrToYV12 = ff_rgba32ToUV_neon; >>>> + break; >>>> default: >>>> break; >>>> } >>> _______________________________________________ >>> ffmpeg-devel mailing list >>> ffmpeg-devel@ffmpeg.org >>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >>> >>> To unsubscribe, visit link above, or email >>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >> >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv 2024-06-19 17:15 ` Zhao Zhili @ 2024-06-20 12:49 ` Martin Storsjö 2024-06-20 16:02 ` Zhao Zhili 0 siblings, 1 reply; 9+ messages in thread From: Martin Storsjö @ 2024-06-20 12:49 UTC (permalink / raw) To: FFmpeg development discussions and patches On Thu, 20 Jun 2024, Zhao Zhili wrote: >> On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote: >> >> Le 19 juin 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit : >>> >>>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote: >>>> >>>> >>>> >>>> Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >>>>> >>>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >>>>> index 2b956fe5c2..37f1158504 100644 >>>>> --- a/libswscale/aarch64/input.S >>>>> +++ b/libswscale/aarch64/input.S >>>>> @@ -20,8 +20,12 @@ >>>>> >>>>> #include "libavutil/aarch64/asm.S" >>>>> >>>>> -.macro rgb_to_yuv_load_rgb src >>>>> +.macro rgb_to_yuv_load_rgb src, element=3 >>>>> + .if \element == 3 >>>>> ld3 { v16.16b, v17.16b, v18.16b }, [\src] >>>>> + .else >>>>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >>>>> + .endif >>>>> uxtl v19.8h, v16.8b // v19: r >>>>> uxtl v20.8h, v17.8b // v20: g >>>>> uxtl v21.8h, v18.8b // v21: b >>>>> @@ -43,7 +47,7 @@ >>>>> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >>>>> .endm >>>>> >>>>> -.macro rgbToY bgr >>>>> +.macro rgbToY bgr, element=3 >>>> >>>> AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop. >>> >>> I’m not sure where to add the branch. Could you elaborate? 
Do you mean load coefficients first like the following: >>> >>> function ff_bgr24ToUV_half_neon, export=1 >>> ldr w12, [x6, #12] >>> ldr w11, [x6, #16] >>> ldr w10, [x6, #20] >>> ldr w15, [x6, #24] >>> ldr w14, [x6, #28] >>> ldr w13, [x6, #32] >>> rgbToUV_half >>> endfunc >> >> Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last. >> >> By the way, I think you can use LDP instead of LDR. > > Patch v2 replace LDR by LDP, then the "jump past the loading of red and blue coefficients” doesn’t apply now. Rémi's point is that you don't need to duplicate the whole function, when the only thing you're changing is a couple of instructions in the prologue of the function. By reusing the actual bulk of the function, you save on binary size. One way of doing it looks like this: diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S index 1840f9fb01..eb870e4dca 100644 --- a/libavutil/aarch64/asm.S +++ b/libavutil/aarch64/asm.S @@ -256,5 +256,11 @@ ELF .size \name, . 
- \name #define JOIN(a, b) GLUE(a, b) #define X(s) JOIN(EXTERN_ASM, s) +#ifdef __APPLE__ +#define L(x) L ## x +#else +#define L(x) .L ## x +#endif + #define x18 do_not_use_x18 #define w18 do_not_use_w18 diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S index 33afa34111..ce10d584c6 100644 --- a/libswscale/aarch64/input.S +++ b/libswscale/aarch64/input.S @@ -49,6 +49,7 @@ function ff_rgb24ToY_neon, export=1 ldr w12, [x5, #8] // w12: by b.le 3f +L(rgb24ToY_internal): mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) dup v6.4s, w9 // w9: const_offset @@ -85,6 +86,14 @@ function ff_rgb24ToY_neon, export=1 ret endfunc +function ff_bgr24ToY_neon, export=1 + cmp w4, #0 // check width > 0 + ldp w12, w11, [x5] // w12: ry, w11: gy + ldr w10, [x5, #8] // w10: by + b.gt L(rgb24ToY_internal) + ret +endfunc + .macro rgb24_load_uv_coeff half ldp w10, w11, [x6, #12] // w10: ru, w11: gu ldp w12, w13, [x6, #20] // w12: bu, w13: rv Another way looks like this: diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S index 33afa34111..5c4b7a41fd 100644 --- a/libswscale/aarch64/input.S +++ b/libswscale/aarch64/input.S @@ -43,12 +43,23 @@ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift .endm +function ff_bgr24ToY_neon, export=1 + cmp w4, #0 // check width > 0 + ldp w12, w11, [x5] // w12: ry, w11: gy + ldr w10, [x5, #8] // w10: by + b.gt rgb24ToY_internal + ret +endfunc + function ff_rgb24ToY_neon, export=1 cmp w4, #0 // check width > 0 ldp w10, w11, [x5] // w10: ry, w11: gy ldr w12, [x5, #8] // w12: by - b.le 3f + b.gt rgb24ToY_internal + ret +endfunc +function rgb24ToY_internal mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) dup v6.4s, w9 // w9: const_offset Or if you want to be really adventurous, you can make a fallthrough: diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S index 
33afa34111..025a965b76 100644 --- a/libswscale/aarch64/input.S +++ b/libswscale/aarch64/input.S @@ -43,12 +43,22 @@ sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift .endm +function ff_bgr24ToY_neon, export=1 + cmp w4, #0 // check width > 0 + ldp w12, w11, [x5] // w12: ry, w11: gy + ldr w10, [x5, #8] // w10: by + b.gt rgb24ToY_internal + ret +endfunc + function ff_rgb24ToY_neon, export=1 cmp w4, #0 // check width > 0 ldp w10, w11, [x5] // w10: ry, w11: gy ldr w12, [x5, #8] // w12: by b.le 3f +endfunc +function rgb24ToY_internal mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) dup v6.4s, w9 // w9: const_offset // Martin _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv 2024-06-20 12:49 ` Martin Storsjö @ 2024-06-20 16:02 ` Zhao Zhili 2024-06-20 16:25 ` Rémi Denis-Courmont 0 siblings, 1 reply; 9+ messages in thread From: Zhao Zhili @ 2024-06-20 16:02 UTC (permalink / raw) To: FFmpeg development discussions and patches > On Jun 20, 2024, at 20:49, Martin Storsjö <martin@martin.st> wrote: > > On Thu, 20 Jun 2024, Zhao Zhili wrote: > >>> On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote: >>> Le 19 juin 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit : >>>>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote: >>>>> Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >>>>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >>>>>> index 2b956fe5c2..37f1158504 100644 >>>>>> --- a/libswscale/aarch64/input.S >>>>>> +++ b/libswscale/aarch64/input.S >>>>>> @@ -20,8 +20,12 @@ >>>>>> #include "libavutil/aarch64/asm.S" >>>>>> -.macro rgb_to_yuv_load_rgb src >>>>>> +.macro rgb_to_yuv_load_rgb src, element=3 >>>>>> + .if \element == 3 >>>>>> ld3 { v16.16b, v17.16b, v18.16b }, [\src] >>>>>> + .else >>>>>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >>>>>> + .endif >>>>>> uxtl v19.8h, v16.8b // v19: r >>>>>> uxtl v20.8h, v17.8b // v20: g >>>>>> uxtl v21.8h, v18.8b // v21: b >>>>>> @@ -43,7 +47,7 @@ >>>>>> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >>>>>> .endm >>>>>> -.macro rgbToY bgr >>>>>> +.macro rgbToY bgr, element=3 >>>>> AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop. >>>> I’m not sure where to add the branch. Could you elaborate? 
Do you mean load coefficients first like the following: >>>> function ff_bgr24ToUV_half_neon, export=1 >>>> ldr w12, [x6, #12] >>>> ldr w11, [x6, #16] >>>> ldr w10, [x6, #20] >>>> ldr w15, [x6, #24] >>>> ldr w14, [x6, #28] >>>> ldr w13, [x6, #32] >>>> rgbToUV_half >>>> endfunc >>> Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last. >>> By the way, I think you can use LDP instead of LDR. >> >> Patch v2 replace LDR by LDP, then the "jump past the loading of red and blue coefficients” doesn’t apply now. > > Rémi's point is that you don't need to duplicate the whole function, when the only thing you're changing is a couple of instructions in the prologue of the function. By reusing the actual bulk of the function, you save on binary size. Thank you for the detailed examples. I missed that the key point here is to save binary size. I have seen a similar example of fallthrough in riscv/input_rvv.S. Is it well defined to jump to a local label in another function? > > One way of doing it looks like this: > > diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S > index 1840f9fb01..eb870e4dca 100644 > --- a/libavutil/aarch64/asm.S > +++ b/libavutil/aarch64/asm.S > @@ -256,5 +256,11 @@ ELF .size \name, .
- \name > #define JOIN(a, b) GLUE(a, b) > #define X(s) JOIN(EXTERN_ASM, s) > > +#ifdef __APPLE__ > +#define L(x) L ## x > +#else > +#define L(x) .L ## x > +#endif > + > #define x18 do_not_use_x18 > #define w18 do_not_use_w18 > diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S > index 33afa34111..ce10d584c6 100644 > --- a/libswscale/aarch64/input.S > +++ b/libswscale/aarch64/input.S > @@ -49,6 +49,7 @@ function ff_rgb24ToY_neon, export=1 > ldr w12, [x5, #8] // w12: by > b.le 3f > > +L(rgb24ToY_internal): > mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) > movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) > dup v6.4s, w9 // w9: const_offset > @@ -85,6 +86,14 @@ function ff_rgb24ToY_neon, export=1 > ret > endfunc > > +function ff_bgr24ToY_neon, export=1 > + cmp w4, #0 // check width > 0 > + ldp w12, w11, [x5] // w12: ry, w11: gy > + ldr w10, [x5, #8] // w10: by > + b.gt L(rgb24ToY_internal) > + ret > +endfunc > + > .macro rgb24_load_uv_coeff half > ldp w10, w11, [x6, #12] // w10: ru, w11: gu > ldp w12, w13, [x6, #20] // w12: bu, w13: rv > > > Another way looks like this: > > diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S > index 33afa34111..5c4b7a41fd 100644 > --- a/libswscale/aarch64/input.S > +++ b/libswscale/aarch64/input.S > @@ -43,12 +43,23 @@ > sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift > .endm > > +function ff_bgr24ToY_neon, export=1 > + cmp w4, #0 // check width > 0 > + ldp w12, w11, [x5] // w12: ry, w11: gy > + ldr w10, [x5, #8] // w10: by > + b.gt rgb24ToY_internal > + ret > +endfunc > + > function ff_rgb24ToY_neon, export=1 > cmp w4, #0 // check width > 0 > ldp w10, w11, [x5] // w10: ry, w11: gy > ldr w12, [x5, #8] // w12: by > - b.le 3f > + b.gt rgb24ToY_internal > + ret > +endfunc > > +function rgb24ToY_internal > mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) > movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) > dup v6.4s, w9 // w9: const_offset > > > Or if you 
want to be really adventurous, you can make a fallthrough: > > > diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S > index 33afa34111..025a965b76 100644 > --- a/libswscale/aarch64/input.S > +++ b/libswscale/aarch64/input.S > @@ -43,12 +43,22 @@ > sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift > .endm > > +function ff_bgr24ToY_neon, export=1 > + cmp w4, #0 // check width > 0 > + ldp w12, w11, [x5] // w12: ry, w11: gy > + ldr w10, [x5, #8] // w10: by > + b.gt rgb24ToY_internal > + ret > +endfunc > + > function ff_rgb24ToY_neon, export=1 > cmp w4, #0 // check width > 0 > ldp w10, w11, [x5] // w10: ry, w11: gy > ldr w12, [x5, #8] // w12: by > b.le 3f > +endfunc > > +function rgb24ToY_internal > mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7) > movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1) > dup v6.4s, w9 // w9: const_offset > > > // Martin > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv 2024-06-20 16:02 ` Zhao Zhili @ 2024-06-20 16:25 ` Rémi Denis-Courmont 0 siblings, 0 replies; 9+ messages in thread From: Rémi Denis-Courmont @ 2024-06-20 16:25 UTC (permalink / raw) To: FFmpeg development discussions and patches Le 20 juin 2024 18:02:31 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : > > >> On Jun 20, 2024, at 20:49, Martin Storsjö <martin@martin.st> wrote: >> >> On Thu, 20 Jun 2024, Zhao Zhili wrote: >> >>>> On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote: >>>> Le 19 juin 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com <mailto:quinkblack@foxmail.com>> a écrit : >>>>>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote: >>>>>> Le 15 juin 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> a écrit : >>>>>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S >>>>>>> index 2b956fe5c2..37f1158504 100644 >>>>>>> --- a/libswscale/aarch64/input.S >>>>>>> +++ b/libswscale/aarch64/input.S >>>>>>> @@ -20,8 +20,12 @@ >>>>>>> #include "libavutil/aarch64/asm.S" >>>>>>> -.macro rgb_to_yuv_load_rgb src >>>>>>> +.macro rgb_to_yuv_load_rgb src, element=3 >>>>>>> + .if \element == 3 >>>>>>> ld3 { v16.16b, v17.16b, v18.16b }, [\src] >>>>>>> + .else >>>>>>> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [\src] >>>>>>> + .endif >>>>>>> uxtl v19.8h, v16.8b // v19: r >>>>>>> uxtl v20.8h, v17.8b // v20: g >>>>>>> uxtl v21.8h, v18.8b // v21: b >>>>>>> @@ -43,7 +47,7 @@ >>>>>>> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift // dst_higher_half = dst2 >> right_shift >>>>>>> .endm >>>>>>> -.macro rgbToY bgr >>>>>>> +.macro rgbToY bgr, element=3 >>>>>> AFAICT, you don't need a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr.
This adds one branch in the prologue but that's mostly negligible compared to the loop. >>>>> I’m not sure where to add the branch. Could you elaborate? Do you mean loading the coefficients first, like the following: >>>>> function ff_bgr24ToUV_half_neon, export=1 >>>>> ldr w12, [x6, #12] >>>>> ldr w11, [x6, #16] >>>>> ldr w10, [x6, #20] >>>>> ldr w15, [x6, #24] >>>>> ldr w14, [x6, #28] >>>>> ldr w13, [x6, #32] >>>>> rgbToUV_half >>>>> endfunc >>>> Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last. >>>> By the way, I think you can use LDP instead of LDR. >>> >>> Patch v2 replaces LDR with LDP, so the “jump past the loading of red and blue coefficients” suggestion no longer applies. >> >> Rémi's point is that you don't need to duplicate the whole function, when the only thing you're changing is a couple of instructions in the prologue of the function. By reusing the actual bulk of the function, you save on binary size. > >Thank you for the detailed examples. I missed that the key point here is to save binary size. > >I have seen a similar example of fall-through in riscv/input_rvv.S. Is it well defined to jump to a local label in another function? Falling through is well defined so long as we don't use function-sections. Jumping to a label inside another function is well defined, as the assembler has no notion of what a function is. `func` and `endfunc` are just FFmpeg macros for defining symbols. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". ^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2024-06-20 16:26 UTC | newest] Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <20240615095718.37319-1-quinkblack@foxmail.com> 2024-06-15 9:57 ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add bgra/rgba to yuv Zhao Zhili 2024-06-18 20:32 ` Martin Storsjö 2024-06-19 7:07 ` Rémi Denis-Courmont 2024-06-19 9:24 ` Zhao Zhili 2024-06-19 12:05 ` Rémi Denis-Courmont 2024-06-19 17:15 ` Zhao Zhili 2024-06-20 12:49 ` Martin Storsjö 2024-06-20 16:02 ` Zhao Zhili 2024-06-20 16:25 ` Rémi Denis-Courmont
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git