Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add rgb24 to yuv implementation
       [not found] <20240603071732.52523-1-quinkblack@foxmail.com>
@ 2024-06-03  7:17 ` Zhao Zhili
  2024-06-03  8:07   ` Martin Storsjö
  0 siblings, 1 reply; 6+ messages in thread
From: Zhao Zhili @ 2024-06-03  7:17 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

Test on Apple M1:

rgb24_to_uv_1080_c: 7.0
rgb24_to_uv_1080_neon: 5.5
rgb24_to_uv_1280_c: 8.2
rgb24_to_uv_1280_neon: 6.0
rgb24_to_uv_4096_c: 26.2
rgb24_to_uv_4096_neon: 20.7

rgb24_to_uv_half_540_c: 6.5
rgb24_to_uv_half_540_neon: 3.0
rgb24_to_uv_half_640_c: 8.0
rgb24_to_uv_half_640_neon: 3.2
rgb24_to_uv_half_2048_c: 24.0
rgb24_to_uv_half_2048_neon: 11.2

rgb24_to_y_1080_c: 4.5
rgb24_to_y_1080_neon: 3.2
rgb24_to_y_1280_c: 5.5
rgb24_to_y_1280_neon: 3.7
rgb24_to_y_4096_c: 17.7
rgb24_to_y_4096_neon: 12.2

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
---
 libswscale/aarch64/Makefile  |   1 +
 libswscale/aarch64/input.S   | 229 +++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  21 ++++
 3 files changed, 251 insertions(+)
 create mode 100644 libswscale/aarch64/input.S

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index da1d909561..adfd90a1b6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -3,6 +3,7 @@ OBJS        += aarch64/rgb2rgb.o                \
                aarch64/swscale_unscaled.o       \
 
 NEON-OBJS   += aarch64/hscale.o                 \
+               aarch64/input.o                  \
                aarch64/output.o                 \
                aarch64/rgb2rgb_neon.o           \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
new file mode 100644
index 0000000000..0a46475723
--- /dev/null
+++ b/libswscale/aarch64/input.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro rgb24_to_yuv_load_rgb, src
+        ld3.16b         { v16, v17, v18 }, [\src]
+        ushll.8h        v19, v16, #0         // v19: r
+        ushll.8h        v20, v17, #0         // v20: g
+        ushll.8h        v21, v18, #0         // v21: b
+        ushll2.8h       v22, v16, #0         // v22: r
+        ushll2.8h       v23, v17, #0         // v23: g
+        ushll2.8h       v24, v18, #0         // v24: b
+.endm
+
+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
+        mov.4s          \dst1, v6                   // dst1 = const_offset
+        mov.4s          \dst2, v6                   // dst2 = const_offset
+        smlal.4s        \dst1, \coef0, \r           // dst1 += rx * r
+        smlal2.4s       \dst2, \coef0, \r           // dst2 += rx * r
+        smlal.4s        \dst1, \coef1, \g           // dst1 += gx * g
+        smlal2.4s       \dst2, \coef1, \g           // dst2 += gx * g
+        smlal.4s        \dst1, \coef2, \b           // dst1 += bx * b
+        smlal2.4s       \dst2, \coef2, \b           // dst2 += bx * b
+        sqshrn.4h       \dst, \dst1, \right_shift   // dst_lower_half = dst1 >> right_shift
+        sqshrn2.8h      \dst, \dst2, \right_shift   // dst_higher_half = dst2 >> right_shift
+.endm
+
+function ff_rgb24ToY_neon, export=1
+        cmp             w4, #0                  // check width > 0
+        b.le            4f
+
+        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy
+        dup             v0.8H, w10
+        dup             v1.8H, w11
+        ldr             w12, [x5]               // w12: by
+        dup             v2.8H, w12
+
+        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
+        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
+        dup.4s          v6, w9                  // w9: const_offset
+
+        mov             x2, #0                  // w2: i
+        and             w3, w4, #0xFFFFFFF0     // w3 = width / 16 * 16
+        cbz             w3, 3f
+1:
+        rgb24_to_yuv_load_rgb x1
+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
+        stp             q16, q17, [x0], #32     // store to dst
+
+        add             w2, w2, #16             // i += 16
+        add             x1, x1, #48             // src += 48
+        cmp             w2, w3                  // i < (width / 16 * 16)
+        b.lt            1b
+        b               3f
+2:
+        ldrb            w13, [x1]               // w13: r
+        ldrb            w14, [x1, #1]           // w14: g
+        ldrb            w15, [x1, #2]           // w15: b
+
+        smaddl          x13, w13, w10, x9       // x13 = ry * r + const_offset
+        smaddl          x13, w14, w11, x13      // x13 += gy * g
+        smaddl          x13, w15, w12, x13      // x13 += by * b
+        asr             w13, w13, #9            // x13 >>= 9
+        strh            w13, [x0], #2           // store to dst
+
+        add             w2, w2, #1              // i++
+        add             x1, x1, #3              // src += 3
+3:
+        cmp             w2, w4                  // i < width
+        b.lt            2b
+4:
+        ret
+endfunc
+
+.macro rgb24_load_uv_coeff half
+        add             x6, x6, #12
+
+        ldp             w10, w11, [x6], #8      // w10: ru, w11: gu
+        dup             v0.8H, w10
+        dup             v1.8H, w11
+
+        ldp             w12, w13, [x6], #8      // w12: bu, w13: rv
+        dup             v2.8H, w12
+        dup             v3.8H, w13
+
+        ldp             w14, w15, [x6], #8      // w14: gv, w15: bv
+        dup             v4.8H, w14
+        dup             v5.8H, w15
+
+    .if \half
+        mov             w9, #512
+        movk            w9, #128, lsl #16       // w9: const_offset
+    .else
+        mov             w9, #256
+        movk            w9, #64, lsl #16        // w9: const_offset
+    .endif
+        dup             v6.4S, w9
+.endm
+
+function ff_rgb24ToUV_half_neon, export=1
+        cmp             w5, #0          // check width > 0
+        b.le            4f
+
+        rgb24_load_uv_coeff half=1
+
+        mov             x9, #0                  // x9: i
+        and             w7, w5, #0xFFFFFFF8     // w7 = width / 8 * 8
+        cbz             w7, 3f
+1:
+        ld3.16b         { v16, v17, v18 }, [x3]
+        uaddlp          v19.8H, v16.16B         // v19: r
+        uaddlp          v20.8H, v17.16B         // v20: g
+        uaddlp          v21.8H, v18.16B         // v21: b
+
+        rgb24_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
+        str             q16, [x0], #16          // store dst_u
+        rgb24_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
+        str             q17, [x1], #16          // store dst_v
+
+        add             w9, w9, #8              // i += 8
+        add             x3, x3, #48             // src += 48
+        cmp             w9, w7                  // i < (width * 8 / 8)
+        b.lt            1b
+        b               3f
+2:
+        ldrb            w2, [x3]                // w2: r1
+        ldrb            w4, [x3, #3]            // w4: r2
+        add             w2, w2, w4              // w2 = r1 + r2
+
+        ldrb            w4, [x3, #1]            // w4: g1
+        ldrb            w7, [x3, #4]            // w7: g2
+        add             w4, w4, w7              // w4 = g1 + g2
+
+        ldrb            w7, [x3, #2]            // w7: b1
+        ldrb            w8, [x3, #5]            // w8: b2
+        add             w7, w7, w8              // w7 = b1 + b2
+
+        umov            w8, v6.S[0]             // dst_u = const_offset
+        smaddl          x8, w2, w10, x8         // dst_u += ru * r
+        smaddl          x8, w4, w11, x8         // dst_u += gu * g
+        smaddl          x8, w7, w12, x8         // dst_u += bu * b
+        asr             x8, x8, #10             // dst_u >>= 10
+        strh            w8, [x0], #2            // store dst_u
+
+        umov            w8, v6.S[0]             // dst_v = const_offset
+        smaddl          x8, w2, w13, x8         // dst_v += rv * r
+        smaddl          x8, w4, w14, x8         // dst_v += gv * g
+        smaddl          x8, w7, w15, x8         // dst_v += bv * b
+        asr             x8, x8, #10             // dst_v >>= 10
+        strh            w8, [x1], #2            // store dst_v
+
+        add             w9, w9, #1              // i++
+        add             x3, x3, #6              // src += 6
+3:
+        cmp		w9, w5
+        b.lt		2b
+4:
+        ret
+endfunc
+
+function ff_rgb24ToUV_neon, export=1
+        cmp             w5, #0                  // check width > 0
+        b.le            4f
+
+        rgb24_load_uv_coeff half=0
+
+        mov             x2, #0                  // w2: i
+        and             w4, w5, #0xFFFFFFF0     // w4: width / 16 * 16
+        cbz             w4, 3f
+1:
+        rgb24_to_yuv_load_rgb x3
+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
+        stp             q16, q17, [x0], #32      // store to dst_u
+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v3, v4, v5, #9
+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v3, v4, v5, #9
+        stp             q16, q17, [x1], #32      // store to dst_v
+
+        add             w2, w2, #16             // i += 16
+        add             x3, x3, #48             // src += 48
+        cmp             w2, w4                  // i < (width / 16 * 16)
+        b.lt            1b
+        b               3f
+2:
+        ldrb            w16, [x3]               // w16: r
+        ldrb            w17, [x3, #1]           // w17: g
+        ldrb            w4, [x3, #2]            // w4: b
+
+        umov            w7, v6.S[0]            // w7 = const_offset
+
+        smaddl          x8, w16, w10, x7        // x8 = ru * r + const_offset
+        smaddl          x8, w17, w11, x8        // x8 += gu * g
+        smaddl          x8, w4, w12, x8         // x8 += bu * b
+        asr             w8, w8, #9              // x8 >>= 9
+        strh            w8, [x0], #2            // store to dst_u
+
+        smaddl          x8, w16, w13, x7        // x8 = rv * r + const_offset
+        smaddl          x8, w17, w14, x8        // x8 += gv * g
+        smaddl          x8, w4, w15, x8         // x8 += bv * b
+        asr             w8, w8, #9              // x8 >>= 9
+        strh            w8, [x1], #2            // store to dst_v
+
+        add             w2, w2, #1              // i++
+        add             x3, x3, #3              // src += 3
+3:
+        cmp             w2, w5                  // i < width
+        b.lt            2b
+4:
+        ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index bbd9719a44..5dd700a16b 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -201,6 +201,20 @@ void ff_yuv2plane1_8_neon(
     default: break;                                                     \
     }
 
+void ff_rgb24ToY_neon(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                      const uint8_t *unused2, int width,
+                      uint32_t *rgb2yuv, void *opq);
+
+void ff_rgb24ToUV_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
+                       const uint8_t *src1,
+                       const uint8_t *src2, int width, uint32_t *rgb2yuv,
+                       void *opq);
+
+void ff_rgb24ToUV_half_neon(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0,
+                       const uint8_t *src1,
+                       const uint8_t *src2, int width, uint32_t *rgb2yuv,
+                       void *opq);
+
 av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -212,5 +226,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
         }
+        if (c->srcFormat == AV_PIX_FMT_RGB24) {
+            c->lumToYV12 = ff_rgb24ToY_neon;
+            if (c->chrSrcHSubSample)
+                c->chrToYV12 = ff_rgb24ToUV_half_neon;
+            else
+                c->chrToYV12 = ff_rgb24ToUV_neon;
+        }
     }
 }
-- 
2.42.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-03  7:17 ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
@ 2024-06-03  8:07   ` Martin Storsjö
  2024-06-03 13:11     ` Zhao Zhili
  0 siblings, 1 reply; 6+ messages in thread
From: Martin Storsjö @ 2024-06-03  8:07 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Zhao Zhili

On Mon, 3 Jun 2024, Zhao Zhili wrote:

> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> new file mode 100644
> index 0000000000..0a46475723
> --- /dev/null
> +++ b/libswscale/aarch64/input.S
> @@ -0,0 +1,229 @@
> +/*
> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro rgb24_to_yuv_load_rgb, src
> +        ld3.16b         { v16, v17, v18 }, [\src]
> +        ushll.8h        v19, v16, #0         // v19: r
> +        ushll.8h        v20, v17, #0         // v20: g
> +        ushll.8h        v21, v18, #0         // v21: b
> +        ushll2.8h       v22, v16, #0         // v22: r
> +        ushll2.8h       v23, v17, #0         // v23: g
> +        ushll2.8h       v24, v18, #0         // v24: b

Don't use this nonstandard, Apple specific aarch64 syntax. This was used 
by Apple tools at the start, when the proper standardized aarch64 syntax 
wasn't quite settled yet, and it is still accepted. (And apparently this 
is still the preferred form to disassemble things in, for apple 
platforms.)

With this syntax, the assembly is rejected by GNU binutils and MSVC.

> +function ff_rgb24ToY_neon, export=1
> +        cmp             w4, #0                  // check width > 0
> +        b.le            4f
> +
> +        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy
> +        dup             v0.8H, w10
> +        dup             v1.8H, w11
> +        ldr             w12, [x5]               // w12: by
> +        dup             v2.8H, w12

Don't use uppercase .8H for field layout configurations, we prefer to 
stick to all lowercase here - see 
184103b3105f02f1189fa0047af4269e027dfbd6. The same goes for a number of 
places in this patch.

> +        add             w9, w9, #1              // i++
> +        add             x3, x3, #6              // src += 6
> +3:
> +        cmp		w9, w5
> +        b.lt		2b
> +4:

Incorrect indentation for the cmp/b.lt instructions here.


I have set up a bunch of github actions for testing aarch64 assembly - see 
https://github.com/mstorsjo/ffmpeg/commits/gha-aarch64. If you have a 
github account, grab a copy of this branch into your repo, add your own 
commits on top, and push to your fork (and if necessary, activate running 
the actions), then you should get a wide testing of your patches.

See https://github.com/mstorsjo/FFmpeg/actions/runs/9346228714 for one 
example run of these actions with your patches.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-03  8:07   ` Martin Storsjö
@ 2024-06-03 13:11     ` Zhao Zhili
  2024-06-03 14:17       ` Rémi Denis-Courmont
  0 siblings, 1 reply; 6+ messages in thread
From: Zhao Zhili @ 2024-06-03 13:11 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



> On Jun 3, 2024, at 16:07, Martin Storsjö <martin@martin.st> wrote:
> 
> On Mon, 3 Jun 2024, Zhao Zhili wrote:
> 
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> new file mode 100644
>> index 0000000000..0a46475723
>> --- /dev/null
>> +++ b/libswscale/aarch64/input.S
>> @@ -0,0 +1,229 @@
>> +/*
>> + * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/aarch64/asm.S"
>> +
>> +.macro rgb24_to_yuv_load_rgb, src
>> +        ld3.16b         { v16, v17, v18 }, [\src]
>> +        ushll.8h        v19, v16, #0         // v19: r
>> +        ushll.8h        v20, v17, #0         // v20: g
>> +        ushll.8h        v21, v18, #0         // v21: b
>> +        ushll2.8h       v22, v16, #0         // v22: r
>> +        ushll2.8h       v23, v17, #0         // v23: g
>> +        ushll2.8h       v24, v18, #0         // v24: b
> 
> Don't use this nonstandard, Apple specific aarch64 syntax. This was used by Apple tools at the start, when the proper standardized aarch64 syntax wasn't quite settled yet, and it is still accepted. (And apparently this is still the preferred form to disassemble things in, for apple platforms.)
> 
> With this syntax, the assembly is rejected by GNU binutils and MSVC.
> 
>> +function ff_rgb24ToY_neon, export=1
>> +        cmp             w4, #0                  // check width > 0
>> +        b.le            4f
>> +
>> +        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy
>> +        dup             v0.8H, w10
>> +        dup             v1.8H, w11
>> +        ldr             w12, [x5]               // w12: by
>> +        dup             v2.8H, w12
> 
> Don't use uppercase .8H for field layout configurations, we prefer to stick to all lowercase here - see 184103b3105f02f1189fa0047af4269e027dfbd6. The same goes for a number of places in this patch.
> 
>> +        add             w9, w9, #1              // i++
>> +        add             x3, x3, #6              // src += 6
>> +3:
>> +        cmp w9, w5
>> +        b.lt 2b
>> +4:
> 
> Incorrect indentation for the cmp/b.lt instructions here.
> 
> 
> I have set up a bunch of github actions for testing aarch64 assembly - see https://github.com/mstorsjo/ffmpeg/commits/gha-aarch64. If you have a github account, grab a copy of this branch into your repo, add your own commits on top, and push to your fork (and if necessary, activate running the actions), then you should get a wide testing of your patches.
> 
> See https://github.com/mstorsjo/FFmpeg/actions/runs/9346228714 for one example run of these actions with your patches.

Wow, it’s very helpful. This is the action result of the updated patch:

https://github.com/quink-black/FFmpeg/actions/runs/9350348848

https://ffmpeg.org/pipermail/ffmpeg-devel/2024-June/328786.html

The test still failed on x86, but success on all arm64 platform and
longarch. I have tried to call rgb24ToY_c and ff_rgb24ToY_avx
directly and compare the results, they don't match. I’m confused.

> 
> // Martin
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-03 13:11     ` Zhao Zhili
@ 2024-06-03 14:17       ` Rémi Denis-Courmont
  2024-06-03 16:14         ` Zhao Zhili
  0 siblings, 1 reply; 6+ messages in thread
From: Rémi Denis-Courmont @ 2024-06-03 14:17 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Le maanantaina 3. kesäkuuta 2024, 16.11.15 EEST Zhao Zhili a écrit :
> > See https://github.com/mstorsjo/FFmpeg/actions/runs/9346228714 for one
> > example run of these actions with your patches.
> Wow, it’s very helpful. This is the action result of the updated patch:
> 
> https://github.com/quink-black/FFmpeg/actions/runs/9350348848
> 
> https://ffmpeg.org/pipermail/ffmpeg-devel/2024-June/328786.html
> 
> The test still failed on x86, but success on all arm64 platform and
> longarch. I have tried to call rgb24ToY_c and ff_rgb24ToY_avx
> directly and compare the results, they don't match. I’m confused.

As Martin write, some x86 code is imprecise, or even wrong.

I would just disable the test on x86 until some x86 arcane magician fixes it.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-03 14:17       ` Rémi Denis-Courmont
@ 2024-06-03 16:14         ` Zhao Zhili
  2024-06-03 17:22           ` Ronald S. Bultje
  0 siblings, 1 reply; 6+ messages in thread
From: Zhao Zhili @ 2024-06-03 16:14 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



> On Jun 3, 2024, at 22:17, Rémi Denis-Courmont <remi@remlab.net> wrote:
> 
> Le maanantaina 3. kesäkuuta 2024, 16.11.15 EEST Zhao Zhili a écrit :
>>> See https://github.com/mstorsjo/FFmpeg/actions/runs/9346228714 for one
>>> example run of these actions with your patches.
>> Wow, it’s very helpful. This is the action result of the updated patch:
>> 
>> https://github.com/quink-black/FFmpeg/actions/runs/9350348848
>> 
>> https://ffmpeg.org/pipermail/ffmpeg-devel/2024-June/328786.html
>> 
>> The test still failed on x86, but success on all arm64 platform and
>> longarch. I have tried to call rgb24ToY_c and ff_rgb24ToY_avx
>> directly and compare the results, they don't match. I’m confused.
> 
> As Martin write, some x86 code is imprecise, or even wrong.

On x86:

With the following command:
./ffmpeg -bitexact -cpuflags 0  -f lavfi -i testsrc  -frames 1 -pix_fmt yuv420p -f framemd5 -
0,          0,          0,        1,   115200, d6b3abfc5280311c2758d5e4028c07b5

Without “-cpuflags 0”
./ffmpeg -bitexact -f lavfi -i testsrc  -frames 1 -pix_fmt yuv420p -f framemd5 -
0,          0,          0,        1,   115200, 1d302ce90bd5b6eec681730cc0868be4

It's indeed non bitexact.

On aarch64 with the neon implementation I can get the same result as
"-cpuflags 0":
./ffmpeg -bitexact  -f lavfi -i testsrc  -frames 1 -pix_fmt yuv420p -f framemd5 -
0,          0,          0,        1,   115200, d6b3abfc5280311c2758d5e4028c07b5

Now I can disable the test for x86 and continue the work on aarch64.

> 
> I would just disable the test on x86 until some x86 arcane magician fixes it.
> 
> -- 
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
> 
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add rgb24 to yuv implementation
  2024-06-03 16:14         ` Zhao Zhili
@ 2024-06-03 17:22           ` Ronald S. Bultje
  0 siblings, 0 replies; 6+ messages in thread
From: Ronald S. Bultje @ 2024-06-03 17:22 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Hi,

On Mon, Jun 3, 2024 at 12:14 PM Zhao Zhili <quinkblack@foxmail.com> wrote:

>
>
> > On Jun 3, 2024, at 22:17, Rémi Denis-Courmont <remi@remlab.net> wrote:
> >
> > Le maanantaina 3. kesäkuuta 2024, 16.11.15 EEST Zhao Zhili a écrit :
> >>> See https://github.com/mstorsjo/FFmpeg/actions/runs/9346228714 for one
> >>> example run of these actions with your patches.
> >> Wow, it’s very helpful. This is the action result of the updated patch:
> >>
> >> https://github.com/quink-black/FFmpeg/actions/runs/9350348848
> >>
> >> https://ffmpeg.org/pipermail/ffmpeg-devel/2024-June/328786.html
> >>
> >> The test still failed on x86, but success on all arm64 platform and
> >> longarch. I have tried to call rgb24ToY_c and ff_rgb24ToY_avx
> >> directly and compare the results, they don't match. I’m confused.
> >
> > As Martin write, some x86 code is imprecise, or even wrong.
>
> On x86:
>
> With the following command:
> ./ffmpeg -bitexact -cpuflags 0  -f lavfi -i testsrc  -frames 1 -pix_fmt
> yuv420p -f framemd5 -
> 0,          0,          0,        1,   115200,
> d6b3abfc5280311c2758d5e4028c07b5
>
> Without “-cpuflags 0”
> ./ffmpeg -bitexact -f lavfi -i testsrc  -frames 1 -pix_fmt yuv420p -f
> framemd5 -
> 0,          0,          0,        1,   115200,
> 1d302ce90bd5b6eec681730cc0868be4
>
> It's indeed non bitexact.
>
> On aarch64 with the neon implementation I can get the same result as
> "-cpuflags 0":
> ./ffmpeg -bitexact  -f lavfi -i testsrc  -frames 1 -pix_fmt yuv420p -f
> framemd5 -
> 0,          0,          0,        1,   115200,
> d6b3abfc5280311c2758d5e4028c07b5
>
> Now I can disable the test for x86 and continue the work on aarch64.
>

Uhm, that's a bit hacky. I think things like -sws_flags +bitexact need to
be after the -i argument and then it might work?

Ronald
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-06-03 17:23 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20240603071732.52523-1-quinkblack@foxmail.com>
2024-06-03  7:17 ` [FFmpeg-devel] [PATCH 2/2] swscale/aarch64: Add rgb24 to yuv implementation Zhao Zhili
2024-06-03  8:07   ` Martin Storsjö
2024-06-03 13:11     ` Zhao Zhili
2024-06-03 14:17       ` Rémi Denis-Courmont
2024-06-03 16:14         ` Zhao Zhili
2024-06-03 17:22           ` Ronald S. Bultje

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git