Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter
@ 2024-08-09 11:26 Ramiro Polla
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Ramiro Polla @ 2024-08-09 11:26 UTC (permalink / raw)
  To: ffmpeg-devel

---
 libswscale/swscale_unscaled.c | 45 +++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index a5c9917799..239258ab8c 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -221,6 +221,48 @@ static int nv24ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
     return srcSliceH;
 }
 
+static void nv24_to_yuv420p_chroma(uint8_t *dst1, int dstStride1,
+                                   uint8_t *dst2, int dstStride2,
+                                   const uint8_t *src, int srcStride,
+                                   int w, int h)
+{
+    const uint8_t *src1 = src;
+    const uint8_t *src2 = src + srcStride;
+    // average 4 pixels into 1 (interleaved U and V)
+    for (int y = 0; y < h; y += 2) {
+        for (int x = 0; x < w; x++) {
+            dst1[x] = (src1[4 * x + 0] + src1[4 * x + 2] +
+                       src2[4 * x + 0] + src2[4 * x + 2]) >> 2;
+            dst2[x] = (src1[4 * x + 1] + src1[4 * x + 3] +
+                       src2[4 * x + 1] + src2[4 * x + 3]) >> 2;
+        }
+        src1 += srcStride * 2;
+        src2 += srcStride * 2;
+        dst1 += dstStride1;
+        dst2 += dstStride2;
+    }
+}
+
+static int nv24ToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
+                               int srcStride[], int srcSliceY, int srcSliceH,
+                               uint8_t *dstParam[], int dstStride[])
+{
+    uint8_t *dst1 = dstParam[1] + dstStride[1] * srcSliceY / 2;
+    uint8_t *dst2 = dstParam[2] + dstStride[2] * srcSliceY / 2;
+
+    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+              dstParam[0], dstStride[0]);
+
+    if (c->srcFormat == AV_PIX_FMT_NV24)
+        nv24_to_yuv420p_chroma(dst1, dstStride[1], dst2, dstStride[2],
+                               src[1], srcStride[1], c->srcW / 2, srcSliceH);
+    else
+        nv24_to_yuv420p_chroma(dst2, dstStride[2], dst1, dstStride[1],
+                               src[1], srcStride[1], c->srcW / 2, srcSliceH);
+
+    return srcSliceH;
+}
+
 static int planarToP01xWrapper(SwsContext *c, const uint8_t *src8[],
                                int srcStride[], int srcSliceY,
                                int srcSliceH, uint8_t *dstParam8[],
@@ -2206,6 +2248,9 @@ void ff_get_unscaled_swscale(SwsContext *c)
         c->convert_unscaled = yuyvToYuv422Wrapper;
     if (srcFormat == AV_PIX_FMT_UYVY422 && dstFormat == AV_PIX_FMT_YUV422P)
         c->convert_unscaled = uyvyToYuv422Wrapper;
+    if (dstFormat == AV_PIX_FMT_YUV420P &&
+        (srcFormat == AV_PIX_FMT_NV24 || srcFormat == AV_PIX_FMT_NV42))
+        c->convert_unscaled = nv24ToYuv420Wrapper;
 
 #define isPlanarGray(x) (isGray(x) && (x) != AV_PIX_FMT_YA8 && (x) != AV_PIX_FMT_YA16LE && (x) != AV_PIX_FMT_YA16BE)
     /* simple copy */
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters
  2024-08-09 11:26 [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
@ 2024-08-09 11:26 ` Ramiro Polla
  2024-08-15 14:19   ` Ramiro Polla
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 3/4] swscale: export ff_copyPlane so it may be used by simd code Ramiro Polla
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
  2 siblings, 1 reply; 7+ messages in thread
From: Ramiro Polla @ 2024-08-09 11:26 UTC (permalink / raw)
  To: ffmpeg-devel

---
 tests/checkasm/Makefile     |   2 +-
 tests/checkasm/checkasm.c   |   1 +
 tests/checkasm/checkasm.h   |   1 +
 tests/checkasm/sw_yuv2yuv.c | 131 ++++++++++++++++++++++++++++++++++++
 4 files changed, 134 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/sw_yuv2yuv.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3a7670e24b..2d2e42e445 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -63,7 +63,7 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
 # swscale tests
-SWSCALEOBJS                             += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o sw_yuv2rgb.o
+SWSCALEOBJS                             += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o sw_yuv2rgb.o sw_yuv2yuv.o
 
 CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 58597d3888..5c407de2ba 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -255,6 +255,7 @@ static const struct {
     { "sw_rgb", checkasm_check_sw_rgb },
     { "sw_scale", checkasm_check_sw_scale },
     { "sw_yuv2rgb", checkasm_check_sw_yuv2rgb },
+    { "sw_yuv2yuv", checkasm_check_sw_yuv2yuv },
 #endif
 #if CONFIG_AVUTIL
         { "fixed_dsp", checkasm_check_fixed_dsp },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4d5f3e387e..3e73808739 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -123,6 +123,7 @@ void checkasm_check_sw_range_convert(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_sw_scale(void);
 void checkasm_check_sw_yuv2rgb(void);
+void checkasm_check_sw_yuv2yuv(void);
 void checkasm_check_takdsp(void);
 void checkasm_check_utvideodsp(void);
 void checkasm_check_v210dec(void);
diff --git a/tests/checkasm/sw_yuv2yuv.c b/tests/checkasm/sw_yuv2yuv.c
new file mode 100644
index 0000000000..b561b46768
--- /dev/null
+++ b/tests/checkasm/sw_yuv2yuv.c
@@ -0,0 +1,131 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/pixdesc.h"
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#include "checkasm.h"
+
+#define randomize_buffers(buf, size)      \
+    do {                                  \
+        for (int j = 0; j < size; j += 4) \
+            AV_WN32(buf + j, rnd());      \
+    } while (0)
+
+static void check_semiplanar(int dst_pix_fmt)
+{
+    static const int src_fmts[] = {
+        AV_PIX_FMT_NV24,
+        AV_PIX_FMT_NV42,
+    };
+    const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dst_pix_fmt);
+#define MAX_LINE_SIZE 1920
+    static const int input_sizes[] = {8, 128, 1080, MAX_LINE_SIZE};
+
+    declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT,
+                      int, SwsContext *c, const uint8_t *src[],
+                           int srcStride[], int srcSliceY, int srcSliceH,
+                           uint8_t *dst[], int dstStride[]);
+
+    LOCAL_ALIGNED_8(uint8_t, src_y,  [MAX_LINE_SIZE * 2]);
+    LOCAL_ALIGNED_8(uint8_t, src_uv, [MAX_LINE_SIZE * 2 * 2]);
+    const uint8_t *src[4] = { src_y, src_uv };
+
+    LOCAL_ALIGNED_8(uint8_t, dst0_y, [MAX_LINE_SIZE * 2]);
+    LOCAL_ALIGNED_8(uint8_t, dst0_u, [MAX_LINE_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dst0_v, [MAX_LINE_SIZE]);
+    uint8_t *dst0[4] = { dst0_y, dst0_u, dst0_v };
+
+    LOCAL_ALIGNED_8(uint8_t, dst1_y, [MAX_LINE_SIZE * 2]);
+    LOCAL_ALIGNED_8(uint8_t, dst1_u, [MAX_LINE_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dst1_v, [MAX_LINE_SIZE]);
+    uint8_t *dst1[4] = { dst1_y, dst1_u, dst1_v };
+
+    randomize_buffers(src_y,  MAX_LINE_SIZE * 2);
+    randomize_buffers(src_uv, MAX_LINE_SIZE * 2 * 2);
+
+    for (int sfi = 0; sfi < FF_ARRAY_ELEMS(src_fmts); sfi++) {
+        int src_pix_fmt = src_fmts[sfi];
+        const AVPixFmtDescriptor *src_desc = av_pix_fmt_desc_get(src_pix_fmt);
+        for (int isi = 0; isi < FF_ARRAY_ELEMS(input_sizes); isi++) {
+            struct SwsContext *ctx;
+            int log_level;
+            int width = input_sizes[isi];
+            int srcSliceY = 0;
+            int srcSliceH = 2;
+            int srcStride[4] = {
+                width,
+                width << 1,
+            };
+            int dstStride[4] = {
+                width,
+                width >> dst_desc->log2_chroma_w,
+                width >> dst_desc->log2_chroma_w,
+            };
+
+            // override log level to prevent spamming of the message
+            // "No accelerated colorspace conversion found from %s to %s"
+            log_level = av_log_get_level();
+            av_log_set_level(AV_LOG_ERROR);
+            ctx = sws_getContext(width, srcSliceH, src_pix_fmt,
+                                 width, srcSliceH, dst_pix_fmt,
+                                 0, NULL, NULL, NULL);
+            av_log_set_level(log_level);
+            if (!ctx)
+                fail();
+
+            if (check_func(ctx->convert_unscaled, "%s_%s_%d", src_desc->name, dst_desc->name, width)) {
+                memset(dst0_y, 0xFF, MAX_LINE_SIZE * 2);
+                memset(dst0_u, 0xFF, MAX_LINE_SIZE);
+                memset(dst0_v, 0xFF, MAX_LINE_SIZE);
+                memset(dst1_y, 0xFF, MAX_LINE_SIZE * 2);
+                memset(dst1_u, 0xFF, MAX_LINE_SIZE);
+                memset(dst1_v, 0xFF, MAX_LINE_SIZE);
+
+                call_ref(ctx, src, srcStride, srcSliceY,
+                         srcSliceH, dst0, dstStride);
+                call_new(ctx, src, srcStride, srcSliceY,
+                         srcSliceH, dst1, dstStride);
+
+                if (memcmp(dst0_y, dst1_y, MAX_LINE_SIZE * 2) ||
+                    memcmp(dst0_u, dst1_u, MAX_LINE_SIZE) ||
+                    memcmp(dst0_v, dst1_v, MAX_LINE_SIZE))
+                    fail();
+
+                bench_new(ctx, src, srcStride, srcSliceY,
+                          srcSliceH, dst0, dstStride);
+            }
+            sws_freeContext(ctx);
+        }
+    }
+}
+
+#undef MAX_LINE_SIZE
+
+void checkasm_check_sw_yuv2yuv(void)
+{
+    check_semiplanar(AV_PIX_FMT_YUV420P);
+    report("yuv420p");
+}
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH 3/4] swscale: export ff_copyPlane so it may be used by simd code
  2024-08-09 11:26 [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
@ 2024-08-09 11:26 ` Ramiro Polla
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
  2 siblings, 0 replies; 7+ messages in thread
From: Ramiro Polla @ 2024-08-09 11:26 UTC (permalink / raw)
  To: ffmpeg-devel

---
 libswscale/swscale_internal.h |  4 ++++
 libswscale/swscale_unscaled.c | 42 +++++++++++++++++------------------
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index e5610161d0..50127d288f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1009,6 +1009,10 @@ int ff_sws_alphablendaway(SwsContext *c, const uint8_t *src[],
                           int srcStride[], int srcSliceY, int srcSliceH,
                           uint8_t *dst[], int dstStride[]);
 
+void ff_copyPlane(const uint8_t *src, int srcStride,
+                  int srcSliceY, int srcSliceH, int width,
+                  uint8_t *dst, int dstStride);
+
 static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
                                int alpha, int bits, const int big_endian)
 {
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 239258ab8c..dc1d5f3593 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -122,9 +122,9 @@ static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
     }
 }
 
-static void copyPlane(const uint8_t *src, int srcStride,
-                      int srcSliceY, int srcSliceH, int width,
-                      uint8_t *dst, int dstStride)
+void ff_copyPlane(const uint8_t *src, int srcStride,
+                  int srcSliceY, int srcSliceH, int width,
+                  uint8_t *dst, int dstStride)
 {
     dst += dstStride * srcSliceY;
     if (dstStride == srcStride && srcStride > 0) {
@@ -146,8 +146,8 @@ static int planarToNv12Wrapper(SwsContext *c, const uint8_t *src[],
 {
     uint8_t *dst = dstParam[1] + dstStride[1] * srcSliceY / 2;
 
-    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
-              dstParam[0], dstStride[0]);
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dstParam[0], dstStride[0]);
 
     if (c->dstFormat == AV_PIX_FMT_NV12)
         interleaveBytes(src[1], src[2], dst, c->chrSrcW, (srcSliceH + 1) / 2,
@@ -167,8 +167,8 @@ static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
     uint8_t *dst1 = dstParam[1] + dstStride[1] * srcSliceY / 2;
     uint8_t *dst2 = dstParam[2] + dstStride[2] * srcSliceY / 2;
 
-    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
-              dstParam[0], dstStride[0]);
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dstParam[0], dstStride[0]);
 
     if (c->srcFormat == AV_PIX_FMT_NV12)
         deinterleaveBytes(src[1], dst1, dst2, c->chrSrcW, (srcSliceH + 1) / 2,
@@ -187,8 +187,8 @@ static int planarToNv24Wrapper(SwsContext *c, const uint8_t *src[],
 {
     uint8_t *dst = dstParam[1] + dstStride[1] * srcSliceY;
 
-    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
-              dstParam[0], dstStride[0]);
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dstParam[0], dstStride[0]);
 
     if (c->dstFormat == AV_PIX_FMT_NV24)
         interleaveBytes(src[1], src[2], dst, c->chrSrcW, srcSliceH,
@@ -208,8 +208,8 @@ static int nv24ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
     uint8_t *dst1 = dstParam[1] + dstStride[1] * srcSliceY;
     uint8_t *dst2 = dstParam[2] + dstStride[2] * srcSliceY;
 
-    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
-              dstParam[0], dstStride[0]);
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dstParam[0], dstStride[0]);
 
     if (c->srcFormat == AV_PIX_FMT_NV24)
         deinterleaveBytes(src[1], dst1, dst2, c->chrSrcW, srcSliceH,
@@ -250,8 +250,8 @@ static int nv24ToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
     uint8_t *dst1 = dstParam[1] + dstStride[1] * srcSliceY / 2;
     uint8_t *dst2 = dstParam[2] + dstStride[2] * srcSliceY / 2;
 
-    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
-              dstParam[0], dstStride[0]);
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dstParam[0], dstStride[0]);
 
     if (c->srcFormat == AV_PIX_FMT_NV24)
         nv24_to_yuv420p_chroma(dst1, dstStride[1], dst2, dstStride[2],
@@ -1173,12 +1173,12 @@ static int planarRgbToplanarRgbWrapper(SwsContext *c,
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
-              dst[0], dstStride[0]);
-    copyPlane(src[1], srcStride[1], srcSliceY, srcSliceH, c->srcW,
-              dst[1], dstStride[1]);
-    copyPlane(src[2], srcStride[2], srcSliceY, srcSliceH, c->srcW,
-              dst[2], dstStride[2]);
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dst[0], dstStride[0]);
+    ff_copyPlane(src[1], srcStride[1], srcSliceY, srcSliceH, c->srcW,
+                 dst[1], dstStride[1]);
+    ff_copyPlane(src[2], srcStride[2], srcSliceY, srcSliceH, c->srcW,
+                 dst[2], dstStride[2]);
     if (dst[3])
         fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
 
@@ -1700,8 +1700,8 @@ static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
                              int srcStride[], int srcSliceY, int srcSliceH,
                              uint8_t *dst[], int dstStride[])
 {
-    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
-              dst[0], dstStride[0]);
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dst[0], dstStride[0]);
 
     planar2x(src[1], dst[1] + dstStride[1] * (srcSliceY >> 1), c->chrSrcW,
              srcSliceH >> 2, srcStride[1], dstStride[1]);
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter
  2024-08-09 11:26 [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 3/4] swscale: export ff_copyPlane so it may be used by simd code Ramiro Polla
@ 2024-08-09 11:26 ` Ramiro Polla
  2024-08-14 12:31   ` Martin Storsjö
  2 siblings, 1 reply; 7+ messages in thread
From: Ramiro Polla @ 2024-08-09 11:26 UTC (permalink / raw)
  To: ffmpeg-devel

checkasm --bench for Raspberry Pi 5 Model B Rev 1.0:
nv24_yuv420p_128_c: 423.0
nv24_yuv420p_128_neon: 115.7
nv24_yuv420p_1920_c: 5939.5
nv24_yuv420p_1920_neon: 1339.7
nv42_yuv420p_128_c: 423.2
nv42_yuv420p_128_neon: 115.7
nv42_yuv420p_1920_c: 5907.5
nv42_yuv420p_1920_neon: 1342.5
---
 libswscale/aarch64/Makefile                |  1 +
 libswscale/aarch64/swscale_unscaled.c      | 30 +++++++++
 libswscale/aarch64/swscale_unscaled_neon.S | 75 ++++++++++++++++++++++
 3 files changed, 106 insertions(+)
 create mode 100644 libswscale/aarch64/swscale_unscaled_neon.S

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index 37ad960619..1de8c9c0d6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -7,4 +7,5 @@ NEON-OBJS   += aarch64/hscale.o                 \
                aarch64/output.o                 \
                aarch64/range_convert_neon.o     \
                aarch64/rgb2rgb_neon.o           \
+               aarch64/swscale_unscaled_neon.o  \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c
index b3093bbc9d..87bb011709 100644
--- a/libswscale/aarch64/swscale_unscaled.c
+++ b/libswscale/aarch64/swscale_unscaled.c
@@ -83,6 +83,31 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
                                         c->yuv2rgb_y_coeff);                                \
 }                                                                                           \
 
+void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1,
+                                    uint8_t *dst2, int dstStride2,
+                                    const uint8_t *src, int srcStride,
+                                    int w, int h);
+
+static int nv24_to_yuv420p_neon_wrapper(SwsContext *c, const uint8_t *src[],
+                                        int srcStride[], int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[])
+{
+    uint8_t *dst1 = dst[1] + dstStride[1] * srcSliceY / 2;
+    uint8_t *dst2 = dst[2] + dstStride[2] * srcSliceY / 2;
+
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dst[0], dstStride[0]);
+
+    if (c->srcFormat == AV_PIX_FMT_NV24)
+        ff_nv24_to_yuv420p_chroma_neon(dst1, dstStride[1], dst2, dstStride[2],
+                                       src[1], srcStride[1], c->srcW / 2, srcSliceH);
+    else
+        ff_nv24_to_yuv420p_chroma_neon(dst2, dstStride[2], dst1, dstStride[1],
+                                       src[1], srcStride[1], c->srcW / 2, srcSliceH);
+
+    return srcSliceH;
+}
+
 #define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)                                               \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)                                                     \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)                                                     \
@@ -119,6 +144,11 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
     SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
     SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
     SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
+
+    if (c->dstFormat == AV_PIX_FMT_YUV420P &&
+        (c->srcFormat == AV_PIX_FMT_NV24 || c->srcFormat == AV_PIX_FMT_NV42) &&
+        !(c->srcH & 1) && !(c->srcW & 15) && !accurate_rnd)
+        c->convert_unscaled = nv24_to_yuv420p_neon_wrapper;
 }
 
 void ff_get_unscaled_swscale_aarch64(SwsContext *c)
diff --git a/libswscale/aarch64/swscale_unscaled_neon.S b/libswscale/aarch64/swscale_unscaled_neon.S
new file mode 100644
index 0000000000..a206fda41f
--- /dev/null
+++ b/libswscale/aarch64/swscale_unscaled_neon.S
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_nv24_to_yuv420p_chroma_neon, export=1
+// x0  uint8_t *dst1
+// x1  int dstStride1
+// x2  uint8_t *dst2
+// x3  int dstStride2
+// x4  const uint8_t *src
+// x5  int srcStride
+// w6  int w
+// w7  int h
+
+        uxtw            x1, w1
+        uxtw            x3, w3
+        uxtw            x5, w5
+
+        add             x9, x4, x5                  // x9 = src + srcStride
+        lsl             w5, w5, #1                  // srcStride *= 2
+
+1:
+        mov             w10, w6                     // w10 = w
+        mov             x11, x4                     // x11 = src1 (line 1)
+        mov             x12, x9                     // x12 = src2 (line 2)
+        mov             x13, x0                     // x13 = dst1 (dstU)
+        mov             x14, x2                     // x14 = dst2 (dstV)
+
+2:
+        ld2             { v0.16b, v1.16b }, [x11], #32 // v0 = U1, v1 = V1
+        ld2             { v2.16b, v3.16b }, [x12], #32 // v2 = U2, v3 = V2
+
+        uaddlp          v0.8h, v0.16b               // pairwise add U1 into v0
+        uaddlp          v1.8h, v1.16b               // pairwise add V1 into v1
+        uadalp          v0.8h, v2.16b               // pairwise add U2, accumulate into v0
+        uadalp          v1.8h, v3.16b               // pairwise add V2, accumulate into v1
+
+        shrn            v0.8b, v0.8h, #2            // divide by 4
+        shrn            v1.8b, v1.8h, #2            // divide by 4
+
+        st1             { v0.8b }, [x13], #8        // store U into dst1
+        st1             { v1.8b }, [x14], #8        // store V into dst2
+
+        subs            w10, w10, #8
+        b.gt            2b
+
+        // next row
+        add             x4, x4, x5                  // src1 += srcStride * 2
+        add             x9, x9, x5                  // src2 += srcStride * 2
+        add             x0, x0, x1                  // dst1 += dstStride1
+        add             x2, x2, x3                  // dst2 += dstStride2
+
+        subs            w7, w7, #2
+        b.gt            1b
+
+        ret
+endfunc
-- 
2.30.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
@ 2024-08-14 12:31   ` Martin Storsjö
  2024-08-15 14:25     ` Ramiro Polla
  0 siblings, 1 reply; 7+ messages in thread
From: Martin Storsjö @ 2024-08-14 12:31 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Fri, 9 Aug 2024, Ramiro Polla wrote:

> checkasm --bench for Raspberry Pi 5 Model B Rev 1.0:
> nv24_yuv420p_128_c: 423.0
> nv24_yuv420p_128_neon: 115.7
> nv24_yuv420p_1920_c: 5939.5
> nv24_yuv420p_1920_neon: 1339.7
> nv42_yuv420p_128_c: 423.2
> nv42_yuv420p_128_neon: 115.7
> nv42_yuv420p_1920_c: 5907.5
> nv42_yuv420p_1920_neon: 1342.5
> ---
> libswscale/aarch64/Makefile                |  1 +
> libswscale/aarch64/swscale_unscaled.c      | 30 +++++++++
> libswscale/aarch64/swscale_unscaled_neon.S | 75 ++++++++++++++++++++++
> 3 files changed, 106 insertions(+)
> create mode 100644 libswscale/aarch64/swscale_unscaled_neon.S

> diff --git a/libswscale/aarch64/swscale_unscaled_neon.S b/libswscale/aarch64/swscale_unscaled_neon.S
> new file mode 100644
> index 0000000000..a206fda41f
> --- /dev/null
> +++ b/libswscale/aarch64/swscale_unscaled_neon.S
> @@ -0,0 +1,75 @@
> +/*
> + * Copyright (c) 2024 Ramiro Polla
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +function ff_nv24_to_yuv420p_chroma_neon, export=1
> +// x0  uint8_t *dst1
> +// x1  int dstStride1
> +// x2  uint8_t *dst2
> +// x3  int dstStride2
> +// x4  const uint8_t *src
> +// x5  int srcStride
> +// w6  int w
> +// w7  int h
> +
> +        uxtw            x1, w1
> +        uxtw            x3, w3
> +        uxtw            x5, w5

You can often avoid the explicit uxtw instructions, if you can fold an 
uxtw attribute into the cases where the register is used. (If it's used 
often, it may be slightly more performant to do it upfront like this 
though, but often it can be omitted entirely.) And whenever you do an 
operation with a wN register as destination, the upper half of the 
register gets explicitly cleared, so these also may be avoided that way.

> +
> +        add             x9, x4, x5                  // x9 = src + srcStride
> +        lsl             w5, w5, #1                  // srcStride *= 2
> +
> +1:
> +        mov             w10, w6                     // w10 = w
> +        mov             x11, x4                     // x11 = src1 (line 1)
> +        mov             x12, x9                     // x12 = src2 (line 2)
> +        mov             x13, x0                     // x13 = dst1 (dstU)
> +        mov             x14, x2                     // x14 = dst2 (dstV)
> +
> +2:
> +        ld2             { v0.16b, v1.16b }, [x11], #32 // v0 = U1, v1 = V1
> +        ld2             { v2.16b, v3.16b }, [x12], #32 // v2 = U2, v3 = V2
> +
> +        uaddlp          v0.8h, v0.16b               // pairwise add U1 into v0
> +        uaddlp          v1.8h, v1.16b               // pairwise add V1 into v1
> +        uadalp          v0.8h, v2.16b               // pairwise add U2, accumulate into v0
> +        uadalp          v1.8h, v3.16b               // pairwise add V2, accumulate into v1
> +
> +        shrn            v0.8b, v0.8h, #2            // divide by 4
> +        shrn            v1.8b, v1.8h, #2            // divide by 4
> +
> +        st1             { v0.8b }, [x13], #8        // store U into dst1
> +        st1             { v1.8b }, [x14], #8        // store V into dst2
> +
> +        subs            w10, w10, #8
> +        b.gt            2b
> +
> +        // next row
> +        add             x4, x4, x5                  // src1 += srcStride * 2
> +        add             x9, x9, x5                  // src2 += srcStride * 2
> +        add             x0, x0, x1                  // dst1 += dstStride1
> +        add             x2, x2, x3                  // dst2 += dstStride2

It's often possible to avoid the extra step of moving the pointers back
into the x11/x12/x13/x14 registers, if you subtract the width from the
stride at the start of the function. Then you don't need two separate
registers for each pointer, and it shortens the dependency chain when
moving on to the next line.

If the width can be any uneven value, but we in practice write in 
increments of 8 pixels, you may need to align the width up to 8 before 
using it to decrement the stride that way though.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters
  2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
@ 2024-08-15 14:19   ` Ramiro Polla
  0 siblings, 0 replies; 7+ messages in thread
From: Ramiro Polla @ 2024-08-15 14:19 UTC (permalink / raw)
  To: ffmpeg-devel

[-- Attachment #1: Type: text/plain, Size: 515 bytes --]

On Fri, Aug 9, 2024 at 1:26 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> ---
>  tests/checkasm/Makefile     |   2 +-
>  tests/checkasm/checkasm.c   |   1 +
>  tests/checkasm/checkasm.h   |   1 +
>  tests/checkasm/sw_yuv2yuv.c | 131 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 134 insertions(+), 1 deletion(-)
>  create mode 100644 tests/checkasm/sw_yuv2yuv.c

The new patch (attached) improves the tests by converting 4 lines instead
of 2 and by no longer setting the stride equal to the width.

[-- Attachment #2: v2-0002-checkasm-yuv2yuv-add-tests-for-semiplanar-unscale.patch --]
[-- Type: text/x-patch, Size: 7582 bytes --]

From 96122637b5df57da53208d9a81b79b5b2e4707b6 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Wed, 7 Aug 2024 23:29:43 +0200
Subject: [PATCH v2 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled
 converters

---
 tests/checkasm/Makefile     |   2 +-
 tests/checkasm/checkasm.c   |   1 +
 tests/checkasm/checkasm.h   |   1 +
 tests/checkasm/sw_yuv2yuv.c | 133 ++++++++++++++++++++++++++++++++++++
 4 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/sw_yuv2yuv.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3a7670e24b..2d2e42e445 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -63,7 +63,7 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER)      += vf_convolution.o
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
 
 # swscale tests
-SWSCALEOBJS                             += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o sw_yuv2rgb.o
+SWSCALEOBJS                             += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o sw_yuv2rgb.o sw_yuv2yuv.o
 
 CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 58597d3888..5c407de2ba 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -255,6 +255,7 @@ static const struct {
     { "sw_rgb", checkasm_check_sw_rgb },
     { "sw_scale", checkasm_check_sw_scale },
     { "sw_yuv2rgb", checkasm_check_sw_yuv2rgb },
+    { "sw_yuv2yuv", checkasm_check_sw_yuv2yuv },
 #endif
 #if CONFIG_AVUTIL
         { "fixed_dsp", checkasm_check_fixed_dsp },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4d5f3e387e..3e73808739 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -123,6 +123,7 @@ void checkasm_check_sw_range_convert(void);
 void checkasm_check_sw_rgb(void);
 void checkasm_check_sw_scale(void);
 void checkasm_check_sw_yuv2rgb(void);
+void checkasm_check_sw_yuv2yuv(void);
 void checkasm_check_takdsp(void);
 void checkasm_check_utvideodsp(void);
 void checkasm_check_v210dec(void);
diff --git a/tests/checkasm/sw_yuv2yuv.c b/tests/checkasm/sw_yuv2yuv.c
new file mode 100644
index 0000000000..90a51601ed
--- /dev/null
+++ b/tests/checkasm/sw_yuv2yuv.c
@@ -0,0 +1,133 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/pixdesc.h"
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#include "checkasm.h"
+
+#define randomize_buffers(buf, size)      \
+    do {                                  \
+        for (int j = 0; j < size; j += 4) \
+            AV_WN32(buf + j, rnd());      \
+    } while (0)
+
+static void check_semiplanar(int dst_pix_fmt)
+{
+    static const int src_fmts[] = {
+        AV_PIX_FMT_NV24,
+        AV_PIX_FMT_NV42,
+    };
+    const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dst_pix_fmt);
+#define NUM_LINES 4
+#define MAX_LINE_SIZE 1920
+    static const int input_sizes[] = {8, 128, 1080, MAX_LINE_SIZE};
+
+    declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT,
+                      int, SwsContext *c, const uint8_t *src[],
+                           int srcStride[], int srcSliceY, int srcSliceH,
+                           uint8_t *dst[], int dstStride[]);
+
+    LOCAL_ALIGNED_8(uint8_t, src_y,  [MAX_LINE_SIZE * NUM_LINES]);
+    LOCAL_ALIGNED_8(uint8_t, src_uv, [MAX_LINE_SIZE * NUM_LINES * 2]);
+    const uint8_t *src[4] = { src_y, src_uv };
+
+    LOCAL_ALIGNED_8(uint8_t, dst0_y, [MAX_LINE_SIZE * NUM_LINES]);
+    LOCAL_ALIGNED_8(uint8_t, dst0_u, [MAX_LINE_SIZE * NUM_LINES / 2]);
+    LOCAL_ALIGNED_8(uint8_t, dst0_v, [MAX_LINE_SIZE * NUM_LINES / 2]);
+    uint8_t *dst0[4] = { dst0_y, dst0_u, dst0_v };
+
+    LOCAL_ALIGNED_8(uint8_t, dst1_y, [MAX_LINE_SIZE * NUM_LINES]);
+    LOCAL_ALIGNED_8(uint8_t, dst1_u, [MAX_LINE_SIZE * NUM_LINES / 2]);
+    LOCAL_ALIGNED_8(uint8_t, dst1_v, [MAX_LINE_SIZE * NUM_LINES / 2]);
+    uint8_t *dst1[4] = { dst1_y, dst1_u, dst1_v };
+
+    randomize_buffers(src_y,  MAX_LINE_SIZE * NUM_LINES);
+    randomize_buffers(src_uv, MAX_LINE_SIZE * NUM_LINES * 2);
+
+    for (int sfi = 0; sfi < FF_ARRAY_ELEMS(src_fmts); sfi++) {
+        int src_pix_fmt = src_fmts[sfi];
+        const AVPixFmtDescriptor *src_desc = av_pix_fmt_desc_get(src_pix_fmt);
+        for (int isi = 0; isi < FF_ARRAY_ELEMS(input_sizes); isi++) {
+            struct SwsContext *ctx;
+            int log_level;
+            int width = input_sizes[isi];
+            int srcSliceY = 0;
+            int srcSliceH = NUM_LINES;
+            int srcStride[4] = {
+                MAX_LINE_SIZE,
+                MAX_LINE_SIZE * 2,
+            };
+            int dstStride[4] = {
+                MAX_LINE_SIZE,
+                MAX_LINE_SIZE >> dst_desc->log2_chroma_w,
+                MAX_LINE_SIZE >> dst_desc->log2_chroma_w,
+            };
+
+            // override log level to prevent spamming of the message
+            // "No accelerated colorspace conversion found from %s to %s"
+            log_level = av_log_get_level();
+            av_log_set_level(AV_LOG_ERROR);
+            ctx = sws_getContext(width, srcSliceH, src_pix_fmt,
+                                 width, srcSliceH, dst_pix_fmt,
+                                 0, NULL, NULL, NULL);
+            av_log_set_level(log_level);
+            if (!ctx)
+                fail();
+
+            if (check_func(ctx->convert_unscaled, "%s_%s_%d", src_desc->name, dst_desc->name, width)) {
+                memset(dst0_y, 0xFF, MAX_LINE_SIZE * NUM_LINES);
+                memset(dst0_u, 0xFF, MAX_LINE_SIZE * NUM_LINES / 2);
+                memset(dst0_v, 0xFF, MAX_LINE_SIZE * NUM_LINES / 2);
+                memset(dst1_y, 0xFF, MAX_LINE_SIZE * NUM_LINES);
+                memset(dst1_u, 0xFF, MAX_LINE_SIZE * NUM_LINES / 2);
+                memset(dst1_v, 0xFF, MAX_LINE_SIZE * NUM_LINES / 2);
+
+                call_ref(ctx, src, srcStride, srcSliceY,
+                         srcSliceH, dst0, dstStride);
+                call_new(ctx, src, srcStride, srcSliceY,
+                         srcSliceH, dst1, dstStride);
+
+                if (memcmp(dst0_y, dst1_y, MAX_LINE_SIZE * NUM_LINES) ||
+                    memcmp(dst0_u, dst1_u, MAX_LINE_SIZE * NUM_LINES / 2) ||
+                    memcmp(dst0_v, dst1_v, MAX_LINE_SIZE * NUM_LINES / 2))
+                    fail();
+
+                bench_new(ctx, src, srcStride, srcSliceY,
+                          srcSliceH, dst0, dstStride);
+            }
+            sws_freeContext(ctx);
+        }
+    }
+}
+
+#undef NUM_LINES
+#undef MAX_LINE_SIZE
+
+void checkasm_check_sw_yuv2yuv(void)
+{
+    check_semiplanar(AV_PIX_FMT_YUV420P);
+    report("yuv420p");
+}
-- 
2.39.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter
  2024-08-14 12:31   ` Martin Storsjö
@ 2024-08-15 14:25     ` Ramiro Polla
  0 siblings, 0 replies; 7+ messages in thread
From: Ramiro Polla @ 2024-08-15 14:25 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 5270 bytes --]

On Wed, Aug 14, 2024 at 7:21 PM Martin Storsjö <martin@martin.st> wrote:
> On Fri, 9 Aug 2024, Ramiro Polla wrote:
> > checkasm --bench for Raspberry Pi 5 Model B Rev 1.0:
> > nv24_yuv420p_128_c: 423.0
> > nv24_yuv420p_128_neon: 115.7
> > nv24_yuv420p_1920_c: 5939.5
> > nv24_yuv420p_1920_neon: 1339.7
> > nv42_yuv420p_128_c: 423.2
> > nv42_yuv420p_128_neon: 115.7
> > nv42_yuv420p_1920_c: 5907.5
> > nv42_yuv420p_1920_neon: 1342.5
> > ---
> > libswscale/aarch64/Makefile                |  1 +
> > libswscale/aarch64/swscale_unscaled.c      | 30 +++++++++
> > libswscale/aarch64/swscale_unscaled_neon.S | 75 ++++++++++++++++++++++
> > 3 files changed, 106 insertions(+)
> > create mode 100644 libswscale/aarch64/swscale_unscaled_neon.S
>
> > diff --git a/libswscale/aarch64/swscale_unscaled_neon.S b/libswscale/aarch64/swscale_unscaled_neon.S
> > new file mode 100644
> > index 0000000000..a206fda41f
> > --- /dev/null
> > +++ b/libswscale/aarch64/swscale_unscaled_neon.S
> > @@ -0,0 +1,75 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +function ff_nv24_to_yuv420p_chroma_neon, export=1
> > +// x0  uint8_t *dst1
> > +// x1  int dstStride1
> > +// x2  uint8_t *dst2
> > +// x3  int dstStride2
> > +// x4  const uint8_t *src
> > +// x5  int srcStride
> > +// w6  int w
> > +// w7  int h
> > +
> > +        uxtw            x1, w1
> > +        uxtw            x3, w3
> > +        uxtw            x5, w5
>
> You can often avoid the explicit uxtw instructions, if you can fold an
> uxtw attribute into the cases where the register is used. (If it's used
> often, it may be slightly more performant to do it upfront like this
> though, but often it can be omitted entirely.) And whenever you do an
> operation with a wN register as destination, the upper half of the
> register gets explicitly cleared, so these also may be avoided that way.
>
> > +
> > +        add             x9, x4, x5                  // x9 = src + srcStride
> > +        lsl             w5, w5, #1                  // srcStride *= 2
> > +
> > +1:
> > +        mov             w10, w6                     // w10 = w
> > +        mov             x11, x4                     // x11 = src1 (line 1)
> > +        mov             x12, x9                     // x12 = src2 (line 2)
> > +        mov             x13, x0                     // x13 = dst1 (dstU)
> > +        mov             x14, x2                     // x14 = dst2 (dstV)
> > +
> > +2:
> > +        ld2             { v0.16b, v1.16b }, [x11], #32 // v0 = U1, v1 = V1
> > +        ld2             { v2.16b, v3.16b }, [x12], #32 // v2 = U2, v3 = V2
> > +
> > +        uaddlp          v0.8h, v0.16b               // pairwise add U1 into v0
> > +        uaddlp          v1.8h, v1.16b               // pairwise add V1 into v1
> > +        uadalp          v0.8h, v2.16b               // pairwise add U2, accumulate into v0
> > +        uadalp          v1.8h, v3.16b               // pairwise add V2, accumulate into v1
> > +
> > +        shrn            v0.8b, v0.8h, #2            // divide by 4
> > +        shrn            v1.8b, v1.8h, #2            // divide by 4
> > +
> > +        st1             { v0.8b }, [x13], #8        // store U into dst1
> > +        st1             { v1.8b }, [x14], #8        // store V into dst2
> > +
> > +        subs            w10, w10, #8
> > +        b.gt            2b
> > +
> > +        // next row
> > +        add             x4, x4, x5                  // src1 += srcStride * 2
> > +        add             x9, x9, x5                  // src2 += srcStride * 2
> > +        add             x0, x0, x1                  // dst1 += dstStride1
> > +        add             x2, x2, x3                  // dst2 += dstStride2
>
> It's often possible to avoid the extra step of moving the pointers back
> into the x11/x12/x13/x14 registers, if you subtract the width from the
> stride at the start of the function. Then you don't need two separate
> registers for each pointer, and it shortens the dependency chain when
> moving on to the next line.
>
> If the width can be any uneven value, but we in practice write in
> increments of 8 pixels, you may need to align the width up to 8 before
> using it to decrement the stride that way though.

Thank you for the review. New patch attached.

[-- Attachment #2: v2-0004-swscale-aarch64-add-nv24-nv42-to-yuv420p-unscaled.patch --]
[-- Type: text/x-patch, Size: 6912 bytes --]

From f6ea1edb0590c14e168fbce2ae42958220b6e778 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Wed, 7 Aug 2024 18:53:12 +0200
Subject: [PATCH v2 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled
 converter

checkasm --bench for Raspberry Pi 5 Model B Rev 1.0:
nv24_yuv420p_128_c: 1320.2
nv24_yuv420p_128_neon: 709.5
nv24_yuv420p_1920_c: 12448.0
nv24_yuv420p_1920_neon: 2698.0
nv42_yuv420p_128_c: 1329.2
nv42_yuv420p_128_neon: 841.7
nv42_yuv420p_1920_c: 11967.5
nv42_yuv420p_1920_neon: 2866.5
---
 libswscale/aarch64/Makefile                |  1 +
 libswscale/aarch64/swscale_unscaled.c      | 30 ++++++++++
 libswscale/aarch64/swscale_unscaled_neon.S | 70 ++++++++++++++++++++++
 3 files changed, 101 insertions(+)
 create mode 100644 libswscale/aarch64/swscale_unscaled_neon.S

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index 37ad960619..1de8c9c0d6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -7,4 +7,5 @@ NEON-OBJS   += aarch64/hscale.o                 \
                aarch64/output.o                 \
                aarch64/range_convert_neon.o     \
                aarch64/rgb2rgb_neon.o           \
+               aarch64/swscale_unscaled_neon.o  \
                aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c
index b3093bbc9d..87bb011709 100644
--- a/libswscale/aarch64/swscale_unscaled.c
+++ b/libswscale/aarch64/swscale_unscaled.c
@@ -83,6 +83,31 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
                                         c->yuv2rgb_y_coeff);                                \
 }                                                                                           \
 
+void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1,
+                                    uint8_t *dst2, int dstStride2,
+                                    const uint8_t *src, int srcStride,
+                                    int w, int h);
+
+static int nv24_to_yuv420p_neon_wrapper(SwsContext *c, const uint8_t *src[],
+                                        int srcStride[], int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[])
+{
+    uint8_t *dst1 = dst[1] + dstStride[1] * srcSliceY / 2;
+    uint8_t *dst2 = dst[2] + dstStride[2] * srcSliceY / 2;
+
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dst[0], dstStride[0]);
+
+    if (c->srcFormat == AV_PIX_FMT_NV24)
+        ff_nv24_to_yuv420p_chroma_neon(dst1, dstStride[1], dst2, dstStride[2],
+                                       src[1], srcStride[1], c->srcW / 2, srcSliceH);
+    else
+        ff_nv24_to_yuv420p_chroma_neon(dst2, dstStride[2], dst1, dstStride[1],
+                                       src[1], srcStride[1], c->srcW / 2, srcSliceH);
+
+    return srcSliceH;
+}
+
 #define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)                                               \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)                                                     \
 DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)                                                     \
@@ -119,6 +144,11 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
     SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
     SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
     SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
+
+    if (c->dstFormat == AV_PIX_FMT_YUV420P &&
+        (c->srcFormat == AV_PIX_FMT_NV24 || c->srcFormat == AV_PIX_FMT_NV42) &&
+        !(c->srcH & 1) && !(c->srcW & 15) && !accurate_rnd)
+        c->convert_unscaled = nv24_to_yuv420p_neon_wrapper;
 }
 
 void ff_get_unscaled_swscale_aarch64(SwsContext *c)
diff --git a/libswscale/aarch64/swscale_unscaled_neon.S b/libswscale/aarch64/swscale_unscaled_neon.S
new file mode 100644
index 0000000000..7f1890f58a
--- /dev/null
+++ b/libswscale/aarch64/swscale_unscaled_neon.S
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_nv24_to_yuv420p_chroma_neon, export=1
+// x0  uint8_t *dst1
+// x1  int dstStride1
+// x2  uint8_t *dst2
+// x3  int dstStride2
+// x4  const uint8_t *src
+// x5  int srcStride
+// w6  int w
+// w7  int h
+
+        add             x9, x4, w5, sxtw            // x9 = src + srcStride
+        lsl             w5, w5, #1                  // srcStride *= 2
+        sub             w5, w5, w6, lsl #2          // srcPadding = (2 * srcStride) - (4 * w)
+        sub             w1, w1, w6                  // dstPadding1 = dstStride1 - w
+        sub             w3, w3, w6                  // dstPadding2 = dstStride2 - w
+
+1:
+        mov             w10, w6                     // w10 = w
+
+2:
+        ld2             { v0.16b, v1.16b }, [x4], #32 // v0 = U1, v1 = V1
+        ld2             { v2.16b, v3.16b }, [x9], #32 // v2 = U2, v3 = V2
+
+        uaddlp          v0.8h, v0.16b               // pairwise add U1 into v0
+        uaddlp          v1.8h, v1.16b               // pairwise add V1 into v1
+        uadalp          v0.8h, v2.16b               // pairwise add U2, accumulate into v0
+        uadalp          v1.8h, v3.16b               // pairwise add V2, accumulate into v1
+
+        shrn            v0.8b, v0.8h, #2            // divide by 4
+        shrn            v1.8b, v1.8h, #2            // divide by 4
+
+        st1             { v0.8b }, [x0], #8         // store U into dst1
+        st1             { v1.8b }, [x2], #8         // store V into dst2
+
+        subs            w10, w10, #8
+        b.gt            2b
+
+        // next row
+        add             x4, x4, x5, sxtw            // src1 += srcPadding
+        add             x9, x9, x5, sxtw            // src2 += srcPadding
+        add             x0, x0, x1, sxtw            // dst1 += dstPadding1
+        add             x2, x2, x3, sxtw            // dst2 += dstPadding2
+
+        subs            w7, w7, #2
+        b.gt            1b
+
+        ret
+endfunc
-- 
2.39.2


[-- Attachment #3: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-08-15 14:25 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-08-09 11:26 [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
2024-08-15 14:19   ` Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 3/4] swscale: export ff_copyPlane so it may be used by simd code Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
2024-08-14 12:31   ` Martin Storsjö
2024-08-15 14:25     ` Ramiro Polla

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git