* [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter
@ 2024-08-09 11:26 Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Ramiro Polla @ 2024-08-09 11:26 UTC (permalink / raw)
To: ffmpeg-devel
---
libswscale/swscale_unscaled.c | 45 +++++++++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index a5c9917799..239258ab8c 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -221,6 +221,48 @@ static int nv24ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
return srcSliceH;
}
+/**
+ * Downsample an interleaved two-component chroma plane (as used by NV24/NV42)
+ * into two separate planes, averaging every 2x2 block of samples into one.
+ *
+ * @param dst1       receives the averaged first byte of each interleaved pair
+ * @param dstStride1 byte stride of dst1
+ * @param dst2       receives the averaged second byte of each interleaved pair
+ * @param dstStride2 byte stride of dst2
+ * @param src        interleaved source; 4 bytes per row are read for every
+ *                   output sample
+ * @param srcStride  byte stride of src
+ * @param w          number of output samples per row
+ * @param h          number of source rows; each output row consumes two
+ */
+static void nv24_to_yuv420p_chroma(uint8_t *dst1, int dstStride1,
+                                   uint8_t *dst2, int dstStride2,
+                                   const uint8_t *src, int srcStride,
+                                   int w, int h)
+{
+    const uint8_t *src1 = src;
+    const uint8_t *src2 = src + srcStride;
+    // average 4 pixels into 1 (interleaved U and V)
+    for (int y = 0; y < h; y += 2) {
+        // An odd row count leaves the last row without a partner: reuse it
+        // rather than reading one row beyond the end of the source.
+        if (y + 1 == h)
+            src2 = src1;
+        for (int x = 0; x < w; x++) {
+            dst1[x] = (src1[4 * x + 0] + src1[4 * x + 2] +
+                       src2[4 * x + 0] + src2[4 * x + 2]) >> 2;
+            dst2[x] = (src1[4 * x + 1] + src1[4 * x + 3] +
+                       src2[4 * x + 1] + src2[4 * x + 3]) >> 2;
+        }
+        src1 += srcStride * 2;
+        src2 += srcStride * 2;
+        dst1 += dstStride1;
+        dst2 += dstStride2;
+    }
+}
+
+/**
+ * Unscaled conversion of one slice from NV24/NV42 to YUV420P.
+ *
+ * The luma plane is copied unchanged; the interleaved chroma plane is
+ * averaged 2x2 into the two destination chroma planes.
+ *
+ * @return srcSliceH
+ */
+static int nv24ToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
+                               int srcStride[], int srcSliceY, int srcSliceH,
+                               uint8_t *dstParam[], int dstStride[])
+{
+    // destination chroma rows start at half the source slice offset
+    uint8_t *chromaA = dstParam[1] + dstStride[1] * srcSliceY / 2;
+    uint8_t *chromaB = dstParam[2] + dstStride[2] * srcSliceY / 2;
+    // for any source format other than NV24 the two output planes swap roles
+    const int swapped = c->srcFormat != AV_PIX_FMT_NV24;
+    uint8_t *first        = swapped ? chromaB      : chromaA;
+    uint8_t *second       = swapped ? chromaA      : chromaB;
+    const int firstStride  = swapped ? dstStride[2] : dstStride[1];
+    const int secondStride = swapped ? dstStride[1] : dstStride[2];
+
+    copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+              dstParam[0], dstStride[0]);
+
+    nv24_to_yuv420p_chroma(first, firstStride, second, secondStride,
+                           src[1], srcStride[1], c->srcW / 2, srcSliceH);
+
+    return srcSliceH;
+}
+
static int planarToP01xWrapper(SwsContext *c, const uint8_t *src8[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam8[],
@@ -2206,6 +2248,9 @@ void ff_get_unscaled_swscale(SwsContext *c)
c->convert_unscaled = yuyvToYuv422Wrapper;
if (srcFormat == AV_PIX_FMT_UYVY422 && dstFormat == AV_PIX_FMT_YUV422P)
c->convert_unscaled = uyvyToYuv422Wrapper;
+ if (dstFormat == AV_PIX_FMT_YUV420P &&
+ (srcFormat == AV_PIX_FMT_NV24 || srcFormat == AV_PIX_FMT_NV42))
+ c->convert_unscaled = nv24ToYuv420Wrapper;
#define isPlanarGray(x) (isGray(x) && (x) != AV_PIX_FMT_YA8 && (x) != AV_PIX_FMT_YA16LE && (x) != AV_PIX_FMT_YA16BE)
/* simple copy */
--
2.30.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters
2024-08-09 11:26 [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
@ 2024-08-09 11:26 ` Ramiro Polla
2024-08-15 14:19 ` Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 3/4] swscale: export ff_copyPlane so it may be used by simd code Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
2 siblings, 1 reply; 7+ messages in thread
From: Ramiro Polla @ 2024-08-09 11:26 UTC (permalink / raw)
To: ffmpeg-devel
---
tests/checkasm/Makefile | 2 +-
tests/checkasm/checkasm.c | 1 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/sw_yuv2yuv.c | 131 ++++++++++++++++++++++++++++++++++++
4 files changed, 134 insertions(+), 1 deletion(-)
create mode 100644 tests/checkasm/sw_yuv2yuv.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3a7670e24b..2d2e42e445 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -63,7 +63,7 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o
CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
# swscale tests
-SWSCALEOBJS += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o sw_yuv2rgb.o
+SWSCALEOBJS += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o sw_yuv2rgb.o sw_yuv2yuv.o
CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS)
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 58597d3888..5c407de2ba 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -255,6 +255,7 @@ static const struct {
{ "sw_rgb", checkasm_check_sw_rgb },
{ "sw_scale", checkasm_check_sw_scale },
{ "sw_yuv2rgb", checkasm_check_sw_yuv2rgb },
+ { "sw_yuv2yuv", checkasm_check_sw_yuv2yuv },
#endif
#if CONFIG_AVUTIL
{ "fixed_dsp", checkasm_check_fixed_dsp },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4d5f3e387e..3e73808739 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -123,6 +123,7 @@ void checkasm_check_sw_range_convert(void);
void checkasm_check_sw_rgb(void);
void checkasm_check_sw_scale(void);
void checkasm_check_sw_yuv2rgb(void);
+void checkasm_check_sw_yuv2yuv(void);
void checkasm_check_takdsp(void);
void checkasm_check_utvideodsp(void);
void checkasm_check_v210dec(void);
diff --git a/tests/checkasm/sw_yuv2yuv.c b/tests/checkasm/sw_yuv2yuv.c
new file mode 100644
index 0000000000..b561b46768
--- /dev/null
+++ b/tests/checkasm/sw_yuv2yuv.c
@@ -0,0 +1,131 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/pixdesc.h"
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#include "checkasm.h"
+
+// Fill buf with random data, one rnd() result written via AV_WN32 every
+// 4 bytes; size is expected to be a multiple of 4.
+#define randomize_buffers(buf, size)            \
+    do {                                        \
+        for (int j = 0; j < (size); j += 4)     \
+            AV_WN32((buf) + j, rnd());          \
+    } while (0)
+
+/**
+ * Compare the reference and optimized unscaled converters from each
+ * semiplanar source format (NV24, NV42) to dst_pix_fmt.
+ *
+ * For every tested width a two-line image is converted with both
+ * implementations and the three output planes are compared byte for byte.
+ *
+ * @param dst_pix_fmt destination pixel format of the conversion under test
+ */
+static void check_semiplanar(int dst_pix_fmt)
+{
+    static const int src_fmts[] = {
+        AV_PIX_FMT_NV24,
+        AV_PIX_FMT_NV42,
+    };
+    const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dst_pix_fmt);
+#define MAX_LINE_SIZE 1920
+    static const int input_sizes[] = {8, 128, 1080, MAX_LINE_SIZE};
+
+    declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT,
+                      int, SwsContext *c, const uint8_t *src[],
+                           int srcStride[], int srcSliceY, int srcSliceH,
+                           uint8_t *dst[], int dstStride[]);
+
+    LOCAL_ALIGNED_8(uint8_t, src_y,  [MAX_LINE_SIZE * 2]);
+    LOCAL_ALIGNED_8(uint8_t, src_uv, [MAX_LINE_SIZE * 2 * 2]);
+    const uint8_t *src[4] = { src_y, src_uv };
+
+    LOCAL_ALIGNED_8(uint8_t, dst0_y, [MAX_LINE_SIZE * 2]);
+    LOCAL_ALIGNED_8(uint8_t, dst0_u, [MAX_LINE_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dst0_v, [MAX_LINE_SIZE]);
+    uint8_t *dst0[4] = { dst0_y, dst0_u, dst0_v };
+
+    LOCAL_ALIGNED_8(uint8_t, dst1_y, [MAX_LINE_SIZE * 2]);
+    LOCAL_ALIGNED_8(uint8_t, dst1_u, [MAX_LINE_SIZE]);
+    LOCAL_ALIGNED_8(uint8_t, dst1_v, [MAX_LINE_SIZE]);
+    uint8_t *dst1[4] = { dst1_y, dst1_u, dst1_v };
+
+    randomize_buffers(src_y,  MAX_LINE_SIZE * 2);
+    randomize_buffers(src_uv, MAX_LINE_SIZE * 2 * 2);
+
+    for (int sfi = 0; sfi < FF_ARRAY_ELEMS(src_fmts); sfi++) {
+        int src_pix_fmt = src_fmts[sfi];
+        const AVPixFmtDescriptor *src_desc = av_pix_fmt_desc_get(src_pix_fmt);
+        for (int isi = 0; isi < FF_ARRAY_ELEMS(input_sizes); isi++) {
+            struct SwsContext *ctx;
+            int log_level;
+            int width = input_sizes[isi];
+            int srcSliceY = 0;
+            int srcSliceH = 2;
+            int srcStride[4] = {
+                width,
+                width << 1,
+            };
+            int dstStride[4] = {
+                width,
+                width >> dst_desc->log2_chroma_w,
+                width >> dst_desc->log2_chroma_w,
+            };
+
+            // override log level to prevent spamming of the message
+            // "No accelerated colorspace conversion found from %s to %s"
+            log_level = av_log_get_level();
+            av_log_set_level(AV_LOG_ERROR);
+            ctx = sws_getContext(width, srcSliceH, src_pix_fmt,
+                                 width, srcSliceH, dst_pix_fmt,
+                                 0, NULL, NULL, NULL);
+            av_log_set_level(log_level);
+            if (!ctx) {
+                // without a context there is nothing to test; record the
+                // failure and skip instead of dereferencing NULL below
+                fail();
+                continue;
+            }
+
+            if (check_func(ctx->convert_unscaled, "%s_%s_%d", src_desc->name, dst_desc->name, width)) {
+                // pre-fill outputs so bytes left unwritten by either
+                // implementation compare equal
+                memset(dst0_y, 0xFF, MAX_LINE_SIZE * 2);
+                memset(dst0_u, 0xFF, MAX_LINE_SIZE);
+                memset(dst0_v, 0xFF, MAX_LINE_SIZE);
+                memset(dst1_y, 0xFF, MAX_LINE_SIZE * 2);
+                memset(dst1_u, 0xFF, MAX_LINE_SIZE);
+                memset(dst1_v, 0xFF, MAX_LINE_SIZE);
+
+                call_ref(ctx, src, srcStride, srcSliceY,
+                         srcSliceH, dst0, dstStride);
+                call_new(ctx, src, srcStride, srcSliceY,
+                         srcSliceH, dst1, dstStride);
+
+                if (memcmp(dst0_y, dst1_y, MAX_LINE_SIZE * 2) ||
+                    memcmp(dst0_u, dst1_u, MAX_LINE_SIZE) ||
+                    memcmp(dst0_v, dst1_v, MAX_LINE_SIZE))
+                    fail();
+
+                bench_new(ctx, src, srcStride, srcSliceY,
+                          srcSliceH, dst0, dstStride);
+            }
+            sws_freeContext(ctx);
+        }
+    }
+}
+
+#undef MAX_LINE_SIZE
+
+void checkasm_check_sw_yuv2yuv(void) /* checkasm entry point for the "sw_yuv2yuv" test group */
+{
+ check_semiplanar(AV_PIX_FMT_YUV420P); /* semiplanar sources converted to yuv420p */
+ report("yuv420p"); /* results are reported under the "yuv420p" label */
+}
--
2.30.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] [PATCH 3/4] swscale: export ff_copyPlane so it may be used by simd code
2024-08-09 11:26 [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
@ 2024-08-09 11:26 ` Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
2 siblings, 0 replies; 7+ messages in thread
From: Ramiro Polla @ 2024-08-09 11:26 UTC (permalink / raw)
To: ffmpeg-devel
---
libswscale/swscale_internal.h | 4 ++++
libswscale/swscale_unscaled.c | 42 +++++++++++++++++------------------
2 files changed, 25 insertions(+), 21 deletions(-)
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index e5610161d0..50127d288f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1009,6 +1009,10 @@ int ff_sws_alphablendaway(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]);
+void ff_copyPlane(const uint8_t *src, int srcStride, /* formerly the static copyPlane(); exported so SIMD code can call it */
+ int srcSliceY, int srcSliceH, int width,
+ uint8_t *dst, int dstStride);
+
static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
int alpha, int bits, const int big_endian)
{
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 239258ab8c..dc1d5f3593 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -122,9 +122,9 @@ static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
}
}
-static void copyPlane(const uint8_t *src, int srcStride,
- int srcSliceY, int srcSliceH, int width,
- uint8_t *dst, int dstStride)
+void ff_copyPlane(const uint8_t *src, int srcStride,
+ int srcSliceY, int srcSliceH, int width,
+ uint8_t *dst, int dstStride)
{
dst += dstStride * srcSliceY;
if (dstStride == srcStride && srcStride > 0) {
@@ -146,8 +146,8 @@ static int planarToNv12Wrapper(SwsContext *c, const uint8_t *src[],
{
uint8_t *dst = dstParam[1] + dstStride[1] * srcSliceY / 2;
- copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
- dstParam[0], dstStride[0]);
+ ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dstParam[0], dstStride[0]);
if (c->dstFormat == AV_PIX_FMT_NV12)
interleaveBytes(src[1], src[2], dst, c->chrSrcW, (srcSliceH + 1) / 2,
@@ -167,8 +167,8 @@ static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
uint8_t *dst1 = dstParam[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dst2 = dstParam[2] + dstStride[2] * srcSliceY / 2;
- copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
- dstParam[0], dstStride[0]);
+ ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dstParam[0], dstStride[0]);
if (c->srcFormat == AV_PIX_FMT_NV12)
deinterleaveBytes(src[1], dst1, dst2, c->chrSrcW, (srcSliceH + 1) / 2,
@@ -187,8 +187,8 @@ static int planarToNv24Wrapper(SwsContext *c, const uint8_t *src[],
{
uint8_t *dst = dstParam[1] + dstStride[1] * srcSliceY;
- copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
- dstParam[0], dstStride[0]);
+ ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dstParam[0], dstStride[0]);
if (c->dstFormat == AV_PIX_FMT_NV24)
interleaveBytes(src[1], src[2], dst, c->chrSrcW, srcSliceH,
@@ -208,8 +208,8 @@ static int nv24ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
uint8_t *dst1 = dstParam[1] + dstStride[1] * srcSliceY;
uint8_t *dst2 = dstParam[2] + dstStride[2] * srcSliceY;
- copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
- dstParam[0], dstStride[0]);
+ ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dstParam[0], dstStride[0]);
if (c->srcFormat == AV_PIX_FMT_NV24)
deinterleaveBytes(src[1], dst1, dst2, c->chrSrcW, srcSliceH,
@@ -250,8 +250,8 @@ static int nv24ToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
uint8_t *dst1 = dstParam[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dst2 = dstParam[2] + dstStride[2] * srcSliceY / 2;
- copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
- dstParam[0], dstStride[0]);
+ ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dstParam[0], dstStride[0]);
if (c->srcFormat == AV_PIX_FMT_NV24)
nv24_to_yuv420p_chroma(dst1, dstStride[1], dst2, dstStride[2],
@@ -1173,12 +1173,12 @@ static int planarRgbToplanarRgbWrapper(SwsContext *c,
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
- dst[0], dstStride[0]);
- copyPlane(src[1], srcStride[1], srcSliceY, srcSliceH, c->srcW,
- dst[1], dstStride[1]);
- copyPlane(src[2], srcStride[2], srcSliceY, srcSliceH, c->srcW,
- dst[2], dstStride[2]);
+ ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dst[0], dstStride[0]);
+ ff_copyPlane(src[1], srcStride[1], srcSliceY, srcSliceH, c->srcW,
+ dst[1], dstStride[1]);
+ ff_copyPlane(src[2], srcStride[2], srcSliceY, srcSliceH, c->srcW,
+ dst[2], dstStride[2]);
if (dst[3])
fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
@@ -1700,8 +1700,8 @@ static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
- dst[0], dstStride[0]);
+ ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+ dst[0], dstStride[0]);
planar2x(src[1], dst[1] + dstStride[1] * (srcSliceY >> 1), c->chrSrcW,
srcSliceH >> 2, srcStride[1], dstStride[1]);
--
2.30.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter
2024-08-09 11:26 [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 3/4] swscale: export ff_copyPlane so it may be used by simd code Ramiro Polla
@ 2024-08-09 11:26 ` Ramiro Polla
2024-08-14 12:31 ` Martin Storsjö
2 siblings, 1 reply; 7+ messages in thread
From: Ramiro Polla @ 2024-08-09 11:26 UTC (permalink / raw)
To: ffmpeg-devel
checkasm --bench for Raspberry Pi 5 Model B Rev 1.0:
nv24_yuv420p_128_c: 423.0
nv24_yuv420p_128_neon: 115.7
nv24_yuv420p_1920_c: 5939.5
nv24_yuv420p_1920_neon: 1339.7
nv42_yuv420p_128_c: 423.2
nv42_yuv420p_128_neon: 115.7
nv42_yuv420p_1920_c: 5907.5
nv42_yuv420p_1920_neon: 1342.5
---
libswscale/aarch64/Makefile | 1 +
libswscale/aarch64/swscale_unscaled.c | 30 +++++++++
libswscale/aarch64/swscale_unscaled_neon.S | 75 ++++++++++++++++++++++
3 files changed, 106 insertions(+)
create mode 100644 libswscale/aarch64/swscale_unscaled_neon.S
diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index 37ad960619..1de8c9c0d6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -7,4 +7,5 @@ NEON-OBJS += aarch64/hscale.o \
aarch64/output.o \
aarch64/range_convert_neon.o \
aarch64/rgb2rgb_neon.o \
+ aarch64/swscale_unscaled_neon.o \
aarch64/yuv2rgb_neon.o \
diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c
index b3093bbc9d..87bb011709 100644
--- a/libswscale/aarch64/swscale_unscaled.c
+++ b/libswscale/aarch64/swscale_unscaled.c
@@ -83,6 +83,31 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
c->yuv2rgb_y_coeff); \
} \
+void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1, /* implemented in aarch64/swscale_unscaled_neon.S */
+ uint8_t *dst2, int dstStride2,
+ const uint8_t *src, int srcStride,
+ int w, int h); /* w = output samples per row, h = source rows */
+
+/**
+ * NEON-accelerated unscaled conversion of one slice from NV24/NV42 to
+ * YUV420P: luma is copied as is, chroma is averaged 2x2 by the assembly
+ * routine.
+ *
+ * @return srcSliceH
+ */
+static int nv24_to_yuv420p_neon_wrapper(SwsContext *c, const uint8_t *src[],
+                                        int srcStride[], int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[])
+{
+    // destination chroma rows start at half the source slice offset
+    uint8_t *chromaA = dst[1] + dstStride[1] * srcSliceY / 2;
+    uint8_t *chromaB = dst[2] + dstStride[2] * srcSliceY / 2;
+    // for any source format other than NV24 the two output planes swap roles
+    const int swapped = c->srcFormat != AV_PIX_FMT_NV24;
+    uint8_t *first        = swapped ? chromaB      : chromaA;
+    uint8_t *second       = swapped ? chromaA      : chromaB;
+    const int firstStride  = swapped ? dstStride[2] : dstStride[1];
+    const int secondStride = swapped ? dstStride[1] : dstStride[2];
+
+    ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
+                 dst[0], dstStride[0]);
+
+    ff_nv24_to_yuv420p_chroma_neon(first, firstStride, second, secondStride,
+                                   src[1], srcStride[1], c->srcW / 2, srcSliceH);
+
+    return srcSliceH;
+}
+
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \
@@ -119,6 +144,11 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
+
+ if (c->dstFormat == AV_PIX_FMT_YUV420P &&
+ (c->srcFormat == AV_PIX_FMT_NV24 || c->srcFormat == AV_PIX_FMT_NV42) &&
+ !(c->srcH & 1) && !(c->srcW & 15) && !accurate_rnd)
+ c->convert_unscaled = nv24_to_yuv420p_neon_wrapper;
}
void ff_get_unscaled_swscale_aarch64(SwsContext *c)
diff --git a/libswscale/aarch64/swscale_unscaled_neon.S b/libswscale/aarch64/swscale_unscaled_neon.S
new file mode 100644
index 0000000000..a206fda41f
--- /dev/null
+++ b/libswscale/aarch64/swscale_unscaled_neon.S
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// Average 2x2 blocks of an interleaved two-component chroma plane into two
+// separate planes. Each inner iteration reads 32 bytes from each of two
+// source rows and writes 8 bytes to each destination, so w is consumed in
+// steps of 8 and h in steps of 2.
+function ff_nv24_to_yuv420p_chroma_neon, export=1
+// x0  uint8_t *dst1
+// x1  int dstStride1
+// x2  uint8_t *dst2
+// x3  int dstStride2
+// x4  const uint8_t *src
+// x5  int srcStride
+// w6  int w
+// w7  int h
+
+        // strides are signed ints: sign-extend so that negative strides
+        // still produce correct 64-bit pointer offsets
+        sxtw            x1, w1
+        sxtw            x3, w3
+        sxtw            x5, w5
+
+        add             x9, x4, x5                  // x9 = src + srcStride
+        lsl             x5, x5, #1                  // srcStride *= 2 (64-bit shift keeps the sign)
+
+1:
+        mov             w10, w6                     // w10 = w
+        mov             x11, x4                     // x11 = src1 (line 1)
+        mov             x12, x9                     // x12 = src2 (line 2)
+        mov             x13, x0                     // x13 = dst1 (dstU)
+        mov             x14, x2                     // x14 = dst2 (dstV)
+
+2:
+        ld2             { v0.16b, v1.16b }, [x11], #32  // v0 = U1, v1 = V1
+        ld2             { v2.16b, v3.16b }, [x12], #32  // v2 = U2, v3 = V2
+
+        uaddlp          v0.8h, v0.16b               // pairwise add U1 into v0
+        uaddlp          v1.8h, v1.16b               // pairwise add V1 into v1
+        uadalp          v0.8h, v2.16b               // pairwise add U2, accumulate into v0
+        uadalp          v1.8h, v3.16b               // pairwise add V2, accumulate into v1
+
+        shrn            v0.8b, v0.8h, #2            // divide by 4
+        shrn            v1.8b, v1.8h, #2            // divide by 4
+
+        st1             { v0.8b }, [x13], #8        // store U into dst1
+        st1             { v1.8b }, [x14], #8        // store V into dst2
+
+        subs            w10, w10, #8
+        b.gt            2b
+
+        // next row
+        add             x4, x4, x5                  // src1 += srcStride * 2
+        add             x9, x9, x5                  // src2 += srcStride * 2
+        add             x0, x0, x1                  // dst1 += dstStride1
+        add             x2, x2, x3                  // dst2 += dstStride2
+
+        subs            w7, w7, #2
+        b.gt            1b
+
+        ret
+endfunc
--
2.30.2
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
@ 2024-08-14 12:31 ` Martin Storsjö
2024-08-15 14:25 ` Ramiro Polla
0 siblings, 1 reply; 7+ messages in thread
From: Martin Storsjö @ 2024-08-14 12:31 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Fri, 9 Aug 2024, Ramiro Polla wrote:
> checkasm --bench for Raspberry Pi 5 Model B Rev 1.0:
> nv24_yuv420p_128_c: 423.0
> nv24_yuv420p_128_neon: 115.7
> nv24_yuv420p_1920_c: 5939.5
> nv24_yuv420p_1920_neon: 1339.7
> nv42_yuv420p_128_c: 423.2
> nv42_yuv420p_128_neon: 115.7
> nv42_yuv420p_1920_c: 5907.5
> nv42_yuv420p_1920_neon: 1342.5
> ---
> libswscale/aarch64/Makefile | 1 +
> libswscale/aarch64/swscale_unscaled.c | 30 +++++++++
> libswscale/aarch64/swscale_unscaled_neon.S | 75 ++++++++++++++++++++++
> 3 files changed, 106 insertions(+)
> create mode 100644 libswscale/aarch64/swscale_unscaled_neon.S
> diff --git a/libswscale/aarch64/swscale_unscaled_neon.S b/libswscale/aarch64/swscale_unscaled_neon.S
> new file mode 100644
> index 0000000000..a206fda41f
> --- /dev/null
> +++ b/libswscale/aarch64/swscale_unscaled_neon.S
> @@ -0,0 +1,75 @@
> +/*
> + * Copyright (c) 2024 Ramiro Polla
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +function ff_nv24_to_yuv420p_chroma_neon, export=1
> +// x0 uint8_t *dst1
> +// x1 int dstStride1
> +// x2 uint8_t *dst2
> +// x3 int dstStride2
> +// x4 const uint8_t *src
> +// x5 int srcStride
> +// w6 int w
> +// w7 int h
> +
> + uxtw x1, w1
> + uxtw x3, w3
> + uxtw x5, w5
You can often avoid the explicit uxtw instructions, if you can fold an
uxtw attribute into the cases where the register is used. (If it's used
often, it may be slightly more performant to do it upfront like this
though, but often it can be omitted entirely.) And whenever you do an
operation with a wN register as destination, the upper half of the
register gets explicitly cleared, so these also may be avoided that way.
> +
> + add x9, x4, x5 // x9 = src + srcStride
> + lsl w5, w5, #1 // srcStride *= 2
> +
> +1:
> + mov w10, w6 // w10 = w
> + mov x11, x4 // x11 = src1 (line 1)
> + mov x12, x9 // x12 = src2 (line 2)
> + mov x13, x0 // x13 = dst1 (dstU)
> + mov x14, x2 // x14 = dst2 (dstV)
> +
> +2:
> + ld2 { v0.16b, v1.16b }, [x11], #32 // v0 = U1, v1 = V1
> + ld2 { v2.16b, v3.16b }, [x12], #32 // v2 = U2, v3 = V2
> +
> + uaddlp v0.8h, v0.16b // pairwise add U1 into v0
> + uaddlp v1.8h, v1.16b // pairwise add V1 into v1
> + uadalp v0.8h, v2.16b // pairwise add U2, accumulate into v0
> + uadalp v1.8h, v3.16b // pairwise add V2, accumulate into v1
> +
> + shrn v0.8b, v0.8h, #2 // divide by 4
> + shrn v1.8b, v1.8h, #2 // divide by 4
> +
> + st1 { v0.8b }, [x13], #8 // store U into dst1
> + st1 { v1.8b }, [x14], #8 // store V into dst2
> +
> + subs w10, w10, #8
> + b.gt 2b
> +
> + // next row
> + add x4, x4, x5 // src1 += srcStride * 2
> + add x9, x9, x5 // src2 += srcStride * 2
> + add x0, x0, x1 // dst1 += dstStride1
> + add x2, x2, x3 // dst2 += dstStride2
It's often possible to avoid the extra step of moving the pointers back
into the x11/x12/x13/x14 registers, if you subtract the width from the
stride at the start of the function. Then you don't need two separate
registers for each pointer, and shortens dependency chain when moving on
to the next line.
If the width can be any uneven value, but we in practice write in
increments of 8 pixels, you may need to align the width up to 8 before
using it to decrement the stride that way though.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
@ 2024-08-15 14:19 ` Ramiro Polla
0 siblings, 0 replies; 7+ messages in thread
From: Ramiro Polla @ 2024-08-15 14:19 UTC (permalink / raw)
To: ffmpeg-devel
[-- Attachment #1: Type: text/plain, Size: 515 bytes --]
On Fri, Aug 9, 2024 at 1:26 PM Ramiro Polla <ramiro.polla@gmail.com> wrote:
>
> ---
> tests/checkasm/Makefile | 2 +-
> tests/checkasm/checkasm.c | 1 +
> tests/checkasm/checkasm.h | 1 +
> tests/checkasm/sw_yuv2yuv.c | 131 ++++++++++++++++++++++++++++++++++++
> 4 files changed, 134 insertions(+), 1 deletion(-)
> create mode 100644 tests/checkasm/sw_yuv2yuv.c
New patch attached improves the tests by converting 4 lines instead of
2 and by not setting stride to be equal to width.
[-- Attachment #2: v2-0002-checkasm-yuv2yuv-add-tests-for-semiplanar-unscale.patch --]
[-- Type: text/x-patch, Size: 7582 bytes --]
From 96122637b5df57da53208d9a81b79b5b2e4707b6 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Wed, 7 Aug 2024 23:29:43 +0200
Subject: [PATCH v2 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled
converters
---
tests/checkasm/Makefile | 2 +-
tests/checkasm/checkasm.c | 1 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/sw_yuv2yuv.c | 133 ++++++++++++++++++++++++++++++++++++
4 files changed, 136 insertions(+), 1 deletion(-)
create mode 100644 tests/checkasm/sw_yuv2yuv.c
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3a7670e24b..2d2e42e445 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -63,7 +63,7 @@ AVFILTEROBJS-$(CONFIG_SOBEL_FILTER) += vf_convolution.o
CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
# swscale tests
-SWSCALEOBJS += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o sw_yuv2rgb.o
+SWSCALEOBJS += sw_gbrp.o sw_range_convert.o sw_rgb.o sw_scale.o sw_yuv2rgb.o sw_yuv2yuv.o
CHECKASMOBJS-$(CONFIG_SWSCALE) += $(SWSCALEOBJS)
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 58597d3888..5c407de2ba 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -255,6 +255,7 @@ static const struct {
{ "sw_rgb", checkasm_check_sw_rgb },
{ "sw_scale", checkasm_check_sw_scale },
{ "sw_yuv2rgb", checkasm_check_sw_yuv2rgb },
+ { "sw_yuv2yuv", checkasm_check_sw_yuv2yuv },
#endif
#if CONFIG_AVUTIL
{ "fixed_dsp", checkasm_check_fixed_dsp },
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4d5f3e387e..3e73808739 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -123,6 +123,7 @@ void checkasm_check_sw_range_convert(void);
void checkasm_check_sw_rgb(void);
void checkasm_check_sw_scale(void);
void checkasm_check_sw_yuv2rgb(void);
+void checkasm_check_sw_yuv2yuv(void);
void checkasm_check_takdsp(void);
void checkasm_check_utvideodsp(void);
void checkasm_check_v210dec(void);
diff --git a/tests/checkasm/sw_yuv2yuv.c b/tests/checkasm/sw_yuv2yuv.c
new file mode 100644
index 0000000000..90a51601ed
--- /dev/null
+++ b/tests/checkasm/sw_yuv2yuv.c
@@ -0,0 +1,133 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/pixdesc.h"
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#include "checkasm.h"
+
+#define randomize_buffers(buf, size) /* fill buf with rnd() values, one 32-bit store per step */ \
+ do { \
+ for (int j = 0; j < size; j += 4) /* 4 bytes written per step: size should be a multiple of 4 */ \
+ AV_WN32(buf + j, rnd()); /* NOTE(review): buf/size are expanded unparenthesized; pass simple expressions */ \
+ } while (0)
+
+static void check_semiplanar(int dst_pix_fmt) // for NV24 and NV42 input: run ctx->convert_unscaled via call_ref and call_new, fail if outputs differ
+{
+ static const int src_fmts[] = { // source pixel formats under test
+ AV_PIX_FMT_NV24,
+ AV_PIX_FMT_NV42,
+ };
+ const AVPixFmtDescriptor *dst_desc = av_pix_fmt_desc_get(dst_pix_fmt);
+#define NUM_LINES 4 // picture height passed to sws_getContext and slice height of every call
+#define MAX_LINE_SIZE 1920 // largest tested width; also used as the luma stride for every width
+ static const int input_sizes[] = {8, 128, 1080, MAX_LINE_SIZE}; // picture widths under test
+
+ declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT,
+ int, SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[]);
+
+ LOCAL_ALIGNED_8(uint8_t, src_y, [MAX_LINE_SIZE * NUM_LINES]);
+ LOCAL_ALIGNED_8(uint8_t, src_uv, [MAX_LINE_SIZE * NUM_LINES * 2]); // second source plane: twice the luma size, matching srcStride[1]
+ const uint8_t *src[4] = { src_y, src_uv };
+
+ LOCAL_ALIGNED_8(uint8_t, dst0_y, [MAX_LINE_SIZE * NUM_LINES]); // dst0_*: output of call_ref
+ LOCAL_ALIGNED_8(uint8_t, dst0_u, [MAX_LINE_SIZE * NUM_LINES / 2]);
+ LOCAL_ALIGNED_8(uint8_t, dst0_v, [MAX_LINE_SIZE * NUM_LINES / 2]);
+ uint8_t *dst0[4] = { dst0_y, dst0_u, dst0_v };
+
+ LOCAL_ALIGNED_8(uint8_t, dst1_y, [MAX_LINE_SIZE * NUM_LINES]); // dst1_*: output of call_new
+ LOCAL_ALIGNED_8(uint8_t, dst1_u, [MAX_LINE_SIZE * NUM_LINES / 2]);
+ LOCAL_ALIGNED_8(uint8_t, dst1_v, [MAX_LINE_SIZE * NUM_LINES / 2]);
+ uint8_t *dst1[4] = { dst1_y, dst1_u, dst1_v };
+
+ randomize_buffers(src_y, MAX_LINE_SIZE * NUM_LINES); // input is randomized once and reused for every format/width
+ randomize_buffers(src_uv, MAX_LINE_SIZE * NUM_LINES * 2);
+
+ for (int sfi = 0; sfi < FF_ARRAY_ELEMS(src_fmts); sfi++) {
+ int src_pix_fmt = src_fmts[sfi];
+ const AVPixFmtDescriptor *src_desc = av_pix_fmt_desc_get(src_pix_fmt);
+ for (int isi = 0; isi < FF_ARRAY_ELEMS(input_sizes); isi++) {
+ struct SwsContext *ctx;
+ int log_level;
+ int width = input_sizes[isi];
+ int srcSliceY = 0; // the whole picture is converted as a single slice
+ int srcSliceH = NUM_LINES;
+ int srcStride[4] = { // strides do not depend on width
+ MAX_LINE_SIZE,
+ MAX_LINE_SIZE * 2,
+ };
+ int dstStride[4] = {
+ MAX_LINE_SIZE,
+ MAX_LINE_SIZE >> dst_desc->log2_chroma_w, // chroma strides follow the destination's horizontal subsampling
+ MAX_LINE_SIZE >> dst_desc->log2_chroma_w,
+ };
+
+ // override log level to prevent spamming of the message
+ // "No accelerated colorspace conversion found from %s to %s"
+ log_level = av_log_get_level();
+ av_log_set_level(AV_LOG_ERROR);
+ ctx = sws_getContext(width, srcSliceH, src_pix_fmt,
+ width, srcSliceH, dst_pix_fmt,
+ 0, NULL, NULL, NULL); // same dimensions on both sides: no scaling requested
+ av_log_set_level(log_level); // restore the caller's log level
+ if (!ctx)
+ fail(); // NOTE(review): ctx is still dereferenced below when it is NULL - confirm fail() semantics
+
+ if (check_func(ctx->convert_unscaled, "%s_%s_%d", src_desc->name, dst_desc->name, width)) {
+ memset(dst0_y, 0xFF, MAX_LINE_SIZE * NUM_LINES); // pre-fill both output sets identically so untouched bytes compare equal
+ memset(dst0_u, 0xFF, MAX_LINE_SIZE * NUM_LINES / 2);
+ memset(dst0_v, 0xFF, MAX_LINE_SIZE * NUM_LINES / 2);
+ memset(dst1_y, 0xFF, MAX_LINE_SIZE * NUM_LINES);
+ memset(dst1_u, 0xFF, MAX_LINE_SIZE * NUM_LINES / 2);
+ memset(dst1_v, 0xFF, MAX_LINE_SIZE * NUM_LINES / 2);
+
+ call_ref(ctx, src, srcStride, srcSliceY,
+ srcSliceH, dst0, dstStride);
+ call_new(ctx, src, srcStride, srcSliceY,
+ srcSliceH, dst1, dstStride);
+
+ if (memcmp(dst0_y, dst1_y, MAX_LINE_SIZE * NUM_LINES) || // entire buffers are compared, not just the width actually written
+ memcmp(dst0_u, dst1_u, MAX_LINE_SIZE * NUM_LINES / 2) ||
+ memcmp(dst0_v, dst1_v, MAX_LINE_SIZE * NUM_LINES / 2))
+ fail();
+
+ bench_new(ctx, src, srcStride, srcSliceY, // output lands in dst0 and is not checked afterwards
+ srcSliceH, dst0, dstStride);
+ }
+ sws_freeContext(ctx);
+ }
+ }
+}
+
+#undef NUM_LINES
+#undef MAX_LINE_SIZE
+
+void checkasm_check_sw_yuv2yuv(void) // entry point listed in the checkasm test table under "sw_yuv2yuv"
+{
+ check_semiplanar(AV_PIX_FMT_YUV420P); // NV24/NV42 -> yuv420p is the only destination exercised
+ report("yuv420p");
+}
--
2.39.2
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter
2024-08-14 12:31 ` Martin Storsjö
@ 2024-08-15 14:25 ` Ramiro Polla
0 siblings, 0 replies; 7+ messages in thread
From: Ramiro Polla @ 2024-08-15 14:25 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 5270 bytes --]
On Wed, Aug 14, 2024 at 7:21 PM Martin Storsjö <martin@martin.st> wrote:
> On Fri, 9 Aug 2024, Ramiro Polla wrote:
> > checkasm --bench for Raspberry Pi 5 Model B Rev 1.0:
> > nv24_yuv420p_128_c: 423.0
> > nv24_yuv420p_128_neon: 115.7
> > nv24_yuv420p_1920_c: 5939.5
> > nv24_yuv420p_1920_neon: 1339.7
> > nv42_yuv420p_128_c: 423.2
> > nv42_yuv420p_128_neon: 115.7
> > nv42_yuv420p_1920_c: 5907.5
> > nv42_yuv420p_1920_neon: 1342.5
> > ---
> > libswscale/aarch64/Makefile | 1 +
> > libswscale/aarch64/swscale_unscaled.c | 30 +++++++++
> > libswscale/aarch64/swscale_unscaled_neon.S | 75 ++++++++++++++++++++++
> > 3 files changed, 106 insertions(+)
> > create mode 100644 libswscale/aarch64/swscale_unscaled_neon.S
>
> > diff --git a/libswscale/aarch64/swscale_unscaled_neon.S b/libswscale/aarch64/swscale_unscaled_neon.S
> > new file mode 100644
> > index 0000000000..a206fda41f
> > --- /dev/null
> > +++ b/libswscale/aarch64/swscale_unscaled_neon.S
> > @@ -0,0 +1,75 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +function ff_nv24_to_yuv420p_chroma_neon, export=1
> > +// x0 uint8_t *dst1
> > +// x1 int dstStride1
> > +// x2 uint8_t *dst2
> > +// x3 int dstStride2
> > +// x4 const uint8_t *src
> > +// x5 int srcStride
> > +// w6 int w
> > +// w7 int h
> > +
> > + uxtw x1, w1
> > + uxtw x3, w3
> > + uxtw x5, w5
>
> You can often avoid the explicit uxtw instructions, if you can fold an
> uxtw attribute into the cases where the register is used. (If it's used
> often, it may be slightly more performant to do it upfront like this
> though, but often it can be omitted entirely.) And whenever you do an
> operation with a wN register as destination, the upper half of the
> register gets explicitly cleared, so these also may be avoided that way.
>
> > +
> > + add x9, x4, x5 // x9 = src + srcStride
> > + lsl w5, w5, #1 // srcStride *= 2
> > +
> > +1:
> > + mov w10, w6 // w10 = w
> > + mov x11, x4 // x11 = src1 (line 1)
> > + mov x12, x9 // x12 = src2 (line 2)
> > + mov x13, x0 // x13 = dst1 (dstU)
> > + mov x14, x2 // x14 = dst2 (dstV)
> > +
> > +2:
> > + ld2 { v0.16b, v1.16b }, [x11], #32 // v0 = U1, v1 = V1
> > + ld2 { v2.16b, v3.16b }, [x12], #32 // v2 = U2, v3 = V2
> > +
> > + uaddlp v0.8h, v0.16b // pairwise add U1 into v0
> > + uaddlp v1.8h, v1.16b // pairwise add V1 into v1
> > + uadalp v0.8h, v2.16b // pairwise add U2, accumulate into v0
> > + uadalp v1.8h, v3.16b // pairwise add V2, accumulate into v1
> > +
> > + shrn v0.8b, v0.8h, #2 // divide by 4
> > + shrn v1.8b, v1.8h, #2 // divide by 4
> > +
> > + st1 { v0.8b }, [x13], #8 // store U into dst1
> > + st1 { v1.8b }, [x14], #8 // store V into dst2
> > +
> > + subs w10, w10, #8
> > + b.gt 2b
> > +
> > + // next row
> > + add x4, x4, x5 // src1 += srcStride * 2
> > + add x9, x9, x5 // src2 += srcStride * 2
> > + add x0, x0, x1 // dst1 += dstStride1
> > + add x2, x2, x3 // dst2 += dstStride2
>
> It's often possible to avoid the extra step of moving the pointers back
> into the x11/x12/x13/x14 registers, if you subtract the width from the
> stride at the start of the function. Then you don't need two separate
> registers for each pointer, and it shortens the dependency chain when
> moving on to the next line.
>
> If the width can be any uneven value, but we in practice write in
> increments of 8 pixels, you may need to align the width up to 8 before
> using it to decrement the stride that way though.
Thank you for the review. New patch attached.
[-- Attachment #2: v2-0004-swscale-aarch64-add-nv24-nv42-to-yuv420p-unscaled.patch --]
[-- Type: text/x-patch, Size: 6912 bytes --]
From f6ea1edb0590c14e168fbce2ae42958220b6e778 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Wed, 7 Aug 2024 18:53:12 +0200
Subject: [PATCH v2 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled
converter
checkasm --bench for Raspberry Pi 5 Model B Rev 1.0:
nv24_yuv420p_128_c: 1320.2
nv24_yuv420p_128_neon: 709.5
nv24_yuv420p_1920_c: 12448.0
nv24_yuv420p_1920_neon: 2698.0
nv42_yuv420p_128_c: 1329.2
nv42_yuv420p_128_neon: 841.7
nv42_yuv420p_1920_c: 11967.5
nv42_yuv420p_1920_neon: 2866.5
---
libswscale/aarch64/Makefile | 1 +
libswscale/aarch64/swscale_unscaled.c | 30 ++++++++++
libswscale/aarch64/swscale_unscaled_neon.S | 70 ++++++++++++++++++++++
3 files changed, 101 insertions(+)
create mode 100644 libswscale/aarch64/swscale_unscaled_neon.S
diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index 37ad960619..1de8c9c0d6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -7,4 +7,5 @@ NEON-OBJS += aarch64/hscale.o \
aarch64/output.o \
aarch64/range_convert_neon.o \
aarch64/rgb2rgb_neon.o \
+ aarch64/swscale_unscaled_neon.o \
aarch64/yuv2rgb_neon.o \
diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c
index b3093bbc9d..87bb011709 100644
--- a/libswscale/aarch64/swscale_unscaled.c
+++ b/libswscale/aarch64/swscale_unscaled.c
@@ -83,6 +83,31 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
c->yuv2rgb_y_coeff); \
} \
+void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1, // implemented in swscale_unscaled_neon.S
+ uint8_t *dst2, int dstStride2,
+ const uint8_t *src, int srcStride,
+ int w, int h);
+
+static int nv24_to_yuv420p_neon_wrapper(SwsContext *c, const uint8_t *src[], // convert_unscaled callback for NV24/NV42 -> yuv420p
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+{
+ uint8_t *dst1 = dst[1] + dstStride[1] * srcSliceY / 2; // slice start in dst plane 1 (row srcSliceY / 2 when srcSliceY is even)
+ uint8_t *dst2 = dst[2] + dstStride[2] * srcSliceY / 2; // same for dst plane 2
+
+ ff_copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW, // plane 0 goes through ff_copyPlane (presumably a plain copy - confirm)
+ dst[0], dstStride[0]);
+
+ if (c->srcFormat == AV_PIX_FMT_NV24)
+ ff_nv24_to_yuv420p_chroma_neon(dst1, dstStride[1], dst2, dstStride[2],
+ src[1], srcStride[1], c->srcW / 2, srcSliceH); // w is the output chroma width, h the source slice height
+ else // other source format: same routine with the two destination planes swapped
+ ff_nv24_to_yuv420p_chroma_neon(dst2, dstStride[2], dst1, dstStride[1],
+ src[1], srcStride[1], c->srcW / 2, srcSliceH);
+
+ return srcSliceH; // reports the full slice height as processed
+}
+
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \
@@ -119,6 +144,11 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
+
+ if (c->dstFormat == AV_PIX_FMT_YUV420P &&
+ (c->srcFormat == AV_PIX_FMT_NV24 || c->srcFormat == AV_PIX_FMT_NV42) &&
+ !(c->srcH & 1) && !(c->srcW & 15) && !accurate_rnd)
+ c->convert_unscaled = nv24_to_yuv420p_neon_wrapper;
}
void ff_get_unscaled_swscale_aarch64(SwsContext *c)
diff --git a/libswscale/aarch64/swscale_unscaled_neon.S b/libswscale/aarch64/swscale_unscaled_neon.S
new file mode 100644
index 0000000000..7f1890f58a
--- /dev/null
+++ b/libswscale/aarch64/swscale_unscaled_neon.S
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_nv24_to_yuv420p_chroma_neon, export=1
+// x0 uint8_t *dst1
+// w1 int dstStride1 (32-bit; sign-extended where it is added to a pointer)
+// x2 uint8_t *dst2
+// w3 int dstStride2 (32-bit; sign-extended where it is added to a pointer)
+// x4 const uint8_t *src
+// w5 int srcStride (32-bit; sign-extended where it is added to a pointer)
+// w6 int w (output samples per row; the inner loop handles 8 at a time)
+// w7 int h (source rows; two are consumed per output row)
+
+ add x9, x4, w5, sxtw // x9 = src + srcStride
+ lsl w5, w5, #1 // srcStride *= 2
+ sub w5, w5, w6, lsl #2 // srcPadding = (2 * srcStride) - (4 * w)
+ sub w1, w1, w6 // dstPadding1 = dstStride1 - w
+ sub w3, w3, w6 // dstPadding2 = dstStride2 - w
+
+1: // outer loop: one output row from two source rows
+ mov w10, w6 // w10 = w
+
+2: // inner loop: 8 output samples per plane from 32 source bytes per row
+ ld2 { v0.16b, v1.16b }, [x4], #32 // v0 = U1, v1 = V1
+ ld2 { v2.16b, v3.16b }, [x9], #32 // v2 = U2, v3 = V2
+
+ uaddlp v0.8h, v0.16b // pairwise add U1 into v0
+ uaddlp v1.8h, v1.16b // pairwise add V1 into v1
+ uadalp v0.8h, v2.16b // pairwise add U2, accumulate into v0
+ uadalp v1.8h, v3.16b // pairwise add V2, accumulate into v1
+
+ shrn v0.8b, v0.8h, #2 // divide by 4 (plain shift, no rounding)
+ shrn v1.8b, v1.8h, #2 // divide by 4 (plain shift, no rounding)
+
+ st1 { v0.8b }, [x0], #8 // store U into dst1
+ st1 { v1.8b }, [x2], #8 // store V into dst2
+
+ subs w10, w10, #8 // 8 output samples done
+ b.gt 2b // a w that is not a multiple of 8 would overrun the row by a full iteration
+
+ // next row
+ add x4, x4, x5, sxtw // src1 += srcPadding
+ add x9, x9, x5, sxtw // src2 += srcPadding
+ add x0, x0, x1, sxtw // dst1 += dstPadding1
+ add x2, x2, x3, sxtw // dst2 += dstPadding2
+
+ subs w7, w7, #2 // two source rows consumed
+ b.gt 1b // repeat while source rows remain
+
+ ret
+endfunc
--
2.39.2
[-- Attachment #3: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2024-08-15 14:25 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-08-09 11:26 [FFmpeg-devel] [PATCH 1/4] swscale: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 2/4] checkasm/yuv2yuv: add tests for semiplanar unscaled converters Ramiro Polla
2024-08-15 14:19 ` Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 3/4] swscale: export ff_copyPlane so it may be used by simd code Ramiro Polla
2024-08-09 11:26 ` [FFmpeg-devel] [PATCH 4/4] swscale/aarch64: add nv24/nv42 to yuv420p unscaled converter Ramiro Polla
2024-08-14 12:31 ` Martin Storsjö
2024-08-15 14:25 ` Ramiro Polla
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git