* [FFmpeg-devel] [PATCH v3 5/5] avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON
@ 2024-04-03 6:43 Geoff Hill
2024-04-04 13:01 ` Martin Storsjö
0 siblings, 1 reply; 2+ messages in thread
From: Geoff Hill @ 2024-04-03 6:43 UTC (permalink / raw)
To: ffmpeg-devel
Signed-off-by: Geoff Hill <geoff@geoffhill.org>
---
libavcodec/aarch64/ac3dsp_init_aarch64.c | 5 ++++
libavcodec/aarch64/ac3dsp_neon.S | 35 ++++++++++++++++++++++++
tests/checkasm/ac3dsp.c | 26 ++++++++++++++++++
3 files changed, 66 insertions(+)
diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index e95436c651..e367353e11 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -32,6 +32,10 @@ void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
const int32_t *coef0,
const int32_t *coef1,
int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+ const float *coef0,
+ const float *coef1,
+ int len);
av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
{
@@ -42,4 +46,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
c->extract_exponents = ff_ac3_extract_exponents_neon;
c->float_to_fixed24 = ff_float_to_fixed24_neon;
c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+ c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
}
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index fa8fcf2e47..4a78ec0b2a 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -88,3 +88,38 @@ function ff_ac3_sum_square_butterfly_int32_neon, export=1
st1 {v0.1d-v3.1d}, [x0]
1: ret
endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1 // x0 = sum[4], x1 = coef0, x2 = coef1, w3 = len
+ cbz w3, 1f // len == 0: return immediately, sum[] is left unwritten
+ movi v0.4s, #0 // v0: per-lane running sum of coef0^2
+ movi v1.4s, #0 // v1: per-lane running sum of coef1^2
+ movi v2.4s, #0 // v2: per-lane running sum of (coef0 + coef1)^2
+ movi v3.4s, #0 // v3: per-lane running sum of (coef0 - coef1)^2
+0: ld1 {v30.4s}, [x1], #16 // load 4 floats from coef0, advance x1 by 16 bytes
+ ld1 {v31.4s}, [x2], #16 // load 4 floats from coef1, advance x2 by 16 bytes
+ fadd v16.4s, v30.4s, v31.4s // v16 = coef0 + coef1
+ fsub v17.4s, v30.4s, v31.4s // v17 = coef0 - coef1
+ fmul v30.4s, v30.4s, v30.4s // square coef0 in place (inputs no longer needed after v16/v17)
+ fadd v0.4s, v0.4s, v30.4s // NOTE(review): each fmul+fadd pair might be foldable into one fmla - confirm rounding is acceptable
+ fmul v31.4s, v31.4s, v31.4s // square coef1 in place
+ fadd v1.4s, v1.4s, v31.4s // accumulate coef1^2
+ fmul v16.4s, v16.4s, v16.4s // (coef0 + coef1)^2
+ fadd v2.4s, v2.4s, v16.4s // accumulate (coef0 + coef1)^2
+ fmul v17.4s, v17.4s, v17.4s // (coef0 - coef1)^2
+ fadd v3.4s, v3.4s, v17.4s // accumulate (coef0 - coef1)^2
+ subs w3, w3, #4 // 4 elements consumed per iteration
+ b.gt 0b // loop while count > 0; a len that is not a multiple of 4 still loads a full final vector
+ faddp v0.4s, v0.4s, v0.4s // pairwise add: lanes become {a0+a1, a2+a3, a0+a1, a2+a3}
+ faddp v0.2s, v0.2s, v0.2s // lane 0 = a0+a1+a2+a3
+ st1 {v0.s}[0], [x0], #4 // sum[0] = total of coef0^2, advance x0 by 4 bytes
+ faddp v1.4s, v1.4s, v1.4s // same horizontal reduction for v1
+ faddp v1.2s, v1.2s, v1.2s
+ st1 {v1.s}[0], [x0], #4 // sum[1] = total of coef1^2
+ faddp v2.4s, v2.4s, v2.4s // same horizontal reduction for v2
+ faddp v2.2s, v2.2s, v2.2s
+ st1 {v2.s}[0], [x0], #4 // sum[2] = total of (coef0 + coef1)^2
+ faddp v3.4s, v3.4s, v3.4s // same horizontal reduction for v3
+ faddp v3.2s, v3.2s, v3.2s
+ st1 {v3.s}[0], [x0] // sum[3] = total of (coef0 - coef1)^2 (no post-increment needed)
+1: ret
+endfunc
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index c920dc9eb0..ef5186cfc1 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -162,6 +162,31 @@ static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
report("ac3_sum_square_butterfly_int32");
}
+static void check_ac3_sum_square_butterfly_float(AC3DSPContext *c) {
+    LOCAL_ALIGNED_32(float, lt, [ELEMS]);
+    LOCAL_ALIGNED_32(float, rt, [ELEMS]);
+    LOCAL_ALIGNED_16(float, v1, [4]);
+    LOCAL_ALIGNED_16(float, v2, [4]);
+    /* lt/rt: the two random inputs; v1/v2: the four sums written by the reference and the tested version. */
+    declare_func(void, float[4], const float *, const float *, int);
+
+    randomize_float(lt, ELEMS);
+    randomize_float(rt, ELEMS);
+    /* The name given to check_func is spelled the same way as the one given to report() below. */
+    if (check_func(c->sum_square_butterfly_float,
+                   "ac3_sum_square_butterfly_float")) {
+        call_ref(v1, lt, rt, ELEMS);
+        call_new(v2, lt, rt, ELEMS);
+        /* Tolerant comparison, not exact equality (args 10, 4: presumably max ULP distance and element count). */
+        if (!float_near_ulp_array(v1, v2, 10, 4))
+            fail();
+
+        bench_new(v2, lt, rt, ELEMS);
+    }
+
+    report("ac3_sum_square_butterfly_float");
+}
+
void checkasm_check_ac3dsp(void)
{
AC3DSPContext c;
@@ -171,4 +196,5 @@ void checkasm_check_ac3dsp(void)
check_ac3_extract_exponents(&c);
check_float_to_fixed24(&c);
check_ac3_sum_square_butterfly_int32(&c);
+ check_ac3_sum_square_butterfly_float(&c);
}
--
2.44.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [FFmpeg-devel] [PATCH v3 5/5] avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON
2024-04-03 6:43 [FFmpeg-devel] [PATCH v3 5/5] avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON Geoff Hill
@ 2024-04-04 13:01 ` Martin Storsjö
0 siblings, 0 replies; 2+ messages in thread
From: Martin Storsjö @ 2024-04-04 13:01 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Tue, 2 Apr 2024, Geoff Hill wrote:
> Signed-off-by: Geoff Hill <geoff@geoffhill.org>
> ---
> libavcodec/aarch64/ac3dsp_init_aarch64.c | 5 ++++
> libavcodec/aarch64/ac3dsp_neon.S | 35 ++++++++++++++++++++++++
> tests/checkasm/ac3dsp.c | 26 ++++++++++++++++++
> 3 files changed, 66 insertions(+)
>
> diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
> index fa8fcf2e47..4a78ec0b2a 100644
> --- a/libavcodec/aarch64/ac3dsp_neon.S
> +++ b/libavcodec/aarch64/ac3dsp_neon.S
> @@ -88,3 +88,38 @@ function ff_ac3_sum_square_butterfly_int32_neon, export=1
> st1 {v0.1d-v3.1d}, [x0]
> 1: ret
> endfunc
> +
> +function ff_ac3_sum_square_butterfly_float_neon, export=1
> + cbz w3, 1f
> + movi v0.4s, #0
> + movi v1.4s, #0
> + movi v2.4s, #0
> + movi v3.4s, #0
> +0: ld1 {v30.4s}, [x1], #16
> + ld1 {v31.4s}, [x2], #16
> + fadd v16.4s, v30.4s, v31.4s
> + fsub v17.4s, v30.4s, v31.4s
> + fmul v30.4s, v30.4s, v30.4s
> + fadd v0.4s, v0.4s, v30.4s
The arm version here used vmla instead of separate vmul+vadd - is there
any reason why we can't use fmla here?
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2024-04-04 13:01 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-03 6:43 [FFmpeg-devel] [PATCH v3 5/5] avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON Geoff Hill
2024-04-04 13:01 ` Martin Storsjö
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git