From: John Cox <jc@kynesim.co.uk> To: ffmpeg-devel@ffmpeg.org Cc: thomas.mundt@hr.de, John Cox <jc@kynesim.co.uk> Subject: [FFmpeg-devel] [PATCH 11/15] avfilter/vf_bwdif: Add neon for filter_line Date: Thu, 29 Jun 2023 17:57:25 +0000 Message-ID: <20230629175729.224383-12-jc@kynesim.co.uk> (raw) In-Reply-To: <20230629175729.224383-1-jc@kynesim.co.uk> Signed-off-by: John Cox <jc@kynesim.co.uk> --- libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++ libavfilter/aarch64/vf_bwdif_neon.S | 215 ++++++++++++++++++++ 2 files changed, 236 insertions(+) diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c index e75cf2f204..21e67884ab 100644 --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max); +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max); + + +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) +{ + const int w0 = clip_max != 255 ? 
0 : w & ~15; + + ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, + w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); + + if (w0 < w) + ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, + w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); +} static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) return; s->filter_intra = filter_intra_helper; + s->filter_line = filter_line_helper; s->filter_edge = filter_edge_helper; } diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S index a33b235882..675e97d966 100644 --- a/libavfilter/aarch64/vf_bwdif_neon.S +++ b/libavfilter/aarch64/vf_bwdif_neon.S @@ -128,6 +128,221 @@ coeffs: .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] .hword 5077, 981 // sp[0] = v0.h[6] +// =========================================================================== +// +// void filter_line( +// void *dst1, // x0 +// void *prev1, // x1 +// void *cur1, // x2 +// void *next1, // x3 +// int w, // w4 +// int prefs, // w5 +// int mrefs, // w6 +// int prefs2, // w7 +// int mrefs2, // [sp, #0] +// int prefs3, // [sp, #8] +// int mrefs3, // [sp, #16] +// int prefs4, // [sp, #24] +// int mrefs4, // [sp, #32] +// int parity, // [sp, #40] +// int clip_max) // [sp, #48] + +function ff_bwdif_filter_line_neon, export=1 + // Sanity check w + cmp w4, #0 + ble 99f + + // Rearrange regs to be the same as line3 for ease of debug! 
+ mov w10, w4 // w10 = loop count + mov w9, w6 // w9 = mref + mov w12, w7 // w12 = pref2 + mov w11, w5 // w11 = pref + ldr w8, [sp, #0] // w8 = mref2 + ldr w7, [sp, #16] // w7 = mref3 + ldr w6, [sp, #32] // w6 = mref4 + ldr w13, [sp, #8] // w13 = pref3 + ldr w14, [sp, #24] // w14 = pref4 + + mov x4, x3 + mov x3, x2 + mov x2, x1 + +// #define prev2 cur +// const uint8_t * restrict next2 = parity ? prev : next; + ldr w17, [sp, #40] // parity + cmp w17, #0 + csel x17, x2, x4, ne + + // We want all the V registers - save all the ones we must + stp d14, d15, [sp, #-64]! + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #16] + + ldr q0, coeffs + +// for (x = 0; x < w; x++) { +// int diff0, diff2; +// int d0, d2; +// int temporal_diff0, temporal_diff2; +// +// int i1, i2; +// int j1, j2; +// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; + +10: +// c0 = prev2[0] + next2[0]; // c0 = v20, v21 +// d0 = c0 >> 1; // d0 = v10 +// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 + ldr q31, [x3] + ldr q21, [x17] + uhadd v10.16b, v31.16b, v21.16b + uabd v11.16b, v31.16b, v21.16b + uaddl v20.8h, v21.8b, v31.8b + uaddl2 v21.8h, v21.16b, v31.16b + + ldr q31, [x3, w6, SXTW] + ldr q23, [x17, w6, SXTW] + +// i1 = coef_hf[0] * c0; // i1 = v2-v5 + UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] + + ldr q30, [x3, w14, SXTW] + ldr q25, [x17, w14, SXTW] + +// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 + uaddl v22.8h, v23.8b, v31.8b + uaddl2 v23.8h, v23.16b, v31.16b + +// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 + uhadd v12.16b, v25.16b, v30.16b + uaddl v24.8h, v25.8b, v30.8b + uaddl2 v25.8h, v25.16b, v30.16b + +// m3 = cur[mrefs3]; // m3 = v20 + ldr q20, [x3, w7, SXTW] + +// p3 = cur[prefs3]; // p3 = v21 + ldr q21, [x3, w13, SXTW] + +// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] + + ldr q29, [x3, w8, SXTW] + ldr 
q23, [x17, w8, SXTW] + +// i1 -= coef_lf[1] * 4 * (m3 + p3); // - + uaddl v30.8h, v20.8b, v21.8b + uaddl2 v31.8h, v20.16b, v21.16b + + UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] + +// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 + uhadd v13.16b, v23.16b, v29.16b + uaddl v22.8h, v23.8b, v29.8b + uaddl2 v23.8h, v23.16b, v29.16b + + ldr q31, [x3, w12, SXTW] + ldr q27, [x17, w12, SXTW] + +// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25) + add v24.8h, v24.8h, v22.8h + add v25.8h, v25.8h, v23.8h + UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4] + +// m1 = cur[mrefs]; // m1 = v24 + ldr q24, [x3, w9, SXTW] + +// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 +// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 +// d2 = p2 >> 1; // d2 = v15 + uabd v14.16b, v31.16b, v27.16b + uhadd v15.16b, v31.16b, v27.16b + uaddl v26.8h, v27.8b, v31.8b + uaddl2 v27.8h, v27.16b, v31.16b + +// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] + +// p1 = cur[prefs]; // p1 = v22 + ldr q22, [x3, w11, SXTW] + +// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 + uaddl v18.8h, v22.8b, v24.8b + uaddl2 v19.8h, v22.16b, v24.16b + UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] + + uaddl v18.8h, v20.8b, v21.8b + uaddl2 v19.8h, v20.16b, v21.16b + UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] + + SQSHRUNN v17, v28, v29, v30, v31, 13 + +// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 + uaddl v26.8h, v24.8b, v22.8b + uaddl2 v27.8h, v24.16b, v22.16b + UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] + + ldr q31, [x2, w9, SXTW] + ldr q29, [x4, w9, SXTW] + + ldr q30, [x2, w11, SXTW] + ldr q28, [x4, w11, SXTW] + +// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* + SQSHRUNN v2, v2, v3, v4, v5, 15 + +// { +// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; +// int t2 =(FFABS(next[mrefs] - m1) + 
FFABS(next[prefs] - p1)) >> 1; + uabd v30.16b, v22.16b, v30.16b + uabd v31.16b, v24.16b, v31.16b + uabd v28.16b, v22.16b, v28.16b + uabd v29.16b, v24.16b, v29.16b + uhadd v31.16b, v31.16b, v30.16b + uhadd v29.16b, v29.16b, v28.16b + +// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 + ushr v18.16b, v11.16b, #1 + umax v18.16b, v18.16b, v31.16b + umax v18.16b, v18.16b, v29.16b + + // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 + SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 + + // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td0 = v11, diff0 = v18 + INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 + +// dst[0] = av_clip_uint8(interpol); + str q2, [x0], #16 +// } +// +// dst++; +// cur++; +// prev++; +// prev2++; +// next++; +// } + + subs w10, w10, #16 + add x2, x2, #16 + add x3, x3, #16 + add x4, x4, #16 + add x17, x17, #16 + bgt 10b + + ldp d12, d13, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d14, d15, [sp], #64 +99: + ret +endfunc + // ============================================================================ // // void ff_bwdif_filter_edge_neon( -- 2.39.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
next prev parent reply other threads:[~2023-06-29 18:00 UTC|newest] Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top 2023-06-29 17:57 [FFmpeg-devel] [PATCH 00/15] avfilter/vf_bwdif: Add aarch64 neon functions John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 01/15] avfilter/vf_bwdif: Add outline for aarch " John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 02/15] avfilter/vf_bwdif: Add common macros and consts for aarch64 neon John Cox 2023-07-01 21:35 ` Martin Storsjö 2023-07-02 10:27 ` John Cox 2023-07-02 20:07 ` Martin Storsjö 2023-07-02 21:02 ` Martin Storsjö 2023-07-03 8:31 ` John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 03/15] avfilter/vf_bwdif: Export C filter_intra John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 04/15] avfilter/vf_bwdif: Add neon for filter_intra John Cox 2023-07-01 21:37 ` Martin Storsjö 2023-07-02 10:43 ` John Cox 2023-07-02 20:18 ` Martin Storsjö 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 05/15] tests/checkasm: Add test for vf_bwdif filter_intra John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 06/15] avfilter/vf_bwdif: Add clip and spatial macros for aarch64 neon John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 07/15] avfilter/vf_bwdif: Export C filter_edge John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 08/15] avfilter/vf_bwdif: Add neon for filter_edge John Cox 2023-07-01 21:40 ` Martin Storsjö 2023-07-02 10:50 ` John Cox 2023-07-02 20:36 ` Martin Storsjö 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 09/15] tests/checkasm: Add test for vf_bwdif filter_edge John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 10/15] avfilter/vf_bwdif: Export C filter_line John Cox 2023-06-29 17:57 ` John Cox [this message] 2023-07-01 21:44 ` [FFmpeg-devel] [PATCH 11/15] avfilter/vf_bwdif: Add neon for filter_line Martin Storsjö 2023-07-02 10:57 ` John Cox 2023-07-02 20:40 ` Martin Storsjö 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 12/15] avfilter/vf_bwdif: Add a filter_line3 method for optimisation John Cox 2023-06-29 17:57 ` 
[FFmpeg-devel] [PATCH 13/15] avfilter/vf_bwdif: Add neon for filter_line3 John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 14/15] tests/checkasm: Add test for vf_bwdif filter_line3 John Cox 2023-06-29 17:57 ` [FFmpeg-devel] [PATCH 15/15] avfilter/vf_bwdif: Block filter slices into a multiple of 4 lines John Cox 2023-07-01 21:33 ` [FFmpeg-devel] [PATCH 00/15] avfilter/vf_bwdif: Add aarch64 neon functions Martin Storsjö 2023-07-02 10:18 ` John Cox
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20230629175729.224383-12-jc@kynesim.co.uk \ --to=jc@kynesim.co.uk \ --cc=ffmpeg-devel@ffmpeg.org \ --cc=thomas.mundt@hr.de \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel This inbox may be cloned and mirrored by anyone: git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \ ffmpegdev@gitmailbox.com public-inbox-index ffmpegdev Example config snippet for mirrors. AGPL code for this site: git clone https://public-inbox.org/public-inbox.git