* [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
@ 2025-04-15 10:01 Zhao Zhili
2025-04-25 8:25 ` Martin Storsjö
0 siblings, 1 reply; 5+ messages in thread
From: Zhao Zhili @ 2025-04-15 10:01 UTC (permalink / raw)
To: ffmpeg-devel; +Cc: Zhao Zhili
From: Zhao Zhili <zhilizhao@tencent.com>
int8_t[] is enough for offset_table of 8 bit streams.
On rpi5:
Before After
hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x)
hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x)
hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x)
hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x)
hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x)
hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x)
hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x)
hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x)
hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x)
hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x)
---
libavcodec/aarch64/h26x/dsp.h | 4 +
libavcodec/aarch64/h26x/sao_neon.S | 93 ++++++++++++++---------
libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +-
libavcodec/aarch64/vvc/dsp_init.c | 5 +-
4 files changed, 65 insertions(+), 41 deletions(-)
diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 0fefb4d70f..6ea6a8d36a 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
ptrdiff_t stride_dst, ptrdiff_t stride_src,
const int16_t *sao_offset_val, int sao_left_class,
int width, int height);
+void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src,
+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+ const int16_t *sao_offset_val, int sao_left_class,
+ int width, int height);
void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
const int16_t *sao_offset_val, int eo, int width, int height);
void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
diff --git a/libavcodec/aarch64/h26x/sao_neon.S b/libavcodec/aarch64/h26x/sao_neon.S
index c43820135e..60c026fe95 100644
--- a/libavcodec/aarch64/h26x/sao_neon.S
+++ b/libavcodec/aarch64/h26x/sao_neon.S
@@ -35,48 +35,67 @@
// int16_t *sao_offset_val, int sao_left_class,
// int width, int height)
function ff_h26x_sao_band_filter_8x8_8_neon, export=1
- stp xzr, xzr, [sp, #-64]!
+ stp xzr, xzr, [sp, #-32]!
stp xzr, xzr, [sp, #16]
- stp xzr, xzr, [sp, #32]
- stp xzr, xzr, [sp, #48]
mov w8, #4
-0: ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
- subs w8, w8, #1
- add w10, w8, w5 // k + sao_left_class
+0:
+ ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
+ subs w8, w8, #1
+ add w10, w8, w5 // k + sao_left_class
and w10, w10, #0x1F
- strh w9, [sp, x10, lsl #1]
+ strb w9, [sp, x10]
bne 0b
- add w6, w6, #7
- bic w6, w6, #7
- ld1 {v16.16b-v19.16b}, [sp], #64
- sub x2, x2, x6
- sub x3, x3, x6
- movi v20.8h, #1
-1: mov w8, w6 // beginning of line
-2: // Simple layout for accessing 16bit values
- // with 8bit LUT.
- //
- // 00 01 02 03 04 05 06 07
- // +----------------------------------->
- // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
- // +----------------------------------->
- // i-0 i-1 i-2 i-3
- ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
- subs w8, w8, #8
- uxtl v0.8h, v2.8b // load src[x]
- ushr v2.8h, v0.8h, #3 // >> BIT_DEPTH - 3
- shl v1.8h, v2.8h, #1 // low (x2, accessing short)
- add v3.8h, v1.8h, v20.8h // +1 access upper short
- sli v1.8h, v3.8h, #8 // shift insert index to upper byte
- tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
- add v1.8h, v0.8h, v2.8h // src[x] + table
- sqxtun v4.8b, v1.8h // clip + narrow
- st1 {v4.8b}, [x0], #8 // store
- // done 8 pixels
+ ldp q16, q17, [sp], #32
+1:
+ ld1 {v2.8b}, [x1], x3
+ subs w7, w7, #1
+ uxtl v0.8h, v2.8b
+ ushr v3.8b, v2.8b, #3 // >> BIT_DEPTH - 3
+ tbx v3.8b, {v16.16b-v17.16b}, v3.8b
+ sxtl v2.8h, v3.8b
+ add v0.8h, v0.8h, v2.8h // src[x] + table
+ sqxtun v0.8b, v0.8h // clip + narrow
+ st1 {v0.8b}, [x0], x2
+ bne 1b
+ ret
+endfunc
+
+function ff_h26x_sao_band_filter_16x16_8_neon, export=1
+ stp xzr, xzr, [sp, #-32]!
+ stp xzr, xzr, [sp, #16]
+ mov w8, #4
+0:
+ ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
+ subs w8, w8, #1
+ add w10, w8, w5 // k + sao_left_class
+ and w10, w10, #0x1F
+ strb w9, [sp, x10]
+ bne 0b
+ add w6, w6, #15
+ bic w6, w6, #15
+ ldp q16, q17, [sp], #32
+ sub x2, x2, x6
+ sub x3, x3, x6
+1:
+ mov w8, w6 // beginning of line
+2:
+ ldr q2, [x1], #16
+ subs w8, w8, #16
+ uxtl v0.8h, v2.8b
+ uxtl2 v1.8h, v2.16b
+ ushr v3.16b, v2.16b, #3 // >> BIT_DEPTH - 3
+ tbx v3.16b, {v16.16b-v17.16b}, v3.16b
+ sxtl v2.8h, v3.8b
+ sxtl2 v3.8h, v3.16b
+ add v0.8h, v0.8h, v2.8h // src[x] + table
+ add v1.8h, v1.8h, v3.8h
+ sqxtun v0.8b, v0.8h // clip + narrow
+ sqxtun2 v0.16b, v1.8h
+ str q0, [x0], #16
bne 2b
- subs w7, w7, #1 // finished line, prep. new
- add x0, x0, x2 // dst += stride_dst
- add x1, x1, x3 // src += stride_src
+ subs w7, w7, #1
+ add x0, x0, x2 // dst += stride_dst
+ add x1, x1, x3 // src += stride_src
bne 1b
ret
endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 5dd470baaa..0b159d1886 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -157,11 +157,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
- c->sao_band_filter[0] =
+ c->sao_band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
c->sao_band_filter[1] =
c->sao_band_filter[2] =
c->sao_band_filter[3] =
- c->sao_band_filter[4] = ff_h26x_sao_band_filter_8x8_8_neon;
+ c->sao_band_filter[4] = ff_h26x_sao_band_filter_16x16_8_neon;
c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8x8_8_neon;
c->sao_edge_filter[1] =
c->sao_edge_filter[2] =
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 2c99ba206b..9a171234f6 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -197,8 +197,9 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
c->inter.apply_bdof = apply_bdof_8;
- for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
- c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
+ c->sao.band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
+ for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
+ c->sao.band_filter[i] = ff_h26x_sao_band_filter_16x16_8_neon;
c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;
for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.edge_filter); i++)
c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
--
2.46.0
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
2025-04-15 10:01 [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter Zhao Zhili
@ 2025-04-25 8:25 ` Martin Storsjö
2025-04-29 7:51 ` Zhao Zhili
0 siblings, 1 reply; 5+ messages in thread
From: Martin Storsjö @ 2025-04-25 8:25 UTC (permalink / raw)
To: FFmpeg development discussions and patches; +Cc: Zhao Zhili
On Tue, 15 Apr 2025, Zhao Zhili wrote:
> From: Zhao Zhili <zhilizhao@tencent.com>
>
> int8_t[] is enough for offset_table of 8 bit streams.
>
> On rpi5:
> Before After
> hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x)
> hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x)
> hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x)
> hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x)
> hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x)
> hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x)
> hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x)
> hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x)
> hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x)
> hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x)
> ---
> libavcodec/aarch64/h26x/dsp.h | 4 +
> libavcodec/aarch64/h26x/sao_neon.S | 93 ++++++++++++++---------
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +-
> libavcodec/aarch64/vvc/dsp_init.c | 5 +-
> 4 files changed, 65 insertions(+), 41 deletions(-)
>
> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
> index 0fefb4d70f..6ea6a8d36a 100644
> --- a/libavcodec/aarch64/h26x/dsp.h
> +++ b/libavcodec/aarch64/h26x/dsp.h
> @@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
> ptrdiff_t stride_dst, ptrdiff_t stride_src,
> const int16_t *sao_offset_val, int sao_left_class,
> int width, int height);
> +void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src,
> + ptrdiff_t stride_dst, ptrdiff_t stride_src,
> + const int16_t *sao_offset_val, int sao_left_class,
> + int width, int height);
> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
> const int16_t *sao_offset_val, int eo, int width, int height);
> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
> diff --git a/libavcodec/aarch64/h26x/sao_neon.S b/libavcodec/aarch64/h26x/sao_neon.S
> index c43820135e..60c026fe95 100644
> --- a/libavcodec/aarch64/h26x/sao_neon.S
> +++ b/libavcodec/aarch64/h26x/sao_neon.S
> @@ -35,48 +35,67 @@
> // int16_t *sao_offset_val, int sao_left_class,
> // int width, int height)
> function ff_h26x_sao_band_filter_8x8_8_neon, export=1
> - stp xzr, xzr, [sp, #-64]!
> + stp xzr, xzr, [sp, #-32]!
> stp xzr, xzr, [sp, #16]
> - stp xzr, xzr, [sp, #32]
> - stp xzr, xzr, [sp, #48]
> mov w8, #4
> -0: ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
> - subs w8, w8, #1
> - add w10, w8, w5 // k + sao_left_class
> +0:
> + ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
> + subs w8, w8, #1
> + add w10, w8, w5 // k + sao_left_class
> and w10, w10, #0x1F
> - strh w9, [sp, x10, lsl #1]
> + strb w9, [sp, x10]
> bne 0b
> - add w6, w6, #7
> - bic w6, w6, #7
> - ld1 {v16.16b-v19.16b}, [sp], #64
> - sub x2, x2, x6
> - sub x3, x3, x6
> - movi v20.8h, #1
> -1: mov w8, w6 // beginning of line
> -2: // Simple layout for accessing 16bit values
> - // with 8bit LUT.
> - //
> - // 00 01 02 03 04 05 06 07
> - // +----------------------------------->
> - // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
> - // +----------------------------------->
> - // i-0 i-1 i-2 i-3
> - ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
> - subs w8, w8, #8
> - uxtl v0.8h, v2.8b // load src[x]
> - ushr v2.8h, v0.8h, #3 // >> BIT_DEPTH - 3
> - shl v1.8h, v2.8h, #1 // low (x2, accessing short)
> - add v3.8h, v1.8h, v20.8h // +1 access upper short
> - sli v1.8h, v3.8h, #8 // shift insert index to upper byte
> - tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
> - add v1.8h, v0.8h, v2.8h // src[x] + table
> - sqxtun v4.8b, v1.8h // clip + narrow
> - st1 {v4.8b}, [x0], #8 // store
> - // done 8 pixels
> + ldp q16, q17, [sp], #32
> +1:
> + ld1 {v2.8b}, [x1], x3
> + subs w7, w7, #1
> + uxtl v0.8h, v2.8b
> + ushr v3.8b, v2.8b, #3 // >> BIT_DEPTH - 3
Nitpick: The comment on this line seems to be misaligned with the other
comments below - please check.
> + tbx v3.8b, {v16.16b-v17.16b}, v3.8b
Is there any specific reason for preferring tbx over tbl here? (I know the
existing code used tbx.) Without having studied cycle tables, I would
expect tbl to maybe be slightly simpler, but perhaps there's no difference
(or tbx is faster)?
Other than these comments, this patch looks good to me, thanks - feel free
to push.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
2025-04-25 8:25 ` Martin Storsjö
@ 2025-04-29 7:51 ` Zhao Zhili
2025-04-29 7:58 ` Martin Storsjö
0 siblings, 1 reply; 5+ messages in thread
From: Zhao Zhili @ 2025-04-29 7:51 UTC (permalink / raw)
To: FFmpeg development discussions and patches
> On Apr 25, 2025, at 16:25, Martin Storsjö <martin@martin.st> wrote:
>
> On Tue, 15 Apr 2025, Zhao Zhili wrote:
>
>> From: Zhao Zhili <zhilizhao@tencent.com>
>>
>> int8_t[] is enough for offset_table of 8 bit streams.
>>
>> On rpi5:
>> Before After
>> hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x)
>> hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x)
>> hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x)
>> hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x)
>> hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x)
>> hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x)
>> hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x)
>> hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x)
>> hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x)
>> hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x)
>> ---
>> libavcodec/aarch64/h26x/dsp.h | 4 +
>> libavcodec/aarch64/h26x/sao_neon.S | 93 ++++++++++++++---------
>> libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +-
>> libavcodec/aarch64/vvc/dsp_init.c | 5 +-
>> 4 files changed, 65 insertions(+), 41 deletions(-)
>>
>> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
>> index 0fefb4d70f..6ea6a8d36a 100644
>> --- a/libavcodec/aarch64/h26x/dsp.h
>> +++ b/libavcodec/aarch64/h26x/dsp.h
>> @@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
>> ptrdiff_t stride_dst, ptrdiff_t stride_src,
>> const int16_t *sao_offset_val, int sao_left_class,
>> int width, int height);
>> +void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src,
>> + ptrdiff_t stride_dst, ptrdiff_t stride_src,
>> + const int16_t *sao_offset_val, int sao_left_class,
>> + int width, int height);
>> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>> const int16_t *sao_offset_val, int eo, int width, int height);
>> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>> diff --git a/libavcodec/aarch64/h26x/sao_neon.S b/libavcodec/aarch64/h26x/sao_neon.S
>> index c43820135e..60c026fe95 100644
>> --- a/libavcodec/aarch64/h26x/sao_neon.S
>> +++ b/libavcodec/aarch64/h26x/sao_neon.S
>> @@ -35,48 +35,67 @@
>> // int16_t *sao_offset_val, int sao_left_class,
>> // int width, int height)
>> function ff_h26x_sao_band_filter_8x8_8_neon, export=1
>> - stp xzr, xzr, [sp, #-64]!
>> + stp xzr, xzr, [sp, #-32]!
>> stp xzr, xzr, [sp, #16]
>> - stp xzr, xzr, [sp, #32]
>> - stp xzr, xzr, [sp, #48]
>> mov w8, #4
>> -0: ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
>> - subs w8, w8, #1
>> - add w10, w8, w5 // k + sao_left_class
>> +0:
>> + ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1]
>> + subs w8, w8, #1
>> + add w10, w8, w5 // k + sao_left_class
>> and w10, w10, #0x1F
>> - strh w9, [sp, x10, lsl #1]
>> + strb w9, [sp, x10]
>> bne 0b
>> - add w6, w6, #7
>> - bic w6, w6, #7
>> - ld1 {v16.16b-v19.16b}, [sp], #64
>> - sub x2, x2, x6
>> - sub x3, x3, x6
>> - movi v20.8h, #1
>> -1: mov w8, w6 // beginning of line
>> -2: // Simple layout for accessing 16bit values
>> - // with 8bit LUT.
>> - //
>> - // 00 01 02 03 04 05 06 07
>> - // +----------------------------------->
>> - // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
>> - // +----------------------------------->
>> - // i-0 i-1 i-2 i-3
>> - ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
>> - subs w8, w8, #8
>> - uxtl v0.8h, v2.8b // load src[x]
>> - ushr v2.8h, v0.8h, #3 // >> BIT_DEPTH - 3
>> - shl v1.8h, v2.8h, #1 // low (x2, accessing short)
>> - add v3.8h, v1.8h, v20.8h // +1 access upper short
>> - sli v1.8h, v3.8h, #8 // shift insert index to upper byte
>> - tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
>> - add v1.8h, v0.8h, v2.8h // src[x] + table
>> - sqxtun v4.8b, v1.8h // clip + narrow
>> - st1 {v4.8b}, [x0], #8 // store
>> - // done 8 pixels
>> + ldp q16, q17, [sp], #32
>> +1:
>> + ld1 {v2.8b}, [x1], x3
>> + subs w7, w7, #1
>> + uxtl v0.8h, v2.8b
>> + ushr v3.8b, v2.8b, #3 // >> BIT_DEPTH - 3
>
> Nitpick: The comment on this line seems to be misaligned with the other comments below - please check.
Fixed before push.
>
>> + tbx v3.8b, {v16.16b-v17.16b}, v3.8b
>
> Is there any specific reason for preferring tbx over tbl here? (I know the existing code used tbx.) Without having studied cycle tables, I would expect tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx is faster)?
tbl can be faster. The result is quite impressive. Changed to tbl before push.
Before tbx tbl
hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x) 252.3 ( 1.00x)
hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x) 61.0 ( 4.57x)
hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x) 864.9 ( 1.00x)
hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x) 150.0 ( 6.26x)
hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x) 3871.6 ( 1.00x)
hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x) 550.6 ( 7.39x)
hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x) 8182.6 ( 1.00x)
hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x) 1185.8 ( 7.36x)
hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x) 14038.9 ( 1.00x)
hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x) 2078.4 ( 7.15x)
>
>
> Other than these comments, this patch looks good to me, thanks - feel free to push.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
2025-04-29 7:51 ` Zhao Zhili
@ 2025-04-29 7:58 ` Martin Storsjö
2025-04-29 8:14 ` Zhao Zhili
0 siblings, 1 reply; 5+ messages in thread
From: Martin Storsjö @ 2025-04-29 7:58 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Tue, 29 Apr 2025, Zhao Zhili wrote:
>> On Apr 25, 2025, at 16:25, Martin Storsjö <martin@martin.st> wrote:
>>
>> On Tue, 15 Apr 2025, Zhao Zhili wrote:
>>
>>
>>> + tbx v3.8b, {v16.16b-v17.16b}, v3.8b
>>
>> Is there any specific reason for preferring tbx over tbl here? (I know the existing code used tbx.) Without having studied cycle tables, I would expect tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx is faster)?
>
> tbl can be faster. The result is quite impressive. Changed to tbl before push.
>
> Before tbx tbl
> hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x) 252.3 ( 1.00x)
> hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x) 61.0 ( 4.57x)
> hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x) 864.9 ( 1.00x)
> hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x) 150.0 ( 6.26x)
> hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x) 3871.6 ( 1.00x)
> hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x) 550.6 ( 7.39x)
> hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x) 8182.6 ( 1.00x)
> hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x) 1185.8 ( 7.36x)
> hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x) 14038.9 ( 1.00x)
> hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x) 2078.4 ( 7.15x)
The cycle numbers in the tbl and tbx columns seem to be identical here,
while the relative speedup numbers differ - was this some sort of
copypaste mistake in preparing the table? (The difference in speedup
numbers does seem impressive.)
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
2025-04-29 7:58 ` Martin Storsjö
@ 2025-04-29 8:14 ` Zhao Zhili
0 siblings, 0 replies; 5+ messages in thread
From: Zhao Zhili @ 2025-04-29 8:14 UTC (permalink / raw)
To: FFmpeg development discussions and patches
[-- Attachment #1: Type: text/plain, Size: 2011 bytes --]
> On Apr 29, 2025, at 15:58, Martin Storsjö <martin@martin.st> wrote:
>
> On Tue, 29 Apr 2025, Zhao Zhili wrote:
>
>>> On Apr 25, 2025, at 16:25, Martin Storsjö <martin@martin.st> wrote:
>>> On Tue, 15 Apr 2025, Zhao Zhili wrote:
>>>> + tbx v3.8b, {v16.16b-v17.16b}, v3.8b
>>> Is there any specific reason for preferring tbx over tbl here? (I know the existing code used tbx.) Without having studied cycle tables, I would expect tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx is faster)?
>>
>> tbl can be faster. The result is quite impressive. Changed to tbl before push.
>>
>> Before tbx tbl
>> hevc_sao_band_8_8_c: 252.3 ( 1.00x) 252.3 ( 1.00x) 252.3 ( 1.00x)
>> hevc_sao_band_8_8_neon: 95.8 ( 2.63x) 61.0 ( 4.14x) 61.0 ( 4.57x)
>> hevc_sao_band_16_8_c: 875.2 ( 1.00x) 864.9 ( 1.00x) 864.9 ( 1.00x)
>> hevc_sao_band_16_8_neon: 317.5 ( 2.76x) 150.0 ( 5.76x) 150.0 ( 6.26x)
>> hevc_sao_band_32_8_c: 3853.5 ( 1.00x) 3871.6 ( 1.00x) 3871.6 ( 1.00x)
>> hevc_sao_band_32_8_neon: 1222.3 ( 3.15x) 550.6 ( 7.03x) 550.6 ( 7.39x)
>> hevc_sao_band_48_8_c: 8203.6 ( 1.00x) 8182.6 ( 1.00x) 8182.6 ( 1.00x)
>> hevc_sao_band_48_8_neon: 2685.7 ( 3.05x) 1185.8 ( 6.90x) 1185.8 ( 7.36x)
>> hevc_sao_band_64_8_c: 14023.0 ( 1.00x) 14038.9 ( 1.00x) 14038.9 ( 1.00x)
>> hevc_sao_band_64_8_neon: 4783.2 ( 2.93x) 2078.4 ( 6.75x) 2078.4 ( 7.15x)
>
> The cycle numbers in the tbl and tbx columns seem to be identical here, while the relative speedup numbers differ - was this some sort of copypaste mistake in preparing the table? (The difference in speedup numbers does seem impressive.)
They are the same on A75, but not on A76/A77/X3.
tbl: 2 cycles for 1 or 2 table registers.
tbx: 2 cycles for 1 table register, 4 cycles for 2 table registers.
The code uses 2 table registers.
[-- Attachment #2: PastedGraphic-1.png --]
[-- Type: image/png, Size: 122049 bytes --]
[-- Attachment #3: Type: text/plain, Size: 282 bytes --]
>
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
[-- Attachment #4: Type: text/plain, Size: 251 bytes --]
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2025-04-29 8:15 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-15 10:01 [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter Zhao Zhili
2025-04-25 8:25 ` Martin Storsjö
2025-04-29 7:51 ` Zhao Zhili
2025-04-29 7:58 ` Martin Storsjö
2025-04-29 8:14 ` Zhao Zhili
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git