Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
@ 2025-04-15 10:01 Zhao Zhili
  2025-04-25  8:25 ` Martin Storsjö
  0 siblings, 1 reply; 5+ messages in thread
From: Zhao Zhili @ 2025-04-15 10:01 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Zhao Zhili

From: Zhao Zhili <zhilizhao@tencent.com>

int8_t[] is enough for offset_table of 8 bit streams.

On rpi5:
                             Before               After
hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)
hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)
hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)
hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)
hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)
hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)
hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)
hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)
hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)
hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)
---
 libavcodec/aarch64/h26x/dsp.h             |  4 +
 libavcodec/aarch64/h26x/sao_neon.S        | 93 ++++++++++++++---------
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 +-
 libavcodec/aarch64/vvc/dsp_init.c         |  5 +-
 4 files changed, 65 insertions(+), 41 deletions(-)

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 0fefb4d70f..6ea6a8d36a 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
                                         ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                         const int16_t *sao_offset_val, int sao_left_class,
                                         int width, int height);
+void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src,
+                                        ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                        const int16_t *sao_offset_val, int sao_left_class,
+                                        int width, int height);
 void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
                                           const int16_t *sao_offset_val, int eo, int width, int height);
 void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
diff --git a/libavcodec/aarch64/h26x/sao_neon.S b/libavcodec/aarch64/h26x/sao_neon.S
index c43820135e..60c026fe95 100644
--- a/libavcodec/aarch64/h26x/sao_neon.S
+++ b/libavcodec/aarch64/h26x/sao_neon.S
@@ -35,48 +35,67 @@
 //                      int16_t *sao_offset_val, int sao_left_class,
 //                      int width, int height)
 function ff_h26x_sao_band_filter_8x8_8_neon, export=1
-        stp             xzr, xzr, [sp, #-64]!
+        stp             xzr, xzr, [sp, #-32]!
         stp             xzr, xzr, [sp, #16]
-        stp             xzr, xzr, [sp, #32]
-        stp             xzr, xzr, [sp, #48]
         mov             w8,  #4
-0:      ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1]
-        subs            w8,  w8,  #1
-        add             w10, w8,  w5               // k + sao_left_class
+0:
+        ldrsh           x9, [x4, x8, lsl #1]        // sao_offset_val[k+1]
+        subs            w8, w8, #1
+        add             w10, w8, w5                 // k + sao_left_class
         and             w10, w10, #0x1F
-        strh            w9, [sp, x10, lsl #1]
+        strb            w9, [sp, x10]
         bne             0b
-        add             w6,  w6,  #7
-        bic             w6,  w6,  #7
-        ld1             {v16.16b-v19.16b}, [sp], #64
-        sub             x2,  x2,  x6
-        sub             x3,  x3,  x6
-        movi            v20.8h,   #1
-1:      mov             w8,  w6                    // beginning of line
-2:      // Simple layout for accessing 16bit values
-        // with 8bit LUT.
-        //
-        //   00  01  02  03  04  05  06  07
-        // +----------------------------------->
-        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
-        // +----------------------------------->
-        //    i-0     i-1     i-2     i-3
-        ld1             {v2.8b}, [x1], #8          // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-        subs            w8, w8,  #8
-        uxtl            v0.8h,  v2.8b              // load src[x]
-        ushr            v2.8h,  v0.8h, #3          // >> BIT_DEPTH - 3
-        shl             v1.8h,  v2.8h, #1          // low (x2, accessing short)
-        add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
-        sli             v1.8h,  v3.8h, #8          // shift insert index to upper byte
-        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // table
-        add             v1.8h,  v0.8h, v2.8h       // src[x] + table
-        sqxtun          v4.8b,  v1.8h              // clip + narrow
-        st1             {v4.8b}, [x0], #8          // store
-        // done 8 pixels
+        ldp             q16, q17, [sp], #32
+1:
+        ld1             {v2.8b}, [x1], x3
+        subs            w7, w7, #1
+        uxtl            v0.8h, v2.8b
+        ushr            v3.8b, v2.8b, #3          // >> BIT_DEPTH - 3
+        tbx             v3.8b, {v16.16b-v17.16b}, v3.8b
+        sxtl            v2.8h, v3.8b
+        add             v0.8h, v0.8h, v2.8h         // src[x] + table
+        sqxtun          v0.8b, v0.8h                // clip + narrow
+        st1             {v0.8b}, [x0], x2
+        bne             1b
+        ret
+endfunc
+
+function ff_h26x_sao_band_filter_16x16_8_neon, export=1
+        stp             xzr, xzr, [sp, #-32]!
+        stp             xzr, xzr, [sp, #16]
+        mov             w8,  #4
+0:
+        ldrsh           x9, [x4, x8, lsl #1]        // sao_offset_val[k+1]
+        subs            w8,  w8, #1
+        add             w10, w8, w5                 // k + sao_left_class
+        and             w10, w10, #0x1F
+        strb            w9, [sp, x10]
+        bne             0b
+        add             w6, w6, #15
+        bic             w6, w6, #15
+        ldp             q16, q17, [sp], #32
+        sub             x2, x2, x6
+        sub             x3, x3, x6
+1:
+        mov             w8, w6                      // beginning of line
+2:
+        ldr             q2, [x1], #16
+        subs            w8, w8, #16
+        uxtl            v0.8h, v2.8b
+        uxtl2           v1.8h, v2.16b
+        ushr            v3.16b, v2.16b, #3          // >> BIT_DEPTH - 3
+        tbx             v3.16b, {v16.16b-v17.16b}, v3.16b
+        sxtl            v2.8h, v3.8b
+        sxtl2           v3.8h, v3.16b
+        add             v0.8h, v0.8h, v2.8h         // src[x] + table
+        add             v1.8h, v1.8h, v3.8h
+        sqxtun          v0.8b, v0.8h                // clip + narrow
+        sqxtun2         v0.16b, v1.8h
+        str             q0, [x0], #16
         bne             2b
-        subs            w7, w7,  #1                // finished line, prep. new
-        add             x0, x0,  x2                // dst += stride_dst
-        add             x1, x1,  x3                // src += stride_src
+        subs            w7, w7, #1
+        add             x0, x0, x2                  // dst += stride_dst
+        add             x1, x1, x3                  // src += stride_src
         bne             1b
         ret
 endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 5dd470baaa..0b159d1886 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -157,11 +157,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
         c->transform_4x4_luma          = ff_hevc_transform_luma_4x4_neon_8;
-        c->sao_band_filter[0]          =
+        c->sao_band_filter[0]          = ff_h26x_sao_band_filter_8x8_8_neon;
         c->sao_band_filter[1]          =
         c->sao_band_filter[2]          =
         c->sao_band_filter[3]          =
-        c->sao_band_filter[4]          = ff_h26x_sao_band_filter_8x8_8_neon;
+        c->sao_band_filter[4]          = ff_h26x_sao_band_filter_16x16_8_neon;
         c->sao_edge_filter[0]          = ff_hevc_sao_edge_filter_8x8_8_neon;
         c->sao_edge_filter[1]          =
         c->sao_edge_filter[2]          =
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 2c99ba206b..9a171234f6 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -197,8 +197,9 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
         c->inter.apply_bdof = apply_bdof_8;
 
-        for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
-            c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
+        c->sao.band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
+        for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
+            c->sao.band_filter[i] = ff_h26x_sao_band_filter_16x16_8_neon;
         c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;
         for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.edge_filter); i++)
             c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
-- 
2.46.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
  2025-04-15 10:01 [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter Zhao Zhili
@ 2025-04-25  8:25 ` Martin Storsjö
  2025-04-29  7:51   ` Zhao Zhili
  0 siblings, 1 reply; 5+ messages in thread
From: Martin Storsjö @ 2025-04-25  8:25 UTC (permalink / raw)
  To: FFmpeg development discussions and patches; +Cc: Zhao Zhili

On Tue, 15 Apr 2025, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> int8_t[] is enough for offset_table of 8 bit streams.
>
> On rpi5:
>                             Before               After
> hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)
> hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)
> hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)
> hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)
> hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)
> hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)
> hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)
> hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)
> hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)
> hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)
> ---
> libavcodec/aarch64/h26x/dsp.h             |  4 +
> libavcodec/aarch64/h26x/sao_neon.S        | 93 ++++++++++++++---------
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 +-
> libavcodec/aarch64/vvc/dsp_init.c         |  5 +-
> 4 files changed, 65 insertions(+), 41 deletions(-)
>
> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
> index 0fefb4d70f..6ea6a8d36a 100644
> --- a/libavcodec/aarch64/h26x/dsp.h
> +++ b/libavcodec/aarch64/h26x/dsp.h
> @@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
>                                         ptrdiff_t stride_dst, ptrdiff_t stride_src,
>                                         const int16_t *sao_offset_val, int sao_left_class,
>                                         int width, int height);
> +void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src,
> +                                        ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +                                        const int16_t *sao_offset_val, int sao_left_class,
> +                                        int width, int height);
> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>                                           const int16_t *sao_offset_val, int eo, int width, int height);
> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
> diff --git a/libavcodec/aarch64/h26x/sao_neon.S b/libavcodec/aarch64/h26x/sao_neon.S
> index c43820135e..60c026fe95 100644
> --- a/libavcodec/aarch64/h26x/sao_neon.S
> +++ b/libavcodec/aarch64/h26x/sao_neon.S
> @@ -35,48 +35,67 @@
> //                      int16_t *sao_offset_val, int sao_left_class,
> //                      int width, int height)
> function ff_h26x_sao_band_filter_8x8_8_neon, export=1
> -        stp             xzr, xzr, [sp, #-64]!
> +        stp             xzr, xzr, [sp, #-32]!
>         stp             xzr, xzr, [sp, #16]
> -        stp             xzr, xzr, [sp, #32]
> -        stp             xzr, xzr, [sp, #48]
>         mov             w8,  #4
> -0:      ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1]
> -        subs            w8,  w8,  #1
> -        add             w10, w8,  w5               // k + sao_left_class
> +0:
> +        ldrsh           x9, [x4, x8, lsl #1]        // sao_offset_val[k+1]
> +        subs            w8, w8, #1
> +        add             w10, w8, w5                 // k + sao_left_class
>         and             w10, w10, #0x1F
> -        strh            w9, [sp, x10, lsl #1]
> +        strb            w9, [sp, x10]
>         bne             0b
> -        add             w6,  w6,  #7
> -        bic             w6,  w6,  #7
> -        ld1             {v16.16b-v19.16b}, [sp], #64
> -        sub             x2,  x2,  x6
> -        sub             x3,  x3,  x6
> -        movi            v20.8h,   #1
> -1:      mov             w8,  w6                    // beginning of line
> -2:      // Simple layout for accessing 16bit values
> -        // with 8bit LUT.
> -        //
> -        //   00  01  02  03  04  05  06  07
> -        // +----------------------------------->
> -        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
> -        // +----------------------------------->
> -        //    i-0     i-1     i-2     i-3
> -        ld1             {v2.8b}, [x1], #8          // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
> -        subs            w8, w8,  #8
> -        uxtl            v0.8h,  v2.8b              // load src[x]
> -        ushr            v2.8h,  v0.8h, #3          // >> BIT_DEPTH - 3
> -        shl             v1.8h,  v2.8h, #1          // low (x2, accessing short)
> -        add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
> -        sli             v1.8h,  v3.8h, #8          // shift insert index to upper byte
> -        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // table
> -        add             v1.8h,  v0.8h, v2.8h       // src[x] + table
> -        sqxtun          v4.8b,  v1.8h              // clip + narrow
> -        st1             {v4.8b}, [x0], #8          // store
> -        // done 8 pixels
> +        ldp             q16, q17, [sp], #32
> +1:
> +        ld1             {v2.8b}, [x1], x3
> +        subs            w7, w7, #1
> +        uxtl            v0.8h, v2.8b
> +        ushr            v3.8b, v2.8b, #3          // >> BIT_DEPTH - 3

Nitpick: The comment on this line seems to be misaligned with the other 
comments below - please check.

> +        tbx             v3.8b, {v16.16b-v17.16b}, v3.8b

Is there any specific reason for preferring tbx over tbl here? (I know the 
existing code used tbx.) Without having studied cycle tables, I would 
expect tbl to maybe be slightly simpler, but perhaps there's no difference 
(or tbx is faster)?


Other than these comments, this patch looks good to me, thanks - feel free 
to push.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
  2025-04-25  8:25 ` Martin Storsjö
@ 2025-04-29  7:51   ` Zhao Zhili
  2025-04-29  7:58     ` Martin Storsjö
  0 siblings, 1 reply; 5+ messages in thread
From: Zhao Zhili @ 2025-04-29  7:51 UTC (permalink / raw)
  To: FFmpeg development discussions and patches



> On Apr 25, 2025, at 16:25, Martin Storsjö <martin@martin.st> wrote:
> 
> On Tue, 15 Apr 2025, Zhao Zhili wrote:
> 
>> From: Zhao Zhili <zhilizhao@tencent.com>
>> 
>> int8_t[] is enough for offset_table of 8 bit streams.
>> 
>> On rpi5:
>>                            Before               After
>> hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)
>> hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)
>> hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)
>> hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)
>> hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)
>> hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)
>> hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)
>> hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)
>> hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)
>> hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)
>> ---
>> libavcodec/aarch64/h26x/dsp.h             |  4 +
>> libavcodec/aarch64/h26x/sao_neon.S        | 93 ++++++++++++++---------
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 +-
>> libavcodec/aarch64/vvc/dsp_init.c         |  5 +-
>> 4 files changed, 65 insertions(+), 41 deletions(-)
>> 
>> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
>> index 0fefb4d70f..6ea6a8d36a 100644
>> --- a/libavcodec/aarch64/h26x/dsp.h
>> +++ b/libavcodec/aarch64/h26x/dsp.h
>> @@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src,
>>                                        ptrdiff_t stride_dst, ptrdiff_t stride_src,
>>                                        const int16_t *sao_offset_val, int sao_left_class,
>>                                        int width, int height);
>> +void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src,
>> +                                        ptrdiff_t stride_dst, ptrdiff_t stride_src,
>> +                                        const int16_t *sao_offset_val, int sao_left_class,
>> +                                        int width, int height);
>> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>>                                          const int16_t *sao_offset_val, int eo, int width, int height);
>> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst,
>> diff --git a/libavcodec/aarch64/h26x/sao_neon.S b/libavcodec/aarch64/h26x/sao_neon.S
>> index c43820135e..60c026fe95 100644
>> --- a/libavcodec/aarch64/h26x/sao_neon.S
>> +++ b/libavcodec/aarch64/h26x/sao_neon.S
>> @@ -35,48 +35,67 @@
>> //                      int16_t *sao_offset_val, int sao_left_class,
>> //                      int width, int height)
>> function ff_h26x_sao_band_filter_8x8_8_neon, export=1
>> -        stp             xzr, xzr, [sp, #-64]!
>> +        stp             xzr, xzr, [sp, #-32]!
>>        stp             xzr, xzr, [sp, #16]
>> -        stp             xzr, xzr, [sp, #32]
>> -        stp             xzr, xzr, [sp, #48]
>>        mov             w8,  #4
>> -0:      ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1]
>> -        subs            w8,  w8,  #1
>> -        add             w10, w8,  w5               // k + sao_left_class
>> +0:
>> +        ldrsh           x9, [x4, x8, lsl #1]        // sao_offset_val[k+1]
>> +        subs            w8, w8, #1
>> +        add             w10, w8, w5                 // k + sao_left_class
>>        and             w10, w10, #0x1F
>> -        strh            w9, [sp, x10, lsl #1]
>> +        strb            w9, [sp, x10]
>>        bne             0b
>> -        add             w6,  w6,  #7
>> -        bic             w6,  w6,  #7
>> -        ld1             {v16.16b-v19.16b}, [sp], #64
>> -        sub             x2,  x2,  x6
>> -        sub             x3,  x3,  x6
>> -        movi            v20.8h,   #1
>> -1:      mov             w8,  w6                    // beginning of line
>> -2:      // Simple layout for accessing 16bit values
>> -        // with 8bit LUT.
>> -        //
>> -        //   00  01  02  03  04  05  06  07
>> -        // +----------------------------------->
>> -        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
>> -        // +----------------------------------->
>> -        //    i-0     i-1     i-2     i-3
>> -        ld1             {v2.8b}, [x1], #8          // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
>> -        subs            w8, w8,  #8
>> -        uxtl            v0.8h,  v2.8b              // load src[x]
>> -        ushr            v2.8h,  v0.8h, #3          // >> BIT_DEPTH - 3
>> -        shl             v1.8h,  v2.8h, #1          // low (x2, accessing short)
>> -        add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
>> -        sli             v1.8h,  v3.8h, #8          // shift insert index to upper byte
>> -        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // table
>> -        add             v1.8h,  v0.8h, v2.8h       // src[x] + table
>> -        sqxtun          v4.8b,  v1.8h              // clip + narrow
>> -        st1             {v4.8b}, [x0], #8          // store
>> -        // done 8 pixels
>> +        ldp             q16, q17, [sp], #32
>> +1:
>> +        ld1             {v2.8b}, [x1], x3
>> +        subs            w7, w7, #1
>> +        uxtl            v0.8h, v2.8b
>> +        ushr            v3.8b, v2.8b, #3          // >> BIT_DEPTH - 3
> 
> Nitpick: The comment on this line seems to be misaligned with the other comments below - please check.

Fixed before push.

> 
>> +        tbx             v3.8b, {v16.16b-v17.16b}, v3.8b
> 
> Is there any specific reason for preferring tbx over tbl here? (I know the existing code used tbx.) Without having studied cycle tables, I would expect tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx is faster)?

tbl can be faster. The result is quite impressive. Changed to tbl before push.

                             Before               tbx             tbl
hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)    252.3 ( 1.00x)
hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)     61.0 ( 4.57x)
hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)    864.9 ( 1.00x)
hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)    150.0 ( 6.26x)
hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)   3871.6 ( 1.00x)
hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)    550.6 ( 7.39x)
hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)   8182.6 ( 1.00x)
hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)   1185.8 ( 7.36x)
hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)  14038.9 ( 1.00x)
hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)   2078.4 ( 7.15x)

> 
> 
> Other than these comments, this patch looks good to me, thanks - feel free to push.
> 
> // Martin
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org <mailto:ffmpeg-devel-request@ffmpeg.org> with subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
  2025-04-29  7:51   ` Zhao Zhili
@ 2025-04-29  7:58     ` Martin Storsjö
  2025-04-29  8:14       ` Zhao Zhili
  0 siblings, 1 reply; 5+ messages in thread
From: Martin Storsjö @ 2025-04-29  7:58 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

On Tue, 29 Apr 2025, Zhao Zhili wrote:

>> On Apr 25, 2025, at 16:25, Martin Storsjö <martin@martin.st> wrote:
>> 
>> On Tue, 15 Apr 2025, Zhao Zhili wrote:
>> 
>> 
>>> +        tbx             v3.8b, {v16.16b-v17.16b}, v3.8b
>> 
>> Is there any specific reason for preferring tbx over tbl here? (I know the existing code used tbx.) Without having studied cycle tables, I would expect tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx is faster)?
>
> tbl can be faster. The result is quite impressive. Changed to tbl before push.
>
>                             Before               tbx             tbl
> hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)    252.3 ( 1.00x)
> hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)     61.0 ( 4.57x)
> hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)    864.9 ( 1.00x)
> hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)    150.0 ( 6.26x)
> hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)   3871.6 ( 1.00x)
> hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)    550.6 ( 7.39x)
> hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)   8182.6 ( 1.00x)
> hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)   1185.8 ( 7.36x)
> hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)  14038.9 ( 1.00x)
> hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)   2078.4 ( 7.15x)

The cycle numbers in the tbl and tbx columns seem to be identical here, 
while the relative speedup numbers differ - was this some sort of 
copypaste mistake in preparing the table? (The difference in speedup 
numbers does seem impressive.)

// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter
  2025-04-29  7:58     ` Martin Storsjö
@ 2025-04-29  8:14       ` Zhao Zhili
  0 siblings, 0 replies; 5+ messages in thread
From: Zhao Zhili @ 2025-04-29  8:14 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

[-- Attachment #1: Type: text/plain, Size: 2011 bytes --]



> On Apr 29, 2025, at 15:58, Martin Storsjö <martin@martin.st> wrote:
> 
> On Tue, 29 Apr 2025, Zhao Zhili wrote:
> 
>>> On Apr 25, 2025, at 16:25, Martin Storsjö <martin@martin.st> wrote:
>>> On Tue, 15 Apr 2025, Zhao Zhili wrote:
>>>> +        tbx             v3.8b, {v16.16b-v17.16b}, v3.8b
>>> Is there any specific reason for preferring tbx over tbl here? (I know the existing code used tbx.) Without having studied cycle tables, I would expect tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx is faster)?
>> 
>> tbl can be faster. The result is quite impressive. Changed to tbl before push.
>> 
>>                            Before               tbx             tbl
>> hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)    252.3 ( 1.00x)
>> hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)     61.0 ( 4.57x)
>> hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)    864.9 ( 1.00x)
>> hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)    150.0 ( 6.26x)
>> hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)   3871.6 ( 1.00x)
>> hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)    550.6 ( 7.39x)
>> hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)   8182.6 ( 1.00x)
>> hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)   1185.8 ( 7.36x)
>> hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)  14038.9 ( 1.00x)
>> hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)   2078.4 ( 7.15x)
> 
> The cycle numbers in the tbl and tbx columns seem to be identical here, while the relative speedup numbers differ - was this some sort of copypaste mistake in preparing the table? (The difference in speedup numbers does seem impressive.)

They are the same on A75, but not on A76/A77/X3.

tbl: 2 cycles for 1 or 2 table registers
tbx: 2 cycles for 1 table register, 4 for 2 table registers.

The code uses 2 table registers.


[-- Attachment #2: PastedGraphic-1.png --]
[-- Type: image/png, Size: 122049 bytes --]

[-- Attachment #3: Type: text/plain, Size: 282 bytes --]


> 
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".


[-- Attachment #4: Type: text/plain, Size: 251 bytes --]

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-04-29  8:15 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-15 10:01 [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter Zhao Zhili
2025-04-25  8:25 ` Martin Storsjö
2025-04-29  7:51   ` Zhao Zhili
2025-04-29  7:58     ` Martin Storsjö
2025-04-29  8:14       ` Zhao Zhili

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git