* [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter
@ 2022-04-28 13:42 J. Dekker
2022-04-28 13:42 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: add hevc sao edge 16x16 J. Dekker
` (2 more replies)
0 siblings, 3 replies; 10+ messages in thread
From: J. Dekker @ 2022-04-28 13:42 UTC (permalink / raw)
To: ffmpeg-devel
The SAO band filter can be called with non-multiples of 8, we round up
to the nearest multiple of 8 to account for this.
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 10 +++++-----
libavcodec/aarch64/hevcdsp_sao_neon.S | 8 ++++++--
2 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..c8963e6104 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -75,11 +75,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon;
- // This function is disabled, as it doesn't handle widths that aren't
- // an even multiple of 8 correctly. fate-hevc doesn't exercise that
- // for the current size, but if enabled for bigger sizes, the cases
- // of non-multiple of 8 seem to arise.
-// c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon;
+ c->sao_band_filter[0] =
+ c->sao_band_filter[1] =
+ c->sao_band_filter[2] =
+ c->sao_band_filter[3] =
+ c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon;
}
if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
index d523bf584d..e07e0cea2d 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -41,7 +41,11 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
and w10, w10, #0x1F
strh w9, [sp, x10, lsl #1]
bne 0b
+ add w6, w6, #7
+ bic w6, w6, #7
ld1 {v16.16b-v19.16b}, [sp], #64
+ sub x2, x2, x6
+ sub x3, x3, x6
movi v20.8h, #1
1: mov w8, w6 // beginning of line
2: // Simple layout for accessing 16bit values
@@ -52,7 +56,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
// |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
// +----------------------------------->
// i-0 i-1 i-2 i-3
- ld1 {v2.8b}, [x1] // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+ ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
uxtl v0.8h, v2.8b // load src[x]
ushr v2.8h, v0.8h, #3 // >> BIT_DEPTH - 3
shl v1.8h, v2.8h, #1 // low (x2, accessing short)
@@ -61,7 +65,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table
add v1.8h, v0.8h, v2.8h // src[x] + table
sqxtun v4.8b, v1.8h // clip + narrow
- st1 {v4.8b}, [x0] // store
+ st1 {v4.8b}, [x0], #8 // store
subs w8, w8, #8 // done 8 pixels
bne 2b
subs w7, w7, #1 // finished line, prep. new
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: add hevc sao edge 16x16
2022-04-28 13:42 [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter J. Dekker
@ 2022-04-28 13:42 ` J. Dekker
2022-04-28 13:42 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8 J. Dekker
2022-04-28 13:43 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter J. Dekker
2 siblings, 0 replies; 10+ messages in thread
From: J. Dekker @ 2022-04-28 13:42 UTC (permalink / raw)
To: ffmpeg-devel
bench on AWS Graviton:
hevc_sao_edge_16x16_8_c: 1857.0
hevc_sao_edge_16x16_8_neon: 211.0
hevc_sao_edge_32x32_8_c: 7802.2
hevc_sao_edge_32x32_8_neon: 808.2
hevc_sao_edge_48x48_8_c: 16764.2
hevc_sao_edge_48x48_8_neon: 1796.5
hevc_sao_edge_64x64_8_c: 32647.5
hevc_sao_edge_64x64_8_neon: 3118.5
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 8 ++-
libavcodec/aarch64/hevcdsp_sao_neon.S | 66 +++++++++++++++++++++++
2 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c8963e6104..df521bb083 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
ptrdiff_t stride_dst, ptrdiff_t stride_src,
int16_t *sao_offset_val, int sao_left_class,
int width, int height);
-
-
+void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
+ int16_t *sao_offset_val, int eo, int width, int height);
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -80,6 +80,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->sao_band_filter[2] =
c->sao_band_filter[3] =
c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon;
+ c->sao_edge_filter[1] =
+ c->sao_edge_filter[2] =
+ c->sao_edge_filter[3] =
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon;
}
if (bit_depth == 10) {
c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
index e07e0cea2d..0315c479df 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -74,3 +74,69 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1
bne 1b
ret
endfunc
+
+// ASSUMES STRIDE_SRC = 192
+.Lsao_edge_pos:
+.word 1 // horizontal
+.word 192 // vertical
+.word 192 + 1 // 45 degree
+.word 192 - 1 // 135 degree
+
+// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst,
+// int16 *sao_offset_val, int eo, int width, int height)
+function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
+ adr x7, .Lsao_edge_pos
+ ld1 {v3.8h}, [x3] // load sao_offset_val
+ add w5, w5, #0xF
+ bic w5, w5, #0xF
+ ldr w4, [x7, w4, uxtw #2] // stride_src
+ mov v3.h[7], v3.h[0] // reorder to [1,2,0,3,4]
+ mov v3.h[0], v3.h[1]
+ mov v3.h[1], v3.h[2]
+ mov v3.h[2], v3.h[7]
+ // split 16bit values into two tables
+ uzp2 v1.16b, v3.16b, v3.16b // sao_offset_val -> upper
+ uzp1 v0.16b, v3.16b, v3.16b // sao_offset_val -> lower
+ movi v2.16b, #2
+ mov x15, #192
+ // strides between end of line and next src/dst
+ sub x15, x15, x5 // stride_src - width
+ sub x16, x2, x5 // stride_dst - width
+ mov x11, x1 // copy base src
+1: // new line
+ mov x14, x5 // copy width
+ sub x12, x11, x4 // src_a (prev) = src - sao_edge_pos
+ add x13, x11, x4 // src_b (next) = src + sao_edge_pos
+2: // process 16 bytes
+ ld1 {v3.16b}, [x11], #16 // load src
+ ld1 {v4.16b}, [x12], #16 // load src_a (prev)
+ ld1 {v5.16b}, [x13], #16 // load src_b (next)
+ cmhi v16.16b, v4.16b, v3.16b // (prev > cur)
+ cmhi v17.16b, v3.16b, v4.16b // (cur > prev)
+ cmhi v18.16b, v5.16b, v3.16b // (next > cur)
+ cmhi v19.16b, v3.16b, v5.16b // (cur > next)
+ sub v20.16b, v16.16b, v17.16b // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev)
+ sub v21.16b, v18.16b, v19.16b // diff1 = CMP(cur, next) = (cur > next) - (cur < next)
+ add v20.16b, v20.16b, v21.16b // diff = diff0 + diff1
+ add v20.16b, v20.16b, v2.16b // offset_val = diff + 2
+ tbl v16.16b, {v0.16b}, v20.16b
+ tbl v17.16b, {v1.16b}, v20.16b
+ uxtl v20.8h, v3.8b // src[0:7]
+ uxtl2 v21.8h, v3.16b // src[7:15]
+ zip1 v18.16b, v16.16b, v17.16b // sao_offset_val lower ->
+ zip2 v19.16b, v16.16b, v17.16b // sao_offset_val upper ->
+ sqadd v20.8h, v18.8h, v20.8h // + sao_offset_val
+ sqadd v21.8h, v19.8h, v21.8h
+ sqxtun v3.8b, v20.8h
+ sqxtun2 v3.16b, v21.8h
+ st1 {v3.16b}, [x0], #16
+ subs x14, x14, #16 // filtered 16 bytes
+ b.ne 2b // do we have width to filter?
+ // no width to filter, setup next line
+ add x11, x11, x15 // stride src to next line
+ add x0, x0, x16 // stride dst to next line
+ subs w6, w6, #1 // filtered line
+ b.ne 1b // do we have lines to process?
+ // no lines to filter
+ ret
+endfunc
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8
2022-04-28 13:42 [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter J. Dekker
2022-04-28 13:42 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: add hevc sao edge 16x16 J. Dekker
@ 2022-04-28 13:42 ` J. Dekker
2022-04-28 19:50 ` Martin Storsjö
2022-04-28 13:43 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter J. Dekker
2 siblings, 1 reply; 10+ messages in thread
From: J. Dekker @ 2022-04-28 13:42 UTC (permalink / raw)
To: ffmpeg-devel
bench on AWS Graviton:
hevc_sao_edge_8x8_8_c: 516.0
hevc_sao_edge_8x8_8_neon: 81.0
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 3 ++
libavcodec/aarch64/hevcdsp_sao_neon.S | 51 +++++++++++++++++++++++
2 files changed, 54 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index df521bb083..2002530266 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -59,6 +59,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
int width, int height);
void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
int16_t *sao_offset_val, int eo, int width, int height);
+void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
+ int16_t *sao_offset_val, int eo, int width, int height);
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -80,6 +82,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->sao_band_filter[2] =
c->sao_band_filter[3] =
c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon;
+ c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8x8_8_neon;
c->sao_edge_filter[1] =
c->sao_edge_filter[2] =
c->sao_edge_filter[3] =
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
index 0315c479df..efd8112af4 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -140,3 +140,54 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
// no lines to filter
ret
endfunc
+
+// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
+// int16 *sao_offset_val, int eo, int width, int height)
+function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
+ adr x7, .Lsao_edge_pos
+ ldr w4, [x7, w4, uxtw #2]
+ ld1 {v3.8h}, [x3]
+ mov v3.h[7], v3.h[0]
+ mov v3.h[0], v3.h[1]
+ mov v3.h[1], v3.h[2]
+ mov v3.h[2], v3.h[7]
+ uzp2 v1.16b, v3.16b, v3.16b
+ uzp1 v0.16b, v3.16b, v3.16b
+ movi v2.16b, #2
+ add x16, x0, x2
+ lsl x2, x2, #1
+ mov x15, #192
+ mov x8, x1
+ sub x9, x1, x4
+ add x10, x1, x4
+ lsr w17, w6, #1
+1: ld1 {v3.d}[0], [ x8], x15
+ ld1 {v4.d}[0], [ x9], x15
+ ld1 {v5.d}[0], [x10], x15
+ ld1 {v3.d}[1], [ x8], x15
+ ld1 {v4.d}[1], [ x9], x15
+ ld1 {v5.d}[1], [x10], x15
+ cmhi v16.16b, v4.16b, v3.16b
+ cmhi v17.16b, v3.16b, v4.16b
+ cmhi v18.16b, v5.16b, v3.16b
+ cmhi v19.16b, v3.16b, v5.16b
+ sub v20.16b, v16.16b, v17.16b
+ sub v21.16b, v18.16b, v19.16b
+ add v20.16b, v20.16b, v21.16b
+ add v20.16b, v20.16b, v2.16b
+ tbl v16.16b, {v0.16b}, v20.16b
+ tbl v17.16b, {v1.16b}, v20.16b
+ uxtl v20.8h, v3.8b
+ uxtl2 v21.8h, v3.16b
+ zip1 v18.16b, v16.16b, v17.16b
+ zip2 v19.16b, v16.16b, v17.16b
+ sqadd v20.8h, v18.8h, v20.8h
+ sqadd v21.8h, v19.8h, v21.8h
+ sqxtun v6.8b, v20.8h
+ sqxtun v7.8b, v21.8h
+ st1 {v6.8b}, [ x0], x2
+ st1 {v7.8b}, [x16], x2
+ subs x17, x17, #1
+ b.ne 1b
+ ret
+endfunc
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter
2022-04-28 13:42 [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter J. Dekker
2022-04-28 13:42 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: add hevc sao edge 16x16 J. Dekker
2022-04-28 13:42 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8 J. Dekker
@ 2022-04-28 13:43 ` J. Dekker
2 siblings, 0 replies; 10+ messages in thread
From: J. Dekker @ 2022-04-28 13:43 UTC (permalink / raw)
To: ffmpeg-devel
On 28 Apr 2022, at 15:42, J. Dekker wrote:
> The SAO band filter can be called with non-multiples of 8, we round up
> to the nearest multiple of 8 to account for this.
Martin mentioned he wanted checkasm to check these extra cases. Still working on the best way to do this, will post soon.
--
J. Dekker
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8
2022-04-28 13:42 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8 J. Dekker
@ 2022-04-28 19:50 ` Martin Storsjö
2022-05-17 11:53 ` J. Dekker
2022-05-17 11:54 ` [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test J. Dekker
0 siblings, 2 replies; 10+ messages in thread
From: Martin Storsjö @ 2022-04-28 19:50 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Thu, 28 Apr 2022, J. Dekker wrote:
> bench on AWS Graviton:
>
> hevc_sao_edge_8x8_8_c: 516.0
> hevc_sao_edge_8x8_8_neon: 81.0
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 3 ++
> libavcodec/aarch64/hevcdsp_sao_neon.S | 51 +++++++++++++++++++++++
> 2 files changed, 54 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index df521bb083..2002530266 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -59,6 +59,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
> int width, int height);
> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
> int16_t *sao_offset_val, int eo, int width, int height);
> +void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
> + int16_t *sao_offset_val, int eo, int width, int height);
>
> av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> {
> @@ -80,6 +82,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> c->sao_band_filter[2] =
> c->sao_band_filter[3] =
> c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon;
> + c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8x8_8_neon;
> c->sao_edge_filter[1] =
> c->sao_edge_filter[2] =
> c->sao_edge_filter[3] =
> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
> index 0315c479df..efd8112af4 100644
> --- a/libavcodec/aarch64/hevcdsp_sao_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
> @@ -140,3 +140,54 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1
> // no lines to filter
> ret
> endfunc
> +
> +// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
> +// int16 *sao_offset_val, int eo, int width, int height)
> +function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
> + adr x7, .Lsao_edge_pos
> + ldr w4, [x7, w4, uxtw #2]
> + ld1 {v3.8h}, [x3]
> + mov v3.h[7], v3.h[0]
> + mov v3.h[0], v3.h[1]
> + mov v3.h[1], v3.h[2]
> + mov v3.h[2], v3.h[7]
> + uzp2 v1.16b, v3.16b, v3.16b
> + uzp1 v0.16b, v3.16b, v3.16b
> + movi v2.16b, #2
> + add x16, x0, x2
> + lsl x2, x2, #1
> + mov x15, #192
> + mov x8, x1
> + sub x9, x1, x4
> + add x10, x1, x4
> + lsr w17, w6, #1
Compared with the previously applied (and reverted) patch, here, you
previously had "mov x17, #4". I guess that'd mean the function only ever
produced 8 output rows, while it now uses the real height parameter? Was
this change a no-op (height is always 8?) or was this another hidden bug
in the previous implementation?
> +1: ld1 {v3.d}[0], [ x8], x15
> + ld1 {v4.d}[0], [ x9], x15
> + ld1 {v5.d}[0], [x10], x15
> + ld1 {v3.d}[1], [ x8], x15
> + ld1 {v4.d}[1], [ x9], x15
> + ld1 {v5.d}[1], [x10], x15
> + cmhi v16.16b, v4.16b, v3.16b
> + cmhi v17.16b, v3.16b, v4.16b
> + cmhi v18.16b, v5.16b, v3.16b
> + cmhi v19.16b, v3.16b, v5.16b
> + sub v20.16b, v16.16b, v17.16b
> + sub v21.16b, v18.16b, v19.16b
> + add v20.16b, v20.16b, v21.16b
> + add v20.16b, v20.16b, v2.16b
> + tbl v16.16b, {v0.16b}, v20.16b
> + tbl v17.16b, {v1.16b}, v20.16b
> + uxtl v20.8h, v3.8b
> + uxtl2 v21.8h, v3.16b
> + zip1 v18.16b, v16.16b, v17.16b
> + zip2 v19.16b, v16.16b, v17.16b
> + sqadd v20.8h, v18.8h, v20.8h
> + sqadd v21.8h, v19.8h, v21.8h
> + sqxtun v6.8b, v20.8h
> + sqxtun v7.8b, v21.8h
> + st1 {v6.8b}, [ x0], x2
> + st1 {v7.8b}, [x16], x2
> + subs x17, x17, #1
This could be "subs w6, w6, #2" and you wouldn't need the lsr instruction
at all. And you could place the subs before the two st1 instructions to
reduce latency between them a little. (The same thing goes for moving subs
further away from the branch that uses its outcome in the previous patch
too.) But as this is just a reapply of a previously committed and reverted
patch, I guess it's fine this way too...
The patchset otherwise looks good to me, modulo the question about the
difference to the previous patchset above.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8
2022-04-28 19:50 ` Martin Storsjö
@ 2022-05-17 11:53 ` J. Dekker
2022-05-17 11:54 ` [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test J. Dekker
1 sibling, 0 replies; 10+ messages in thread
From: J. Dekker @ 2022-05-17 11:53 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On 28 Apr 2022, at 21:50, Martin Storsjö wrote:
> [...]
> Compared with the previously applied (and reverted) patch, here, you previously had "mov x17, #4". I guess that'd mean the function only ever produced 8 output rows, while it now uses the real height parameter? Was this change a no-op (height is always 8?) or was this another hidden bug in the previous implementation?
>
Yes, this was another bug in a previous implementation which I've fixed in both of the newer versions.
>> [...]
>> + sqxtun v6.8b, v20.8h
>> + sqxtun v7.8b, v21.8h
>> + st1 {v6.8b}, [ x0], x2
>> + st1 {v7.8b}, [x16], x2
>> + subs x17, x17, #1
>
> This could be "subs w6, w6, #2" and you wouldn't need the lsr instruction at all. And you could place the subs before the two st1 instructions to reduce latency between them a little. (The same thing goes for moving subs further away from the branch that uses its outcome in the previous patch too.) But as this is just a reapply of a previously committed and reverted patch, I guess it's fine this way too...
Will do before apply if you're fine with it, not too complex change.
> The patchset otherwise looks good to me, modulo the question about the difference to the previous patchset above.
--
J. Dekker
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test
2022-04-28 19:50 ` Martin Storsjö
2022-05-17 11:53 ` J. Dekker
@ 2022-05-17 11:54 ` J. Dekker
2022-05-24 20:27 ` Martin Storsjö
1 sibling, 1 reply; 10+ messages in thread
From: J. Dekker @ 2022-05-17 11:54 UTC (permalink / raw)
To: ffmpeg-devel
The HEVC decoder can call these functions with smaller widths than the
functions themselves are designed to operate on so we should only check
the relevant output
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
tests/checkasm/hevc_sao.c | 51 ++++++++++++++++++++++++---------------
1 file changed, 31 insertions(+), 20 deletions(-)
diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
index 6b750758e2..72cdb87dd1 100644
--- a/tests/checkasm/hevc_sao.c
+++ b/tests/checkasm/hevc_sao.c
@@ -78,20 +78,26 @@ static void check_sao_band(HEVCDSPContext h, int bit_depth)
for (i = 0; i <= 4; i++) {
int block_size = sao_size[i];
+ int prev_size = i > 0 ? sao_size[i - 1] : 0;
ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
int16_t *sao_offset_val, int sao_left_class, int width, int height);
- randomize_buffers(src0, src1, BUF_SIZE);
- randomize_buffers2(offset_val, OFFSET_LENGTH);
- memset(dst0, 0, BUF_SIZE);
- memset(dst1, 0, BUF_SIZE);
-
- if (check_func(h.sao_band_filter[i], "hevc_sao_band_%dx%d_%d", block_size, block_size, bit_depth)) {
- call_ref(dst0, src0, stride, stride, offset_val, left_class, block_size, block_size);
- call_new(dst1, src1, stride, stride, offset_val, left_class, block_size, block_size);
- if (memcmp(dst0, dst1, BUF_SIZE))
- fail();
+ if (check_func(h.sao_band_filter[i], "hevc_sao_band_%d_%d", block_size, bit_depth)) {
+
+ for (int w = prev_size + 4; w <= block_size; w += 4) {
+ randomize_buffers(src0, src1, BUF_SIZE);
+ randomize_buffers2(offset_val, OFFSET_LENGTH);
+ memset(dst0, 0, BUF_SIZE);
+ memset(dst1, 0, BUF_SIZE);
+
+ call_ref(dst0, src0, stride, stride, offset_val, left_class, w, block_size);
+ call_new(dst1, src1, stride, stride, offset_val, left_class, w, block_size);
+ for (int j = 0; j < block_size; j++) {
+ if (memcmp(dst0 + j*MAX_PB_SIZE*2, dst1 + j*MAX_PB_SIZE*2, w))
+ fail();
+ }
+ }
bench_new(dst1, src1, stride, stride, offset_val, left_class, block_size, block_size);
}
}
@@ -109,21 +115,26 @@ static void check_sao_edge(HEVCDSPContext h, int bit_depth)
for (i = 0; i <= 4; i++) {
int block_size = sao_size[i];
+ int prev_size = i > 0 ? sao_size[i - 1] : 0;
ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
int offset = (AV_INPUT_BUFFER_PADDING_SIZE + PIXEL_STRIDE)*SIZEOF_PIXEL;
declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst,
int16_t *sao_offset_val, int eo, int width, int height);
- randomize_buffers(src0, src1, BUF_SIZE);
- randomize_buffers2(offset_val, OFFSET_LENGTH);
- memset(dst0, 0, BUF_SIZE);
- memset(dst1, 0, BUF_SIZE);
-
- if (check_func(h.sao_edge_filter[i], "hevc_sao_edge_%dx%d_%d", block_size, block_size, bit_depth)) {
- call_ref(dst0, src0 + offset, stride, offset_val, eo, block_size, block_size);
- call_new(dst1, src1 + offset, stride, offset_val, eo, block_size, block_size);
- if (memcmp(dst0, dst1, BUF_SIZE))
- fail();
+ for (int w = prev_size + 4; w <= block_size; w += 4) {
+ randomize_buffers(src0, src1, BUF_SIZE);
+ randomize_buffers2(offset_val, OFFSET_LENGTH);
+ memset(dst0, 0, BUF_SIZE);
+ memset(dst1, 0, BUF_SIZE);
+
+ if (check_func(h.sao_edge_filter[i], "hevc_sao_edge_%d_%d", block_size, bit_depth)) {
+ call_ref(dst0, src0 + offset, stride, offset_val, eo, w, block_size);
+ call_new(dst1, src1 + offset, stride, offset_val, eo, w, block_size);
+ for (int j = 0; j < block_size; j++) {
+ if (memcmp(dst0 + j*MAX_PB_SIZE*2, dst1 + j*MAX_PB_SIZE*2, w))
+ fail();
+ }
+ }
bench_new(dst1, src1 + offset, stride, offset_val, eo, block_size, block_size);
}
}
--
2.32.0 (Apple Git-132)
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test
2022-05-17 11:54 ` [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test J. Dekker
@ 2022-05-24 20:27 ` Martin Storsjö
2022-05-25 7:21 ` J. Dekker
0 siblings, 1 reply; 10+ messages in thread
From: Martin Storsjö @ 2022-05-24 20:27 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Tue, 17 May 2022, J. Dekker wrote:
> The HEVC decoder can call these functions with smaller widths than the
> functions themselves are designed to operate on so we should only check
> the relevant output
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> tests/checkasm/hevc_sao.c | 51 ++++++++++++++++++++++++---------------
> 1 file changed, 31 insertions(+), 20 deletions(-)
>
> diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
> index 6b750758e2..72cdb87dd1 100644
> --- a/tests/checkasm/hevc_sao.c
> +++ b/tests/checkasm/hevc_sao.c
> @@ -78,20 +78,26 @@ static void check_sao_band(HEVCDSPContext h, int bit_depth)
>
> for (i = 0; i <= 4; i++) {
> int block_size = sao_size[i];
> + int prev_size = i > 0 ? sao_size[i - 1] : 0;
> ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
> declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
> int16_t *sao_offset_val, int sao_left_class, int width, int height);
>
> - randomize_buffers(src0, src1, BUF_SIZE);
> - randomize_buffers2(offset_val, OFFSET_LENGTH);
> - memset(dst0, 0, BUF_SIZE);
> - memset(dst1, 0, BUF_SIZE);
> -
> - if (check_func(h.sao_band_filter[i], "hevc_sao_band_%dx%d_%d", block_size, block_size, bit_depth)) {
> - call_ref(dst0, src0, stride, stride, offset_val, left_class, block_size, block_size);
> - call_new(dst1, src1, stride, stride, offset_val, left_class, block_size, block_size);
> - if (memcmp(dst0, dst1, BUF_SIZE))
> - fail();
> + if (check_func(h.sao_band_filter[i], "hevc_sao_band_%d_%d", block_size, bit_depth)) {
> +
> + for (int w = prev_size + 4; w <= block_size; w += 4) {
> + randomize_buffers(src0, src1, BUF_SIZE);
> + randomize_buffers2(offset_val, OFFSET_LENGTH);
> + memset(dst0, 0, BUF_SIZE);
> + memset(dst1, 0, BUF_SIZE);
> +
> + call_ref(dst0, src0, stride, stride, offset_val, left_class, w, block_size);
> + call_new(dst1, src1, stride, stride, offset_val, left_class, w, block_size);
> + for (int j = 0; j < block_size; j++) {
> + if (memcmp(dst0 + j*MAX_PB_SIZE*2, dst1 + j*MAX_PB_SIZE*2, w))
I'm not quite sure about the MAX_PB_SIZE*2 part here - shouldn't that be
just the 'stride' variable instead? And for the compared length ('w'),
shouldn't that be multiplied by SIZEOF_PIXEL?
Other than that, this looks good to me!
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test
2022-05-24 20:27 ` Martin Storsjö
@ 2022-05-25 7:21 ` J. Dekker
2022-05-25 7:40 ` Martin Storsjö
0 siblings, 1 reply; 10+ messages in thread
From: J. Dekker @ 2022-05-25 7:21 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On 24 May 2022, at 22:27, Martin Storsjö wrote:
> On Tue, 17 May 2022, J. Dekker wrote:
>
>> The HEVC decoder can call these functions with smaller widths than the
>> functions themselves are designed to operate on so we should only check
>> the relevant output
>>
>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>> ---
>> tests/checkasm/hevc_sao.c | 51 ++++++++++++++++++++++++---------------
>> 1 file changed, 31 insertions(+), 20 deletions(-)
>>
>> diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
>> index 6b750758e2..72cdb87dd1 100644
>> --- a/tests/checkasm/hevc_sao.c
>> +++ b/tests/checkasm/hevc_sao.c
>> @@ -78,20 +78,26 @@ static void check_sao_band(HEVCDSPContext h, int bit_depth)
>>
>> for (i = 0; i <= 4; i++) {
>> int block_size = sao_size[i];
>> + int prev_size = i > 0 ? sao_size[i - 1] : 0;
>> ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
>> declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
>> int16_t *sao_offset_val, int sao_left_class, int width, int height);
>>
>> - randomize_buffers(src0, src1, BUF_SIZE);
>> - randomize_buffers2(offset_val, OFFSET_LENGTH);
>> - memset(dst0, 0, BUF_SIZE);
>> - memset(dst1, 0, BUF_SIZE);
>> -
>> - if (check_func(h.sao_band_filter[i], "hevc_sao_band_%dx%d_%d", block_size, block_size, bit_depth)) {
>> - call_ref(dst0, src0, stride, stride, offset_val, left_class, block_size, block_size);
>> - call_new(dst1, src1, stride, stride, offset_val, left_class, block_size, block_size);
>> - if (memcmp(dst0, dst1, BUF_SIZE))
>> - fail();
>> + if (check_func(h.sao_band_filter[i], "hevc_sao_band_%d_%d", block_size, bit_depth)) {
>> +
>> + for (int w = prev_size + 4; w <= block_size; w += 4) {
>> + randomize_buffers(src0, src1, BUF_SIZE);
>> + randomize_buffers2(offset_val, OFFSET_LENGTH);
>> + memset(dst0, 0, BUF_SIZE);
>> + memset(dst1, 0, BUF_SIZE);
>> +
>> + call_ref(dst0, src0, stride, stride, offset_val, left_class, w, block_size);
>> + call_new(dst1, src1, stride, stride, offset_val, left_class, w, block_size);
>> + for (int j = 0; j < block_size; j++) {
>> + if (memcmp(dst0 + j*MAX_PB_SIZE*2, dst1 + j*MAX_PB_SIZE*2, w))
>
> I'm not quite sure about the MAX_PB_SIZE*2 part here - shouldn't that be just the 'stride' variable instead? And for the compared length ('w'), shouldn't that be multiplied by SIZEOF_PIXEL?
>
> Other than that, this looks good to me!
Pushed with this fix. Rest of the set as-is. I have an extra SAO patch but don't want to delay this set further.
Thanks,
--
J. Dekker
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test
2022-05-25 7:21 ` J. Dekker
@ 2022-05-25 7:40 ` Martin Storsjö
0 siblings, 0 replies; 10+ messages in thread
From: Martin Storsjö @ 2022-05-25 7:40 UTC (permalink / raw)
To: FFmpeg development discussions and patches
On Wed, 25 May 2022, J. Dekker wrote:
>
>
> On 24 May 2022, at 22:27, Martin Storsjö wrote:
>
>> On Tue, 17 May 2022, J. Dekker wrote:
>>
>>> The HEVC decoder can call these functions with smaller widths than the
>>> functions themselves are designed to operate on so we should only check
>>> the relevant output
>>>
>>> Signed-off-by: J. Dekker <jdek@itanimul.li>
>>> ---
>>> tests/checkasm/hevc_sao.c | 51 ++++++++++++++++++++++++---------------
>>> 1 file changed, 31 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c
>>> index 6b750758e2..72cdb87dd1 100644
>>> --- a/tests/checkasm/hevc_sao.c
>>> +++ b/tests/checkasm/hevc_sao.c
>>> @@ -78,20 +78,26 @@ static void check_sao_band(HEVCDSPContext h, int bit_depth)
>>>
>>> for (i = 0; i <= 4; i++) {
>>> int block_size = sao_size[i];
>>> + int prev_size = i > 0 ? sao_size[i - 1] : 0;
>>> ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL;
>>> declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
>>> int16_t *sao_offset_val, int sao_left_class, int width, int height);
>>>
>>> - randomize_buffers(src0, src1, BUF_SIZE);
>>> - randomize_buffers2(offset_val, OFFSET_LENGTH);
>>> - memset(dst0, 0, BUF_SIZE);
>>> - memset(dst1, 0, BUF_SIZE);
>>> -
>>> - if (check_func(h.sao_band_filter[i], "hevc_sao_band_%dx%d_%d", block_size, block_size, bit_depth)) {
>>> - call_ref(dst0, src0, stride, stride, offset_val, left_class, block_size, block_size);
>>> - call_new(dst1, src1, stride, stride, offset_val, left_class, block_size, block_size);
>>> - if (memcmp(dst0, dst1, BUF_SIZE))
>>> - fail();
>>> + if (check_func(h.sao_band_filter[i], "hevc_sao_band_%d_%d", block_size, bit_depth)) {
>>> +
>>> + for (int w = prev_size + 4; w <= block_size; w += 4) {
>>> + randomize_buffers(src0, src1, BUF_SIZE);
>>> + randomize_buffers2(offset_val, OFFSET_LENGTH);
>>> + memset(dst0, 0, BUF_SIZE);
>>> + memset(dst1, 0, BUF_SIZE);
>>> +
>>> + call_ref(dst0, src0, stride, stride, offset_val, left_class, w, block_size);
>>> + call_new(dst1, src1, stride, stride, offset_val, left_class, w, block_size);
>>> + for (int j = 0; j < block_size; j++) {
>>> + if (memcmp(dst0 + j*MAX_PB_SIZE*2, dst1 + j*MAX_PB_SIZE*2, w))
>>
>> I'm not quite sure about the MAX_PB_SIZE*2 part here - shouldn't that be just the 'stride' variable instead? And for the compared length ('w'), shouldn't that be multiplied by SIZEOF_PIXEL?
>>
>> Other than that, this looks good to me!
>
> Pushed with this fix. Rest of the set as-is. I have an extra SAO patch
> but don't want to delay this set further.
Thanks! Yes, it's better to land the previously reviewed and good parts,
to keep the focus in the reviews.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2022-05-25 7:40 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-28 13:42 [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter J. Dekker
2022-04-28 13:42 ` [FFmpeg-devel] [PATCH 2/3] lavc/aarch64: add hevc sao edge 16x16 J. Dekker
2022-04-28 13:42 ` [FFmpeg-devel] [PATCH 3/3] lavc/aarch64: add hevc sao edge 8x8 J. Dekker
2022-04-28 19:50 ` Martin Storsjö
2022-05-17 11:53 ` J. Dekker
2022-05-17 11:54 ` [FFmpeg-devel] [PATCH] checkasm: improve hevc_sao test J. Dekker
2022-05-24 20:27 ` Martin Storsjö
2022-05-25 7:21 ` J. Dekker
2022-05-25 7:40 ` Martin Storsjö
2022-04-28 13:43 ` [FFmpeg-devel] [PATCH 1/3] lavc/aarch64: fix hevc sao band filter J. Dekker
Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
This inbox may be cloned and mirrored by anyone:
git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git
# If you have public-inbox 1.1+ installed, you may
# initialize and index your mirror using the following commands:
public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
ffmpegdev@gitmailbox.com
public-inbox-index ffmpegdev
Example config snippet for mirrors.
AGPL code for this site: git clone https://public-inbox.org/public-inbox.git