Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
 help / color / mirror / Atom feed
* [FFmpeg-devel] [PATCH] avcodec/ppc/h264dsp: Fix unaligned stores
@ 2024-03-12 22:15 Andreas Rheinhardt
  2024-03-13 11:30 ` [FFmpeg-devel] [PATCH v2] " Andreas Rheinhardt
  0 siblings, 1 reply; 7+ messages in thread
From: Andreas Rheinhardt @ 2024-03-12 22:15 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Andreas Rheinhardt

Exposed by http://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
I only tested that it compiles (and the size of .text even
goes down a bit, presumably because of the elimination of
int_dst_stride (which basically means that the lowest two bits
of dst_stride had to be zeroed (because the compiler didn't
know that they already are zero)).

 libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index c02733dda2..21f8c13ca9 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
                              register vec_u8 r0, register vec_u8 r1,
                              register vec_u8 r2, register vec_u8 r3) {
     DECLARE_ALIGNED(16, unsigned char, result)[64];
-    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
-    int int_dst_stride = dst_stride/4;
+    uint32_t *src_int = (uint32_t *)result;
 
     vec_st(r0, 0, result);
     vec_st(r1, 16, result);
     vec_st(r2, 32, result);
     vec_st(r3, 48, result);
     /* FIXME: there has to be a better way!!!! */
-    *dst_int = *src_int;
-    *(dst_int+   int_dst_stride) = *(src_int + 1);
-    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
-    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
-    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
-    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
-    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
-    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
-    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
-    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
-    *(dst_int+10*int_dst_stride) = *(src_int + 10);
-    *(dst_int+11*int_dst_stride) = *(src_int + 11);
-    *(dst_int+12*int_dst_stride) = *(src_int + 12);
-    *(dst_int+13*int_dst_stride) = *(src_int + 13);
-    *(dst_int+14*int_dst_stride) = *(src_int + 14);
-    *(dst_int+15*int_dst_stride) = *(src_int + 15);
+    AV_WN32(dst,                   src_int[0]);
+    AV_WN32(dst +      dst_stride, src_int[1]);
+    AV_WN32(dst +  2 * dst_stride, src_int[2]);
+    AV_WN32(dst +  3 * dst_stride, src_int[3]);
+    AV_WN32(dst +  4 * dst_stride, src_int[4]);
+    AV_WN32(dst +  5 * dst_stride, src_int[5]);
+    AV_WN32(dst +  6 * dst_stride, src_int[6]);
+    AV_WN32(dst +  7 * dst_stride, src_int[7]);
+    AV_WN32(dst +  8 * dst_stride, src_int[8]);
+    AV_WN32(dst +  9 * dst_stride, src_int[9]);
+    AV_WN32(dst + 10 * dst_stride, src_int[10]);
+    AV_WN32(dst + 11 * dst_stride, src_int[11]);
+    AV_WN32(dst + 12 * dst_stride, src_int[12]);
+    AV_WN32(dst + 13 * dst_stride, src_int[13]);
+    AV_WN32(dst + 14 * dst_stride, src_int[14]);
+    AV_WN32(dst + 15 * dst_stride, src_int[15]);
 }
 
 /** @brief performs a 6x16 transpose of data in src, and stores it to dst
-- 
2.40.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [FFmpeg-devel] [PATCH v2] avcodec/ppc/h264dsp: Fix unaligned stores
  2024-03-12 22:15 [FFmpeg-devel] [PATCH] avcodec/ppc/h264dsp: Fix unaligned stores Andreas Rheinhardt
@ 2024-03-13 11:30 ` Andreas Rheinhardt
  2024-03-13 12:04   ` James Almer
  2024-03-14 19:13   ` Sean McGovern
  0 siblings, 2 replies; 7+ messages in thread
From: Andreas Rheinhardt @ 2024-03-13 11:30 UTC (permalink / raw)
  To: ffmpeg-devel; +Cc: Andreas Rheinhardt

Also fix an effective-type violation.
Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index c02733dda2..f50f2553a2 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
                              register vec_u8 r0, register vec_u8 r1,
                              register vec_u8 r2, register vec_u8 r3) {
     DECLARE_ALIGNED(16, unsigned char, result)[64];
-    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
-    int int_dst_stride = dst_stride/4;
+    uint32_t *src_int = (uint32_t *)result;
 
     vec_st(r0, 0, result);
     vec_st(r1, 16, result);
     vec_st(r2, 32, result);
     vec_st(r3, 48, result);
     /* FIXME: there has to be a better way!!!! */
-    *dst_int = *src_int;
-    *(dst_int+   int_dst_stride) = *(src_int + 1);
-    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
-    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
-    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
-    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
-    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
-    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
-    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
-    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
-    *(dst_int+10*int_dst_stride) = *(src_int + 10);
-    *(dst_int+11*int_dst_stride) = *(src_int + 11);
-    *(dst_int+12*int_dst_stride) = *(src_int + 12);
-    *(dst_int+13*int_dst_stride) = *(src_int + 13);
-    *(dst_int+14*int_dst_stride) = *(src_int + 14);
-    *(dst_int+15*int_dst_stride) = *(src_int + 15);
+    AV_WN32(dst,                   AV_RN32A(src_int + 0));
+    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
+    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
+    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
+    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
+    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
+    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
+    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
+    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
+    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
+    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
+    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
+    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
+    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
+    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
+    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
 }
 
 /** @brief performs a 6x16 transpose of data in src, and stores it to dst
-- 
2.40.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2] avcodec/ppc/h264dsp: Fix unaligned stores
  2024-03-13 11:30 ` [FFmpeg-devel] [PATCH v2] " Andreas Rheinhardt
@ 2024-03-13 12:04   ` James Almer
  2024-03-13 12:10     ` Andreas Rheinhardt
  2024-03-14 19:13   ` Sean McGovern
  1 sibling, 1 reply; 7+ messages in thread
From: James Almer @ 2024-03-13 12:04 UTC (permalink / raw)
  To: ffmpeg-devel

On 3/13/2024 8:30 AM, Andreas Rheinhardt wrote:
> Also fix an effective-type violation.
> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
>   libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>   1 file changed, 17 insertions(+), 18 deletions(-)
> 
> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
> index c02733dda2..f50f2553a2 100644
> --- a/libavcodec/ppc/h264dsp.c
> +++ b/libavcodec/ppc/h264dsp.c
> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>                                register vec_u8 r0, register vec_u8 r1,
>                                register vec_u8 r2, register vec_u8 r3) {
>       DECLARE_ALIGNED(16, unsigned char, result)[64];
> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
> -    int int_dst_stride = dst_stride/4;
> +    uint32_t *src_int = (uint32_t *)result;
>   
>       vec_st(r0, 0, result);
>       vec_st(r1, 16, result);
>       vec_st(r2, 32, result);
>       vec_st(r3, 48, result);
>       /* FIXME: there has to be a better way!!!! */
> -    *dst_int = *src_int;
> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));

Is there any benefit using AV_RN32A() when src_int is already a pointer 
to a uint32_t?

>   }
>   
>   /** @brief performs a 6x16 transpose of data in src, and stores it to dst
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2] avcodec/ppc/h264dsp: Fix unaligned stores
  2024-03-13 12:04   ` James Almer
@ 2024-03-13 12:10     ` Andreas Rheinhardt
  0 siblings, 0 replies; 7+ messages in thread
From: Andreas Rheinhardt @ 2024-03-13 12:10 UTC (permalink / raw)
  To: ffmpeg-devel

James Almer:
> On 3/13/2024 8:30 AM, Andreas Rheinhardt wrote:
>> Also fix an effective-type violation.
>> Exposed by
>> https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
>> ---
>>   libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>>   1 file changed, 17 insertions(+), 18 deletions(-)
>>
>> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
>> index c02733dda2..f50f2553a2 100644
>> --- a/libavcodec/ppc/h264dsp.c
>> +++ b/libavcodec/ppc/h264dsp.c
>> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int
>> dst_stride,
>>                                register vec_u8 r0, register vec_u8 r1,
>>                                register vec_u8 r2, register vec_u8 r3) {
>>       DECLARE_ALIGNED(16, unsigned char, result)[64];
>> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
>> -    int int_dst_stride = dst_stride/4;
>> +    uint32_t *src_int = (uint32_t *)result;
>>         vec_st(r0, 0, result);
>>       vec_st(r1, 16, result);
>>       vec_st(r2, 32, result);
>>       vec_st(r3, 48, result);
>>       /* FIXME: there has to be a better way!!!! */
>> -    *dst_int = *src_int;
>> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
>> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
>> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
>> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
>> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
>> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
>> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
>> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
>> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
>> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
>> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
>> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
>> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
>> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
>> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
>> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
>> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
>> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
>> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
>> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
>> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
>> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
>> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
>> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
>> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
>> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
>> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
>> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
>> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
>> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
>> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
> 
> Is there any benefit using AV_RN32A() when src_int is already a pointer
> to a uint32_t?
> 

Simply reading via src_int[idx] would be a violation of the
effective-type rules (you would read from an array of unsigned char via
an lvalue of type uint32_t).
Alternatively, one could use a union of DECLARE_ALIGNED(16, unsigned
char, result)[64] and uint32_t[16], the former to store these vectors,
the latter to read the values from.

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2] avcodec/ppc/h264dsp: Fix unaligned stores
  2024-03-13 11:30 ` [FFmpeg-devel] [PATCH v2] " Andreas Rheinhardt
  2024-03-13 12:04   ` James Almer
@ 2024-03-14 19:13   ` Sean McGovern
  2024-03-14 19:23     ` James Almer
  2024-03-14 19:34     ` Andreas Rheinhardt
  1 sibling, 2 replies; 7+ messages in thread
From: Sean McGovern @ 2024-03-14 19:13 UTC (permalink / raw)
  To: FFmpeg development discussions and patches

Andreas:

On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> wrote:
>
> Also fix an effective-type violation.
> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
>  libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>  1 file changed, 17 insertions(+), 18 deletions(-)
>
> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
> index c02733dda2..f50f2553a2 100644
> --- a/libavcodec/ppc/h264dsp.c
> +++ b/libavcodec/ppc/h264dsp.c
> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>                               register vec_u8 r0, register vec_u8 r1,
>                               register vec_u8 r2, register vec_u8 r3) {
>      DECLARE_ALIGNED(16, unsigned char, result)[64];
> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
> -    int int_dst_stride = dst_stride/4;
> +    uint32_t *src_int = (uint32_t *)result;
>
>      vec_st(r0, 0, result);
>      vec_st(r1, 16, result);
>      vec_st(r2, 32, result);
>      vec_st(r3, 48, result);
>      /* FIXME: there has to be a better way!!!! */
> -    *dst_int = *src_int;
> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
>  }
>
>  /** @brief performs a 6x16 transpose of data in src, and stores it to dst
> --
> 2.40.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

First of all, thank you for looking into this.

Second, do we feel that this change covers the FIXME immediately above
it that exclaims "there has to be a better way!!!!"?
If so, we can remove the comment.

I did not perform a full FATE run as it is expensive on my QEMU setup,
but I can confirm that this fixes the checkasm-h264dsp test under GCC
UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le).

Thanks,
Sean McGovern
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2] avcodec/ppc/h264dsp: Fix unaligned stores
  2024-03-14 19:13   ` Sean McGovern
@ 2024-03-14 19:23     ` James Almer
  2024-03-14 19:34     ` Andreas Rheinhardt
  1 sibling, 0 replies; 7+ messages in thread
From: James Almer @ 2024-03-14 19:23 UTC (permalink / raw)
  To: ffmpeg-devel

On 3/14/2024 4:13 PM, Sean McGovern wrote:
> Andreas:
> 
> On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt
> <andreas.rheinhardt@outlook.com> wrote:
>>
>> Also fix an effective-type violation.
>> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
>> ---
>>   libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>>   1 file changed, 17 insertions(+), 18 deletions(-)
>>
>> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
>> index c02733dda2..f50f2553a2 100644
>> --- a/libavcodec/ppc/h264dsp.c
>> +++ b/libavcodec/ppc/h264dsp.c
>> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>>                                register vec_u8 r0, register vec_u8 r1,
>>                                register vec_u8 r2, register vec_u8 r3) {
>>       DECLARE_ALIGNED(16, unsigned char, result)[64];
>> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
>> -    int int_dst_stride = dst_stride/4;
>> +    uint32_t *src_int = (uint32_t *)result;
>>
>>       vec_st(r0, 0, result);
>>       vec_st(r1, 16, result);
>>       vec_st(r2, 32, result);
>>       vec_st(r3, 48, result);
>>       /* FIXME: there has to be a better way!!!! */
>> -    *dst_int = *src_int;
>> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
>> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
>> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
>> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
>> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
>> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
>> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
>> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
>> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
>> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
>> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
>> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
>> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
>> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
>> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
>> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
>> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
>> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
>> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
>> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
>> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
>> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
>> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
>> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
>> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
>> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
>> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
>> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
>> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
>> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
>> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
>>   }
>>
>>   /** @brief performs a 6x16 transpose of data in src, and stores it to dst
>> --
>> 2.40.1
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> 
> First of all, thank you for looking into this.
> 
> Second, do we feel that this change covers the FIXME immediately above
> it that exclaims "there has to be a better way!!!!"?
> If so, we can remove the comment.

Doubt it. Even after Andreas' change it's essentially the same as before 
(load four bytes, write four bytes) but without the UB. The FIXME 
probably refers to finding a way to do this with vector intrinsics.

> 
> I did not perform a full FATE run as it is expensive on my QEMU setup,
> but I can confirm that this fixes the checkasm-h264dsp test under GCC
> UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le).
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [FFmpeg-devel] [PATCH v2] avcodec/ppc/h264dsp: Fix unaligned stores
  2024-03-14 19:13   ` Sean McGovern
  2024-03-14 19:23     ` James Almer
@ 2024-03-14 19:34     ` Andreas Rheinhardt
  1 sibling, 0 replies; 7+ messages in thread
From: Andreas Rheinhardt @ 2024-03-14 19:34 UTC (permalink / raw)
  To: ffmpeg-devel

Sean McGovern:
> Andreas:
> 
> On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt
> <andreas.rheinhardt@outlook.com> wrote:
>>
>> Also fix an effective-type violation.
>> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
>> ---
>>  libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>>  1 file changed, 17 insertions(+), 18 deletions(-)
>>
>> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
>> index c02733dda2..f50f2553a2 100644
>> --- a/libavcodec/ppc/h264dsp.c
>> +++ b/libavcodec/ppc/h264dsp.c
>> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>>                               register vec_u8 r0, register vec_u8 r1,
>>                               register vec_u8 r2, register vec_u8 r3) {
>>      DECLARE_ALIGNED(16, unsigned char, result)[64];
>> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
>> -    int int_dst_stride = dst_stride/4;
>> +    uint32_t *src_int = (uint32_t *)result;
>>
>>      vec_st(r0, 0, result);
>>      vec_st(r1, 16, result);
>>      vec_st(r2, 32, result);
>>      vec_st(r3, 48, result);
>>      /* FIXME: there has to be a better way!!!! */
>> -    *dst_int = *src_int;
>> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
>> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
>> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
>> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
>> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
>> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
>> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
>> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
>> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
>> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
>> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
>> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
>> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
>> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
>> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
>> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
>> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
>> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
>> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
>> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
>> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
>> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
>> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
>> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
>> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
>> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
>> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
>> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
>> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
>> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
>> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
>>  }
>>
>>  /** @brief performs a 6x16 transpose of data in src, and stores it to dst
>> --
>> 2.40.1
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> 
> First of all, thank you for looking into this.
> 
> Second, do we feel that this change covers the FIXME immediately above
> it that exclaims "there has to be a better way!!!!"?
> If so, we can remove the comment.

I don't think so. This code comes from a time when FFmpeg did not care
about the effective-type-rules or about alignment due to undefined
behaviour; it only cared about alignment when it led to crashes.
The old discussion confirms this:
https://ffmpeg.org/pipermail/ffmpeg-devel/2007-May/034609.html
https://ffmpeg.org/pipermail/ffmpeg-devel/2007-May/034612.html contains
the following:
"As I said, I submitted this patch in order to have PPC users get some
speed-up now rather than having a hypothetic optimal code when some of
us who work on Altivec sit down and work on it.

I do think it's better to have a committed faster code that leaves
room for improvement than a fastest code that never sees the light."

The fixme relates to this; it was probably considered advantageous to
avoid storing the vectors in stack buffers.

> 
> I did not perform a full FATE run as it is expensive on my QEMU setup,
> but I can confirm that this fixes the checkasm-h264dsp test under GCC
> UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le).
> 

Ok, will apply then. Thanks for testing.

- Andreas

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-03-14 19:34 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-12 22:15 [FFmpeg-devel] [PATCH] avcodec/ppc/h264dsp: Fix unaligned stores Andreas Rheinhardt
2024-03-13 11:30 ` [FFmpeg-devel] [PATCH v2] " Andreas Rheinhardt
2024-03-13 12:04   ` James Almer
2024-03-13 12:10     ` Andreas Rheinhardt
2024-03-14 19:13   ` Sean McGovern
2024-03-14 19:23     ` James Almer
2024-03-14 19:34     ` Andreas Rheinhardt

Git Inbox Mirror of the ffmpeg-devel mailing list - see https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://master.gitmailbox.com/ffmpegdev/0 ffmpegdev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 ffmpegdev ffmpegdev/ https://master.gitmailbox.com/ffmpegdev \
		ffmpegdev@gitmailbox.com
	public-inbox-index ffmpegdev

Example config snippet for mirrors.


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git